From a8c88e0a0c9e49ee4ece9a9a3f0d6852e7b0d0b4 Mon Sep 17 00:00:00 2001
From: Mxrcon <48180517+Mxrcon@users.noreply.github.com>
Date: Thu, 18 Feb 2021 15:49:18 -0300
Subject: [PATCH] Add DSL2 modules

---
.gitignore | 1 + main.nf | 116 +- modules/ariba/ariba_analysis/README.md | 16 + .../ariba/ariba_analysis/ariba_analysis.nf | 51 + .../ariba_analysis/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../ariba/ariba_analysis/bin/check-fastqs.py | 109 ++ .../ariba/ariba_analysis/bin/check-staging.py | 59 + .../ariba_analysis/bin/cleanup-coverage.py | 75 + .../ariba/ariba_analysis/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../ariba_analysis/bin/mask-consensus.py | 173 +++ .../ariba_analysis/bin/merge-blast-json.py | 49 + .../ariba/ariba_analysis/bin/mlst-blast.py | 185 +++ .../ariba_analysis/bin/select-references.py | 159 ++ .../ariba_analysis/bin/split-coverages.py | 69 + .../ariba/ariba_analysis/bin/update-conda.sh | 67 + .../ariba/ariba_analysis/bin/update-docker.sh | 70 + .../ariba/ariba_analysis/bin/update-tools.sh | 58 + .../ariba_analysis/bin/update-version.sh | 89 ++ modules/ariba/ariba_analysis/nextflow.config | 40 + .../templates/ariba_analysis.sh | 61 + modules/ariba/ariba_analysis/test_params.yaml | 68 + modules/blast/blast_genes/README.md | 17 + .../blast/blast_genes/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + modules/blast/blast_genes/bin/check-fastqs.py | 109 ++ .../blast/blast_genes/bin/check-staging.py | 59 + .../blast/blast_genes/bin/cleanup-coverage.py | 75 + modules/blast/blast_genes/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../blast_genes/bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../blast_genes/bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../blast_genes/bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../blast/blast_genes/bin/mask-consensus.py | 173 +++ .../blast/blast_genes/bin/merge-blast-json.py | 49 + modules/blast/blast_genes/bin/mlst-blast.py | 185 +++ .../blast_genes/bin/select-references.py | 159 ++ .../blast/blast_genes/bin/split-coverages.py | 69 + modules/blast/blast_genes/bin/update-conda.sh | 67 + .../blast/blast_genes/bin/update-docker.sh | 70 + modules/blast/blast_genes/bin/update-tools.sh | 58 + .../blast/blast_genes/bin/update-version.sh | 89 ++ modules/blast/blast_genes/blast_genes.nf | 50 + modules/blast/blast_genes/nextflow.config | 46 + .../blast_genes/templates/blast_genes.sh | 45 + modules/blast/blast_genes/test_params.yaml | 41 + modules/blast/blast_primers/README.md | 17 + .../blast_primers/bin/build-containers.sh | 95 ++
.../bin/check-assembly-accession.py | 79 + .../blast/blast_primers/bin/check-fastqs.py | 109 ++ .../blast/blast_primers/bin/check-staging.py | 59 + .../blast_primers/bin/cleanup-coverage.py | 75 + .../blast/blast_primers/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../blast/blast_primers/bin/mask-consensus.py | 173 +++ .../blast_primers/bin/merge-blast-json.py | 49 + modules/blast/blast_primers/bin/mlst-blast.py | 185 +++ .../blast_primers/bin/select-references.py | 159 ++ .../blast_primers/bin/split-coverages.py | 69 + .../blast/blast_primers/bin/update-conda.sh | 67 + .../blast/blast_primers/bin/update-docker.sh | 70 + .../blast/blast_primers/bin/update-tools.sh | 58 + .../blast/blast_primers/bin/update-version.sh | 89 ++ modules/blast/blast_primers/blast_primers.nf | 50 + modules/blast/blast_primers/nextflow.config | 47 + .../blast_primers/templates/blast_primers.sh | 46 + modules/blast/blast_primers/test_params.yaml | 42 + modules/blast/blast_proteins/README.md | 17 + .../blast_proteins/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../blast/blast_proteins/bin/check-fastqs.py | 109 ++ .../blast/blast_proteins/bin/check-staging.py | 59 + .../blast_proteins/bin/cleanup-coverage.py | 75 + .../blast/blast_proteins/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../blast_proteins/bin/mask-consensus.py | 173 +++ .../blast_proteins/bin/merge-blast-json.py | 49 + .../blast/blast_proteins/bin/mlst-blast.py | 185 +++ .../blast_proteins/bin/select-references.py | 159 ++ .../blast_proteins/bin/split-coverages.py | 69 + .../blast/blast_proteins/bin/update-conda.sh | 67 + .../blast/blast_proteins/bin/update-docker.sh | 70 + .../blast/blast_proteins/bin/update-tools.sh | 58 + .../blast_proteins/bin/update-version.sh | 89 ++ .../blast/blast_proteins/blast_proteins.nf | 51 + modules/blast/blast_proteins/nextflow.config | 46 + .../templates/blast_proteins.sh | 44 + modules/blast/blast_proteins/test_params.yaml | 41 + modules/blast/make_blastdb/README.md | 17 + .../make_blastdb/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../blast/make_blastdb/bin/check-fastqs.py | 109 ++ .../blast/make_blastdb/bin/check-staging.py | 59 + .../make_blastdb/bin/cleanup-coverage.py | 75 + modules/blast/make_blastdb/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ 
.../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../make_blastdb/bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../blast/make_blastdb/bin/mask-consensus.py | 173 +++ .../make_blastdb/bin/merge-blast-json.py | 49 + modules/blast/make_blastdb/bin/mlst-blast.py | 185 +++ .../make_blastdb/bin/select-references.py | 159 ++ .../blast/make_blastdb/bin/split-coverages.py | 69 + .../blast/make_blastdb/bin/update-conda.sh | 67 + .../blast/make_blastdb/bin/update-docker.sh | 70 + .../blast/make_blastdb/bin/update-tools.sh | 58 + .../blast/make_blastdb/bin/update-version.sh | 89 ++ modules/blast/make_blastdb/make_blastdb.nf | 43 + modules/blast/make_blastdb/nextflow.config | 46 + .../make_blastdb/templates/make_blastdb.sh | 32 + modules/blast/make_blastdb/test_params.yaml | 30 + modules/blast/plasmid_blast/README.md | 17 + .../plasmid_blast/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../blast/plasmid_blast/bin/check-fastqs.py | 109 ++ .../blast/plasmid_blast/bin/check-staging.py | 59 + .../plasmid_blast/bin/cleanup-coverage.py | 75 + .../blast/plasmid_blast/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../blast/plasmid_blast/bin/mask-consensus.py | 173 +++ .../plasmid_blast/bin/merge-blast-json.py | 49 + modules/blast/plasmid_blast/bin/mlst-blast.py | 185 +++ .../plasmid_blast/bin/select-references.py | 159 ++ .../plasmid_blast/bin/split-coverages.py | 69 + .../blast/plasmid_blast/bin/update-conda.sh | 67 + .../blast/plasmid_blast/bin/update-docker.sh | 70 + .../blast/plasmid_blast/bin/update-tools.sh | 58 + .../blast/plasmid_blast/bin/update-version.sh | 89 ++ modules/blast/plasmid_blast/nextflow.config | 47 + modules/blast/plasmid_blast/plasmid_blast.nf | 51 + .../plasmid_blast/templates/plasmid_blast.sh | 51 + modules/blast/plasmid_blast/test_params.yaml | 47 + modules/bwa/mapping_query/README.md | 17 + .../bwa/mapping_query/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + modules/bwa/mapping_query/bin/check-fastqs.py | 109 ++ .../bwa/mapping_query/bin/check-staging.py | 59 + .../bwa/mapping_query/bin/cleanup-coverage.py | 75 + modules/bwa/mapping_query/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 
+++ .../bin/helpers/bactopia-versions.py | 106 ++ .../bwa/mapping_query/bin/mask-consensus.py | 173 +++ .../bwa/mapping_query/bin/merge-blast-json.py | 49 + modules/bwa/mapping_query/bin/mlst-blast.py | 185 +++ .../mapping_query/bin/select-references.py | 159 ++ .../bwa/mapping_query/bin/split-coverages.py | 69 + modules/bwa/mapping_query/bin/update-conda.sh | 67 + .../bwa/mapping_query/bin/update-docker.sh | 70 + modules/bwa/mapping_query/bin/update-tools.sh | 58 + .../bwa/mapping_query/bin/update-version.sh | 89 ++ modules/bwa/mapping_query/mapping_query.nf | 53 + modules/bwa/mapping_query/nextflow.config | 48 + .../mapping_query/templates/mapping_query.sh | 65 + modules/bwa/mapping_query/test_params.yaml | 53 + .../mash/antimicrobial_resistance/README.md | 17 + .../antimicrobial_resistance.nf | 57 + .../bin/check-staging.py | 59 + .../antimicrobial_resistance/nextflow.config | 47 + .../templates/antimicrobial_resistance.sh | 61 + .../antimicrobial_resistance/test_params.yaml | 56 + modules/mash/estimate_genome_size/README.md | 17 + .../bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../estimate_genome_size/bin/check-fastqs.py | 109 ++ .../estimate_genome_size/bin/check-staging.py | 59 + .../bin/cleanup-coverage.py | 75 + .../estimate_genome_size/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../bin/mask-consensus.py | 173 +++ .../bin/merge-blast-json.py | 49 + .../estimate_genome_size/bin/mlst-blast.py | 185 +++ .../bin/select-references.py | 159 ++ .../bin/split-coverages.py | 69 + .../estimate_genome_size/bin/update-conda.sh | 67 + .../estimate_genome_size/bin/update-docker.sh | 70 + .../estimate_genome_size/bin/update-tools.sh | 58 + .../bin/update-version.sh | 89 ++ .../estimate_genome_size.nf | 50 + .../mash/estimate_genome_size/nextflow.config | 49 + .../templates/estimate_genome_size.sh | 115 ++ .../estimate_genome_size/test_params.yaml | 38 + .../test:estimate_genome_size.sh | 115 ++ modules/mccortex/count_31mers/README.md | 17 + .../count_31mers/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../mccortex/count_31mers/bin/check-fastqs.py | 109 ++ .../count_31mers/bin/check-staging.py | 59 + .../count_31mers/bin/cleanup-coverage.py | 75 + .../mccortex/count_31mers/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../count_31mers/bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../count_31mers/bin/mask-consensus.py | 173 +++ .../count_31mers/bin/merge-blast-json.py | 49 + 
.../mccortex/count_31mers/bin/mlst-blast.py | 185 +++ .../count_31mers/bin/select-references.py | 159 ++ .../count_31mers/bin/split-coverages.py | 69 + .../mccortex/count_31mers/bin/update-conda.sh | 67 + .../count_31mers/bin/update-docker.sh | 70 + .../mccortex/count_31mers/bin/update-tools.sh | 58 + .../count_31mers/bin/update-version.sh | 89 ++ modules/mccortex/count_31mers/count_31mers.nf | 41 + modules/mccortex/count_31mers/nextflow.config | 48 + .../count_31mers/templates/count_31mers.sh | 43 + .../mccortex/count_31mers/test_params.yaml | 35 + modules/minmer/minmer_query/README.md | 17 + .../minmer_query/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../minmer/minmer_query/bin/check-fastqs.py | 109 ++ .../minmer/minmer_query/bin/check-staging.py | 59 + .../minmer_query/bin/cleanup-coverage.py | 75 + .../minmer/minmer_query/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../minmer_query/bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../minmer/minmer_query/bin/mask-consensus.py | 173 +++ .../minmer_query/bin/merge-blast-json.py | 49 + modules/minmer/minmer_query/bin/mlst-blast.py | 185 +++ .../minmer_query/bin/select-references.py | 159 ++ .../minmer_query/bin/split-coverages.py | 69 + .../minmer/minmer_query/bin/update-conda.sh | 67 + .../minmer/minmer_query/bin/update-docker.sh | 70 + .../minmer/minmer_query/bin/update-tools.sh | 58 + .../minmer/minmer_query/bin/update-version.sh | 89 ++ modules/minmer/minmer_query/minmer_query.nf | 52 + modules/minmer/minmer_query/nextflow.config | 47 + .../minmer_query/templates/minmer_query.sh | 63 + modules/minmer/minmer_query/test_params.yaml | 50 + modules/minmer/minmer_sketch/README.md | 17 + .../minmer_sketch/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../minmer/minmer_sketch/bin/check-fastqs.py | 109 ++ .../minmer/minmer_sketch/bin/check-staging.py | 59 + .../minmer_sketch/bin/cleanup-coverage.py | 75 + .../minmer/minmer_sketch/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../minmer_sketch/bin/mask-consensus.py | 173 +++ .../minmer_sketch/bin/merge-blast-json.py | 49 + .../minmer/minmer_sketch/bin/mlst-blast.py | 185 +++ .../minmer_sketch/bin/select-references.py | 159 ++ .../minmer_sketch/bin/split-coverages.py | 69 + .../minmer/minmer_sketch/bin/update-conda.sh | 67 + .../minmer/minmer_sketch/bin/update-docker.sh | 70 + .../minmer/minmer_sketch/bin/update-tools.sh | 58 + .../minmer_sketch/bin/update-version.sh | 89 ++ 
modules/minmer/minmer_sketch/minmer_sketch.nf | 50 + modules/minmer/minmer_sketch/nextflow.config | 48 + .../minmer_sketch/templates/minmer_sketch.sh | 57 + modules/minmer/minmer_sketch/test_params.yaml | 32 + modules/prokka/annotate_genome/README.md | 17 + .../prokka/annotate_genome/annotate_genome.nf | 98 ++ .../annotate_genome/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../annotate_genome/bin/check-fastqs.py | 109 ++ .../annotate_genome/bin/check-staging.py | 59 + .../annotate_genome/bin/cleanup-coverage.py | 75 + .../prokka/annotate_genome/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../annotate_genome/bin/mask-consensus.py | 173 +++ .../annotate_genome/bin/merge-blast-json.py | 49 + .../prokka/annotate_genome/bin/mlst-blast.py | 185 +++ .../annotate_genome/bin/select-references.py | 159 ++ .../annotate_genome/bin/split-coverages.py | 69 + .../annotate_genome/bin/update-conda.sh | 67 + .../annotate_genome/bin/update-docker.sh | 70 + .../annotate_genome/bin/update-tools.sh | 58 + .../annotate_genome/bin/update-version.sh | 89 ++ .../prokka/annotate_genome/nextflow.config | 48 + .../templates/annotate_genome.sh | 72 + .../prokka/annotate_genome/test_params.yaml | 87 ++ modules/shovill/assemble_genome/README.md | 18 + .../assemble_genome/assemble_genome.nf | 70 + .../assemble_genome/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../assemble_genome/bin/check-fastqs.py | 109 ++ .../assemble_genome/bin/check-staging.py | 59 + .../assemble_genome/bin/cleanup-coverage.py | 75 + .../assemble_genome/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../assemble_genome/bin/mask-consensus.py | 173 +++ .../assemble_genome/bin/merge-blast-json.py | 49 + .../shovill/assemble_genome/bin/mlst-blast.py | 185 +++ .../assemble_genome/bin/select-references.py | 159 ++ .../assemble_genome/bin/split-coverages.py | 69 + .../assemble_genome/bin/update-conda.sh | 67 + .../assemble_genome/bin/update-docker.sh | 70 + .../assemble_genome/bin/update-tools.sh | 58 + .../assemble_genome/bin/update-version.sh | 89 ++ .../shovill/assemble_genome/nextflow.config | 49 + .../templates/assemble_genome.sh | 159 ++ .../shovill/assemble_genome/test_params.yaml | 95 ++ .../utilities/download_references/README.md | 18 + .../bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../download_references/bin/check-fastqs.py | 109 ++ .../download_references/bin/check-staging.py | 
59 + .../bin/cleanup-coverage.py | 75 + .../download_references/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../download_references/bin/mask-consensus.py | 173 +++ .../bin/merge-blast-json.py | 49 + .../download_references/bin/mlst-blast.py | 185 +++ .../bin/select-references.py | 159 ++ .../bin/split-coverages.py | 69 + .../download_references/bin/update-conda.sh | 67 + .../download_references/bin/update-docker.sh | 70 + .../download_references/bin/update-tools.sh | 58 + .../download_references/bin/update-version.sh | 89 ++ .../download_references.nf | 62 + .../download_references/nextflow.config | 49 + .../templates/download_references.sh | 84 ++ .../download_references/test_params.yaml | 47 + modules/utilities/fastq_status/README.md | 17 + .../fastq_status/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../fastq_status/bin/check-fastqs.py | 109 ++ .../fastq_status/bin/check-staging.py | 59 + .../fastq_status/bin/cleanup-coverage.py | 75 + .../utilities/fastq_status/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../fastq_status/bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../fastq_status/bin/mask-consensus.py | 173 +++ .../fastq_status/bin/merge-blast-json.py | 49 + .../utilities/fastq_status/bin/mlst-blast.py | 185 +++ .../fastq_status/bin/select-references.py | 159 ++ .../fastq_status/bin/split-coverages.py | 69 + .../fastq_status/bin/update-conda.sh | 67 + .../fastq_status/bin/update-docker.sh | 70 + .../fastq_status/bin/update-tools.sh | 58 + .../fastq_status/bin/update-version.sh | 89 ++ .../utilities/fastq_status/fastq_status.nf | 47 + .../utilities/fastq_status/nextflow.config | 49 + .../fastq_status/templates/fastq_status.sh | 80 + .../utilities/fastq_status/test_params.yaml | 62 + modules/utilities/gather_fastqs/README.md | 17 + .../gather_fastqs/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../gather_fastqs/bin/check-fastqs.py | 109 ++ .../gather_fastqs/bin/check-staging.py | 59 + .../gather_fastqs/bin/cleanup-coverage.py | 75 + .../gather_fastqs/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 
+++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../gather_fastqs/bin/mask-consensus.py | 173 +++ .../gather_fastqs/bin/merge-blast-json.py | 49 + .../utilities/gather_fastqs/bin/mlst-blast.py | 185 +++ .../gather_fastqs/bin/select-references.py | 159 ++ .../gather_fastqs/bin/split-coverages.py | 69 + .../gather_fastqs/bin/update-conda.sh | 67 + .../gather_fastqs/bin/update-docker.sh | 70 + .../gather_fastqs/bin/update-tools.sh | 58 + .../gather_fastqs/bin/update-version.sh | 89 ++ .../utilities/gather_fastqs/gather_fastqs.nf | 88 ++ .../utilities/gather_fastqs/nextflow.config | 48 + .../gather_fastqs/templates/gather_fastqs.sh | 174 +++ .../utilities/gather_fastqs/test_params.yaml | 54 + .../quality_control/assembly_qc/README.md | 17 + .../assembly_qc/assembly_qc.nf | 48 + .../assembly_qc/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../assembly_qc/bin/check-fastqs.py | 109 ++ .../assembly_qc/bin/check-staging.py | 59 + .../assembly_qc/bin/cleanup-coverage.py | 75 + .../assembly_qc/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../assembly_qc/bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../assembly_qc/bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../assembly_qc/bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../assembly_qc/bin/mask-consensus.py | 173 +++ .../assembly_qc/bin/merge-blast-json.py | 49 + .../assembly_qc/bin/mlst-blast.py | 185 +++ .../assembly_qc/bin/select-references.py | 159 ++ .../assembly_qc/bin/split-coverages.py | 69 + .../assembly_qc/bin/update-conda.sh | 67 + .../assembly_qc/bin/update-docker.sh | 70 + .../assembly_qc/bin/update-tools.sh | 58 + .../assembly_qc/bin/update-version.sh | 89 ++ .../assembly_qc/nextflow.config | 52 + .../assembly_qc/templates/assembly_qc.sh | 72 + .../assembly_qc/test_params.yaml | 83 ++ .../qc_final_summary/README.md | 17 + .../qc_final_summary/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../qc_final_summary/bin/check-fastqs.py | 109 ++ .../qc_final_summary/bin/check-staging.py | 59 + .../qc_final_summary/bin/cleanup-coverage.py | 75 + .../qc_final_summary/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../qc_final_summary/bin/mask-consensus.py | 173 +++ .../qc_final_summary/bin/merge-blast-json.py | 49 + .../qc_final_summary/bin/mlst-blast.py | 185 +++ .../qc_final_summary/bin/select-references.py | 159 ++ .../qc_final_summary/bin/split-coverages.py | 69 + .../qc_final_summary/bin/update-conda.sh | 67 + .../qc_final_summary/bin/update-docker.sh | 70 + 
.../qc_final_summary/bin/update-tools.sh | 58 + .../qc_final_summary/bin/update-version.sh | 89 ++ .../qc_final_summary/nextflow.config | 48 + .../qc_final_summary/qc_final_summary.nf | 44 + .../templates/qc_final_summary.sh | 51 + .../qc_final_summary/test_params.yaml | 113 ++ .../qc_original_summary/README.md | 17 + .../bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../qc_original_summary/bin/check-fastqs.py | 109 ++ .../qc_original_summary/bin/check-staging.py | 59 + .../bin/cleanup-coverage.py | 75 + .../qc_original_summary/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../qc_original_summary/bin/mask-consensus.py | 173 +++ .../bin/merge-blast-json.py | 49 + .../qc_original_summary/bin/mlst-blast.py | 185 +++ .../bin/select-references.py | 159 ++ .../bin/split-coverages.py | 69 + .../qc_original_summary/bin/update-conda.sh | 67 + .../qc_original_summary/bin/update-docker.sh | 70 + .../qc_original_summary/bin/update-tools.sh | 58 + .../qc_original_summary/bin/update-version.sh | 89 ++ .../qc_original_summary/nextflow.config | 47 + .../qc_original_summary.nf | 47 + .../templates/qc_original_summary.sh | 51 + .../qc_original_summary/test_params.yaml | 113 ++ .../quality_control/qc_reads/README.md | 14 + .../qc_reads/bin/build-containers.sh | 95 ++ .../qc_reads/bin/check-assembly-accession.py | 79 + .../qc_reads/bin/check-fastqs.py | 109 ++ .../qc_reads/bin/check-staging.py | 59 + .../qc_reads/bin/cleanup-coverage.py | 75 + .../qc_reads/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../qc_reads/bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../qc_reads/bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../qc_reads/bin/helpers/bactopia-prepare.py | 272 ++++ .../qc_reads/bin/helpers/bactopia-pull.py | 223 +++ .../qc_reads/bin/helpers/bactopia-search.py | 385 +++++ .../qc_reads/bin/helpers/bactopia-summary.py | 63 + .../qc_reads/bin/helpers/bactopia-tools.py | 202 +++ .../qc_reads/bin/helpers/bactopia-versions.py | 106 ++ .../qc_reads/bin/mask-consensus.py | 173 +++ .../qc_reads/bin/merge-blast-json.py | 49 + .../qc_reads/bin/mlst-blast.py | 185 +++ .../qc_reads/bin/select-references.py | 159 ++ .../qc_reads/bin/split-coverages.py | 69 + .../qc_reads/bin/update-conda.sh | 67 + .../qc_reads/bin/update-docker.sh | 70 + .../qc_reads/bin/update-tools.sh | 58 + .../qc_reads/bin/update-version.sh | 89 ++ .../quality_control/qc_reads/nextflow.config | 50 + .../quality_control/qc_reads/qc_reads.nf | 65 + .../qc_reads/templates/qc_reads.sh | 229 +++ .../quality_control/qc_reads/test_params.yaml | 119 ++ modules/utilities/sequence_type/README.md | 16 + .../sequence_type/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../sequence_type/bin/check-fastqs.py | 109 ++ .../sequence_type/bin/check-staging.py | 59 + 
.../sequence_type/bin/cleanup-coverage.py | 75 + .../sequence_type/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../sequence_type/bin/mask-consensus.py | 173 +++ .../sequence_type/bin/merge-blast-json.py | 49 + .../utilities/sequence_type/bin/mlst-blast.py | 185 +++ .../sequence_type/bin/select-references.py | 159 ++ .../sequence_type/bin/split-coverages.py | 69 + .../sequence_type/bin/update-conda.sh | 67 + .../sequence_type/bin/update-docker.sh | 70 + .../sequence_type/bin/update-tools.sh | 58 + .../sequence_type/bin/update-version.sh | 89 ++ .../utilities/sequence_type/nextflow.config | 48 + .../utilities/sequence_type/sequence_type.nf | 60 + .../sequence_type/templates/sequence_type.sh | 60 + .../utilities/sequence_type/test_params.yaml | 71 + .../variant_calling/call_variants/README.md | 17 + .../call_variants/bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../call_variants/bin/check-fastqs.py | 109 ++ .../call_variants/bin/check-staging.py | 59 + .../call_variants/bin/cleanup-coverage.py | 75 + .../call_variants/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ .../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../call_variants/bin/mask-consensus.py | 173 +++ .../call_variants/bin/merge-blast-json.py | 49 + .../call_variants/bin/mlst-blast.py | 185 +++ .../call_variants/bin/select-references.py | 159 ++ .../call_variants/bin/split-coverages.py | 69 + .../call_variants/bin/update-conda.sh | 67 + .../call_variants/bin/update-docker.sh | 70 + .../call_variants/bin/update-tools.sh | 58 + .../call_variants/bin/update-version.sh | 89 ++ .../call_variants/call_variants.nf | 56 + .../call_variants/nextflow.config | 49 + .../call_variants/templates/call_variants.sh | 76 + .../call_variants/test_params.yaml | 59 + .../call_variants_auto/README.md | 17 + .../bin/build-containers.sh | 95 ++ .../bin/check-assembly-accession.py | 79 + .../call_variants_auto/bin/check-fastqs.py | 109 ++ .../call_variants_auto/bin/check-staging.py | 59 + .../bin/cleanup-coverage.py | 75 + .../call_variants_auto/bin/create-tool.sh | 35 + .../bin/gh-actions/free-disk-space.sh | 50 + .../bin/gh-actions/setup-bactopia-env.sh | 66 + .../bin/gh-actions/setup-docker-builds.py | 249 ++++ .../bin/helpers/bactopia-build.py | 239 +++ .../bin/helpers/bactopia-citations.py | 69 + .../bin/helpers/bactopia-datasets.py | 1293 +++++++++++++++++ .../bin/helpers/bactopia-prepare.py | 272 ++++ .../bin/helpers/bactopia-pull.py | 223 +++ .../bin/helpers/bactopia-search.py | 385 +++++ 
.../bin/helpers/bactopia-summary.py | 63 + .../bin/helpers/bactopia-tools.py | 202 +++ .../bin/helpers/bactopia-versions.py | 106 ++ .../call_variants_auto/bin/mask-consensus.py | 173 +++ .../bin/merge-blast-json.py | 49 + .../call_variants_auto/bin/mlst-blast.py | 185 +++ .../bin/select-references.py | 159 ++ .../call_variants_auto/bin/split-coverages.py | 69 + .../call_variants_auto/bin/update-conda.sh | 67 + .../call_variants_auto/bin/update-docker.sh | 70 + .../call_variants_auto/bin/update-tools.sh | 58 + .../call_variants_auto/bin/update-version.sh | 89 ++ .../call_variants_auto/call_variants_auto.nf | 52 + .../call_variants_auto/nextflow.config | 49 + .../templates/call_variants_auto.sh | 77 + .../call_variants_auto/test_params.yaml | 56 + nextflow.config | 3 +- 746 files changed, 112005 insertions(+), 59 deletions(-) mode change 100755 => 100644 main.nf create mode 100644 modules/ariba/ariba_analysis/README.md create mode 100644 modules/ariba/ariba_analysis/ariba_analysis.nf create mode 100755 modules/ariba/ariba_analysis/bin/build-containers.sh create mode 100755 modules/ariba/ariba_analysis/bin/check-assembly-accession.py create mode 100755 modules/ariba/ariba_analysis/bin/check-fastqs.py create mode 100755 modules/ariba/ariba_analysis/bin/check-staging.py create mode 100755 modules/ariba/ariba_analysis/bin/cleanup-coverage.py create mode 100755 modules/ariba/ariba_analysis/bin/create-tool.sh create mode 100755 modules/ariba/ariba_analysis/bin/gh-actions/free-disk-space.sh create mode 100755 modules/ariba/ariba_analysis/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/ariba/ariba_analysis/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/ariba/ariba_analysis/bin/helpers/bactopia-build.py create mode 100755 modules/ariba/ariba_analysis/bin/helpers/bactopia-citations.py create mode 100755 modules/ariba/ariba_analysis/bin/helpers/bactopia-datasets.py create mode 100755 modules/ariba/ariba_analysis/bin/helpers/bactopia-prepare.py create mode 100755 modules/ariba/ariba_analysis/bin/helpers/bactopia-pull.py create mode 100755 modules/ariba/ariba_analysis/bin/helpers/bactopia-search.py create mode 100755 modules/ariba/ariba_analysis/bin/helpers/bactopia-summary.py create mode 100755 modules/ariba/ariba_analysis/bin/helpers/bactopia-tools.py create mode 100755 modules/ariba/ariba_analysis/bin/helpers/bactopia-versions.py create mode 100755 modules/ariba/ariba_analysis/bin/mask-consensus.py create mode 100755 modules/ariba/ariba_analysis/bin/merge-blast-json.py create mode 100755 modules/ariba/ariba_analysis/bin/mlst-blast.py create mode 100755 modules/ariba/ariba_analysis/bin/select-references.py create mode 100755 modules/ariba/ariba_analysis/bin/split-coverages.py create mode 100755 modules/ariba/ariba_analysis/bin/update-conda.sh create mode 100755 modules/ariba/ariba_analysis/bin/update-docker.sh create mode 100755 modules/ariba/ariba_analysis/bin/update-tools.sh create mode 100755 modules/ariba/ariba_analysis/bin/update-version.sh create mode 100644 modules/ariba/ariba_analysis/nextflow.config create mode 100644 modules/ariba/ariba_analysis/templates/ariba_analysis.sh create mode 100644 modules/ariba/ariba_analysis/test_params.yaml create mode 100644 modules/blast/blast_genes/README.md create mode 100755 modules/blast/blast_genes/bin/build-containers.sh create mode 100755 modules/blast/blast_genes/bin/check-assembly-accession.py create mode 100755 modules/blast/blast_genes/bin/check-fastqs.py create mode 100755 
modules/blast/blast_genes/bin/check-staging.py create mode 100755 modules/blast/blast_genes/bin/cleanup-coverage.py create mode 100755 modules/blast/blast_genes/bin/create-tool.sh create mode 100755 modules/blast/blast_genes/bin/gh-actions/free-disk-space.sh create mode 100755 modules/blast/blast_genes/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/blast/blast_genes/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/blast/blast_genes/bin/helpers/bactopia-build.py create mode 100755 modules/blast/blast_genes/bin/helpers/bactopia-citations.py create mode 100755 modules/blast/blast_genes/bin/helpers/bactopia-datasets.py create mode 100755 modules/blast/blast_genes/bin/helpers/bactopia-prepare.py create mode 100755 modules/blast/blast_genes/bin/helpers/bactopia-pull.py create mode 100755 modules/blast/blast_genes/bin/helpers/bactopia-search.py create mode 100755 modules/blast/blast_genes/bin/helpers/bactopia-summary.py create mode 100755 modules/blast/blast_genes/bin/helpers/bactopia-tools.py create mode 100755 modules/blast/blast_genes/bin/helpers/bactopia-versions.py create mode 100755 modules/blast/blast_genes/bin/mask-consensus.py create mode 100755 modules/blast/blast_genes/bin/merge-blast-json.py create mode 100755 modules/blast/blast_genes/bin/mlst-blast.py create mode 100755 modules/blast/blast_genes/bin/select-references.py create mode 100755 modules/blast/blast_genes/bin/split-coverages.py create mode 100755 modules/blast/blast_genes/bin/update-conda.sh create mode 100755 modules/blast/blast_genes/bin/update-docker.sh create mode 100755 modules/blast/blast_genes/bin/update-tools.sh create mode 100755 modules/blast/blast_genes/bin/update-version.sh create mode 100644 modules/blast/blast_genes/blast_genes.nf create mode 100644 modules/blast/blast_genes/nextflow.config create mode 100644 modules/blast/blast_genes/templates/blast_genes.sh create mode 100644 modules/blast/blast_genes/test_params.yaml create mode 100644 modules/blast/blast_primers/README.md create mode 100755 modules/blast/blast_primers/bin/build-containers.sh create mode 100755 modules/blast/blast_primers/bin/check-assembly-accession.py create mode 100755 modules/blast/blast_primers/bin/check-fastqs.py create mode 100755 modules/blast/blast_primers/bin/check-staging.py create mode 100755 modules/blast/blast_primers/bin/cleanup-coverage.py create mode 100755 modules/blast/blast_primers/bin/create-tool.sh create mode 100755 modules/blast/blast_primers/bin/gh-actions/free-disk-space.sh create mode 100755 modules/blast/blast_primers/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/blast/blast_primers/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/blast/blast_primers/bin/helpers/bactopia-build.py create mode 100755 modules/blast/blast_primers/bin/helpers/bactopia-citations.py create mode 100755 modules/blast/blast_primers/bin/helpers/bactopia-datasets.py create mode 100755 modules/blast/blast_primers/bin/helpers/bactopia-prepare.py create mode 100755 modules/blast/blast_primers/bin/helpers/bactopia-pull.py create mode 100755 modules/blast/blast_primers/bin/helpers/bactopia-search.py create mode 100755 modules/blast/blast_primers/bin/helpers/bactopia-summary.py create mode 100755 modules/blast/blast_primers/bin/helpers/bactopia-tools.py create mode 100755 modules/blast/blast_primers/bin/helpers/bactopia-versions.py create mode 100755 modules/blast/blast_primers/bin/mask-consensus.py create mode 100755 modules/blast/blast_primers/bin/merge-blast-json.py create mode 
100755 modules/blast/blast_primers/bin/mlst-blast.py create mode 100755 modules/blast/blast_primers/bin/select-references.py create mode 100755 modules/blast/blast_primers/bin/split-coverages.py create mode 100755 modules/blast/blast_primers/bin/update-conda.sh create mode 100755 modules/blast/blast_primers/bin/update-docker.sh create mode 100755 modules/blast/blast_primers/bin/update-tools.sh create mode 100755 modules/blast/blast_primers/bin/update-version.sh create mode 100644 modules/blast/blast_primers/blast_primers.nf create mode 100644 modules/blast/blast_primers/nextflow.config create mode 100644 modules/blast/blast_primers/templates/blast_primers.sh create mode 100644 modules/blast/blast_primers/test_params.yaml create mode 100644 modules/blast/blast_proteins/README.md create mode 100755 modules/blast/blast_proteins/bin/build-containers.sh create mode 100755 modules/blast/blast_proteins/bin/check-assembly-accession.py create mode 100755 modules/blast/blast_proteins/bin/check-fastqs.py create mode 100755 modules/blast/blast_proteins/bin/check-staging.py create mode 100755 modules/blast/blast_proteins/bin/cleanup-coverage.py create mode 100755 modules/blast/blast_proteins/bin/create-tool.sh create mode 100755 modules/blast/blast_proteins/bin/gh-actions/free-disk-space.sh create mode 100755 modules/blast/blast_proteins/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/blast/blast_proteins/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/blast/blast_proteins/bin/helpers/bactopia-build.py create mode 100755 modules/blast/blast_proteins/bin/helpers/bactopia-citations.py create mode 100755 modules/blast/blast_proteins/bin/helpers/bactopia-datasets.py create mode 100755 modules/blast/blast_proteins/bin/helpers/bactopia-prepare.py create mode 100755 modules/blast/blast_proteins/bin/helpers/bactopia-pull.py create mode 100755 modules/blast/blast_proteins/bin/helpers/bactopia-search.py create mode 100755 modules/blast/blast_proteins/bin/helpers/bactopia-summary.py create mode 100755 modules/blast/blast_proteins/bin/helpers/bactopia-tools.py create mode 100755 modules/blast/blast_proteins/bin/helpers/bactopia-versions.py create mode 100755 modules/blast/blast_proteins/bin/mask-consensus.py create mode 100755 modules/blast/blast_proteins/bin/merge-blast-json.py create mode 100755 modules/blast/blast_proteins/bin/mlst-blast.py create mode 100755 modules/blast/blast_proteins/bin/select-references.py create mode 100755 modules/blast/blast_proteins/bin/split-coverages.py create mode 100755 modules/blast/blast_proteins/bin/update-conda.sh create mode 100755 modules/blast/blast_proteins/bin/update-docker.sh create mode 100755 modules/blast/blast_proteins/bin/update-tools.sh create mode 100755 modules/blast/blast_proteins/bin/update-version.sh create mode 100644 modules/blast/blast_proteins/blast_proteins.nf create mode 100644 modules/blast/blast_proteins/nextflow.config create mode 100644 modules/blast/blast_proteins/templates/blast_proteins.sh create mode 100644 modules/blast/blast_proteins/test_params.yaml create mode 100644 modules/blast/make_blastdb/README.md create mode 100755 modules/blast/make_blastdb/bin/build-containers.sh create mode 100755 modules/blast/make_blastdb/bin/check-assembly-accession.py create mode 100755 modules/blast/make_blastdb/bin/check-fastqs.py create mode 100755 modules/blast/make_blastdb/bin/check-staging.py create mode 100755 modules/blast/make_blastdb/bin/cleanup-coverage.py create mode 100755 
modules/blast/make_blastdb/bin/create-tool.sh create mode 100755 modules/blast/make_blastdb/bin/gh-actions/free-disk-space.sh create mode 100755 modules/blast/make_blastdb/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/blast/make_blastdb/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/blast/make_blastdb/bin/helpers/bactopia-build.py create mode 100755 modules/blast/make_blastdb/bin/helpers/bactopia-citations.py create mode 100755 modules/blast/make_blastdb/bin/helpers/bactopia-datasets.py create mode 100755 modules/blast/make_blastdb/bin/helpers/bactopia-prepare.py create mode 100755 modules/blast/make_blastdb/bin/helpers/bactopia-pull.py create mode 100755 modules/blast/make_blastdb/bin/helpers/bactopia-search.py create mode 100755 modules/blast/make_blastdb/bin/helpers/bactopia-summary.py create mode 100755 modules/blast/make_blastdb/bin/helpers/bactopia-tools.py create mode 100755 modules/blast/make_blastdb/bin/helpers/bactopia-versions.py create mode 100755 modules/blast/make_blastdb/bin/mask-consensus.py create mode 100755 modules/blast/make_blastdb/bin/merge-blast-json.py create mode 100755 modules/blast/make_blastdb/bin/mlst-blast.py create mode 100755 modules/blast/make_blastdb/bin/select-references.py create mode 100755 modules/blast/make_blastdb/bin/split-coverages.py create mode 100755 modules/blast/make_blastdb/bin/update-conda.sh create mode 100755 modules/blast/make_blastdb/bin/update-docker.sh create mode 100755 modules/blast/make_blastdb/bin/update-tools.sh create mode 100755 modules/blast/make_blastdb/bin/update-version.sh create mode 100644 modules/blast/make_blastdb/make_blastdb.nf create mode 100644 modules/blast/make_blastdb/nextflow.config create mode 100644 modules/blast/make_blastdb/templates/make_blastdb.sh create mode 100644 modules/blast/make_blastdb/test_params.yaml create mode 100644 modules/blast/plasmid_blast/README.md create mode 100755 modules/blast/plasmid_blast/bin/build-containers.sh create mode 100755 modules/blast/plasmid_blast/bin/check-assembly-accession.py create mode 100755 modules/blast/plasmid_blast/bin/check-fastqs.py create mode 100755 modules/blast/plasmid_blast/bin/check-staging.py create mode 100755 modules/blast/plasmid_blast/bin/cleanup-coverage.py create mode 100755 modules/blast/plasmid_blast/bin/create-tool.sh create mode 100755 modules/blast/plasmid_blast/bin/gh-actions/free-disk-space.sh create mode 100755 modules/blast/plasmid_blast/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/blast/plasmid_blast/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/blast/plasmid_blast/bin/helpers/bactopia-build.py create mode 100755 modules/blast/plasmid_blast/bin/helpers/bactopia-citations.py create mode 100755 modules/blast/plasmid_blast/bin/helpers/bactopia-datasets.py create mode 100755 modules/blast/plasmid_blast/bin/helpers/bactopia-prepare.py create mode 100755 modules/blast/plasmid_blast/bin/helpers/bactopia-pull.py create mode 100755 modules/blast/plasmid_blast/bin/helpers/bactopia-search.py create mode 100755 modules/blast/plasmid_blast/bin/helpers/bactopia-summary.py create mode 100755 modules/blast/plasmid_blast/bin/helpers/bactopia-tools.py create mode 100755 modules/blast/plasmid_blast/bin/helpers/bactopia-versions.py create mode 100755 modules/blast/plasmid_blast/bin/mask-consensus.py create mode 100755 modules/blast/plasmid_blast/bin/merge-blast-json.py create mode 100755 modules/blast/plasmid_blast/bin/mlst-blast.py create mode 100755 
modules/blast/plasmid_blast/bin/select-references.py create mode 100755 modules/blast/plasmid_blast/bin/split-coverages.py create mode 100755 modules/blast/plasmid_blast/bin/update-conda.sh create mode 100755 modules/blast/plasmid_blast/bin/update-docker.sh create mode 100755 modules/blast/plasmid_blast/bin/update-tools.sh create mode 100755 modules/blast/plasmid_blast/bin/update-version.sh create mode 100644 modules/blast/plasmid_blast/nextflow.config create mode 100644 modules/blast/plasmid_blast/plasmid_blast.nf create mode 100644 modules/blast/plasmid_blast/templates/plasmid_blast.sh create mode 100644 modules/blast/plasmid_blast/test_params.yaml create mode 100644 modules/bwa/mapping_query/README.md create mode 100755 modules/bwa/mapping_query/bin/build-containers.sh create mode 100755 modules/bwa/mapping_query/bin/check-assembly-accession.py create mode 100755 modules/bwa/mapping_query/bin/check-fastqs.py create mode 100755 modules/bwa/mapping_query/bin/check-staging.py create mode 100755 modules/bwa/mapping_query/bin/cleanup-coverage.py create mode 100755 modules/bwa/mapping_query/bin/create-tool.sh create mode 100755 modules/bwa/mapping_query/bin/gh-actions/free-disk-space.sh create mode 100755 modules/bwa/mapping_query/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/bwa/mapping_query/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/bwa/mapping_query/bin/helpers/bactopia-build.py create mode 100755 modules/bwa/mapping_query/bin/helpers/bactopia-citations.py create mode 100755 modules/bwa/mapping_query/bin/helpers/bactopia-datasets.py create mode 100755 modules/bwa/mapping_query/bin/helpers/bactopia-prepare.py create mode 100755 modules/bwa/mapping_query/bin/helpers/bactopia-pull.py create mode 100755 modules/bwa/mapping_query/bin/helpers/bactopia-search.py create mode 100755 modules/bwa/mapping_query/bin/helpers/bactopia-summary.py create mode 100755 modules/bwa/mapping_query/bin/helpers/bactopia-tools.py create mode 100755 modules/bwa/mapping_query/bin/helpers/bactopia-versions.py create mode 100755 modules/bwa/mapping_query/bin/mask-consensus.py create mode 100755 modules/bwa/mapping_query/bin/merge-blast-json.py create mode 100755 modules/bwa/mapping_query/bin/mlst-blast.py create mode 100755 modules/bwa/mapping_query/bin/select-references.py create mode 100755 modules/bwa/mapping_query/bin/split-coverages.py create mode 100755 modules/bwa/mapping_query/bin/update-conda.sh create mode 100755 modules/bwa/mapping_query/bin/update-docker.sh create mode 100755 modules/bwa/mapping_query/bin/update-tools.sh create mode 100755 modules/bwa/mapping_query/bin/update-version.sh create mode 100644 modules/bwa/mapping_query/mapping_query.nf create mode 100644 modules/bwa/mapping_query/nextflow.config create mode 100644 modules/bwa/mapping_query/templates/mapping_query.sh create mode 100644 modules/bwa/mapping_query/test_params.yaml create mode 100644 modules/mash/antimicrobial_resistance/README.md create mode 100644 modules/mash/antimicrobial_resistance/antimicrobial_resistance.nf create mode 100755 modules/mash/antimicrobial_resistance/bin/check-staging.py create mode 100644 modules/mash/antimicrobial_resistance/nextflow.config create mode 100644 modules/mash/antimicrobial_resistance/templates/antimicrobial_resistance.sh create mode 100644 modules/mash/antimicrobial_resistance/test_params.yaml create mode 100644 modules/mash/estimate_genome_size/README.md create mode 100755 modules/mash/estimate_genome_size/bin/build-containers.sh create mode 100755 
modules/mash/estimate_genome_size/bin/check-assembly-accession.py create mode 100755 modules/mash/estimate_genome_size/bin/check-fastqs.py create mode 100755 modules/mash/estimate_genome_size/bin/check-staging.py create mode 100755 modules/mash/estimate_genome_size/bin/cleanup-coverage.py create mode 100755 modules/mash/estimate_genome_size/bin/create-tool.sh create mode 100755 modules/mash/estimate_genome_size/bin/gh-actions/free-disk-space.sh create mode 100755 modules/mash/estimate_genome_size/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/mash/estimate_genome_size/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/mash/estimate_genome_size/bin/helpers/bactopia-build.py create mode 100755 modules/mash/estimate_genome_size/bin/helpers/bactopia-citations.py create mode 100755 modules/mash/estimate_genome_size/bin/helpers/bactopia-datasets.py create mode 100755 modules/mash/estimate_genome_size/bin/helpers/bactopia-prepare.py create mode 100755 modules/mash/estimate_genome_size/bin/helpers/bactopia-pull.py create mode 100755 modules/mash/estimate_genome_size/bin/helpers/bactopia-search.py create mode 100755 modules/mash/estimate_genome_size/bin/helpers/bactopia-summary.py create mode 100755 modules/mash/estimate_genome_size/bin/helpers/bactopia-tools.py create mode 100755 modules/mash/estimate_genome_size/bin/helpers/bactopia-versions.py create mode 100755 modules/mash/estimate_genome_size/bin/mask-consensus.py create mode 100755 modules/mash/estimate_genome_size/bin/merge-blast-json.py create mode 100755 modules/mash/estimate_genome_size/bin/mlst-blast.py create mode 100755 modules/mash/estimate_genome_size/bin/select-references.py create mode 100755 modules/mash/estimate_genome_size/bin/split-coverages.py create mode 100755 modules/mash/estimate_genome_size/bin/update-conda.sh create mode 100755 modules/mash/estimate_genome_size/bin/update-docker.sh create mode 100755 modules/mash/estimate_genome_size/bin/update-tools.sh create mode 100755 modules/mash/estimate_genome_size/bin/update-version.sh create mode 100644 modules/mash/estimate_genome_size/estimate_genome_size.nf create mode 100644 modules/mash/estimate_genome_size/nextflow.config create mode 100644 modules/mash/estimate_genome_size/templates/estimate_genome_size.sh create mode 100644 modules/mash/estimate_genome_size/test_params.yaml create mode 100644 modules/mash/estimate_genome_size/work/d8/5c04f254356b7f34402bdeb7477f57/test:estimate_genome_size/test:estimate_genome_size.sh create mode 100644 modules/mccortex/count_31mers/README.md create mode 100755 modules/mccortex/count_31mers/bin/build-containers.sh create mode 100755 modules/mccortex/count_31mers/bin/check-assembly-accession.py create mode 100755 modules/mccortex/count_31mers/bin/check-fastqs.py create mode 100755 modules/mccortex/count_31mers/bin/check-staging.py create mode 100755 modules/mccortex/count_31mers/bin/cleanup-coverage.py create mode 100755 modules/mccortex/count_31mers/bin/create-tool.sh create mode 100755 modules/mccortex/count_31mers/bin/gh-actions/free-disk-space.sh create mode 100755 modules/mccortex/count_31mers/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/mccortex/count_31mers/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/mccortex/count_31mers/bin/helpers/bactopia-build.py create mode 100755 modules/mccortex/count_31mers/bin/helpers/bactopia-citations.py create mode 100755 modules/mccortex/count_31mers/bin/helpers/bactopia-datasets.py create mode 100755 
modules/mccortex/count_31mers/bin/helpers/bactopia-prepare.py create mode 100755 modules/mccortex/count_31mers/bin/helpers/bactopia-pull.py create mode 100755 modules/mccortex/count_31mers/bin/helpers/bactopia-search.py create mode 100755 modules/mccortex/count_31mers/bin/helpers/bactopia-summary.py create mode 100755 modules/mccortex/count_31mers/bin/helpers/bactopia-tools.py create mode 100755 modules/mccortex/count_31mers/bin/helpers/bactopia-versions.py create mode 100755 modules/mccortex/count_31mers/bin/mask-consensus.py create mode 100755 modules/mccortex/count_31mers/bin/merge-blast-json.py create mode 100755 modules/mccortex/count_31mers/bin/mlst-blast.py create mode 100755 modules/mccortex/count_31mers/bin/select-references.py create mode 100755 modules/mccortex/count_31mers/bin/split-coverages.py create mode 100755 modules/mccortex/count_31mers/bin/update-conda.sh create mode 100755 modules/mccortex/count_31mers/bin/update-docker.sh create mode 100755 modules/mccortex/count_31mers/bin/update-tools.sh create mode 100755 modules/mccortex/count_31mers/bin/update-version.sh create mode 100644 modules/mccortex/count_31mers/count_31mers.nf create mode 100644 modules/mccortex/count_31mers/nextflow.config create mode 100644 modules/mccortex/count_31mers/templates/count_31mers.sh create mode 100644 modules/mccortex/count_31mers/test_params.yaml create mode 100644 modules/minmer/minmer_query/README.md create mode 100755 modules/minmer/minmer_query/bin/build-containers.sh create mode 100755 modules/minmer/minmer_query/bin/check-assembly-accession.py create mode 100755 modules/minmer/minmer_query/bin/check-fastqs.py create mode 100755 modules/minmer/minmer_query/bin/check-staging.py create mode 100755 modules/minmer/minmer_query/bin/cleanup-coverage.py create mode 100755 modules/minmer/minmer_query/bin/create-tool.sh create mode 100755 modules/minmer/minmer_query/bin/gh-actions/free-disk-space.sh create mode 100755 modules/minmer/minmer_query/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/minmer/minmer_query/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/minmer/minmer_query/bin/helpers/bactopia-build.py create mode 100755 modules/minmer/minmer_query/bin/helpers/bactopia-citations.py create mode 100755 modules/minmer/minmer_query/bin/helpers/bactopia-datasets.py create mode 100755 modules/minmer/minmer_query/bin/helpers/bactopia-prepare.py create mode 100755 modules/minmer/minmer_query/bin/helpers/bactopia-pull.py create mode 100755 modules/minmer/minmer_query/bin/helpers/bactopia-search.py create mode 100755 modules/minmer/minmer_query/bin/helpers/bactopia-summary.py create mode 100755 modules/minmer/minmer_query/bin/helpers/bactopia-tools.py create mode 100755 modules/minmer/minmer_query/bin/helpers/bactopia-versions.py create mode 100755 modules/minmer/minmer_query/bin/mask-consensus.py create mode 100755 modules/minmer/minmer_query/bin/merge-blast-json.py create mode 100755 modules/minmer/minmer_query/bin/mlst-blast.py create mode 100755 modules/minmer/minmer_query/bin/select-references.py create mode 100755 modules/minmer/minmer_query/bin/split-coverages.py create mode 100755 modules/minmer/minmer_query/bin/update-conda.sh create mode 100755 modules/minmer/minmer_query/bin/update-docker.sh create mode 100755 modules/minmer/minmer_query/bin/update-tools.sh create mode 100755 modules/minmer/minmer_query/bin/update-version.sh create mode 100644 modules/minmer/minmer_query/minmer_query.nf create mode 100644 modules/minmer/minmer_query/nextflow.config 
create mode 100644 modules/minmer/minmer_query/templates/minmer_query.sh create mode 100644 modules/minmer/minmer_query/test_params.yaml create mode 100644 modules/minmer/minmer_sketch/README.md create mode 100755 modules/minmer/minmer_sketch/bin/build-containers.sh create mode 100755 modules/minmer/minmer_sketch/bin/check-assembly-accession.py create mode 100755 modules/minmer/minmer_sketch/bin/check-fastqs.py create mode 100755 modules/minmer/minmer_sketch/bin/check-staging.py create mode 100755 modules/minmer/minmer_sketch/bin/cleanup-coverage.py create mode 100755 modules/minmer/minmer_sketch/bin/create-tool.sh create mode 100755 modules/minmer/minmer_sketch/bin/gh-actions/free-disk-space.sh create mode 100755 modules/minmer/minmer_sketch/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/minmer/minmer_sketch/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/minmer/minmer_sketch/bin/helpers/bactopia-build.py create mode 100755 modules/minmer/minmer_sketch/bin/helpers/bactopia-citations.py create mode 100755 modules/minmer/minmer_sketch/bin/helpers/bactopia-datasets.py create mode 100755 modules/minmer/minmer_sketch/bin/helpers/bactopia-prepare.py create mode 100755 modules/minmer/minmer_sketch/bin/helpers/bactopia-pull.py create mode 100755 modules/minmer/minmer_sketch/bin/helpers/bactopia-search.py create mode 100755 modules/minmer/minmer_sketch/bin/helpers/bactopia-summary.py create mode 100755 modules/minmer/minmer_sketch/bin/helpers/bactopia-tools.py create mode 100755 modules/minmer/minmer_sketch/bin/helpers/bactopia-versions.py create mode 100755 modules/minmer/minmer_sketch/bin/mask-consensus.py create mode 100755 modules/minmer/minmer_sketch/bin/merge-blast-json.py create mode 100755 modules/minmer/minmer_sketch/bin/mlst-blast.py create mode 100755 modules/minmer/minmer_sketch/bin/select-references.py create mode 100755 modules/minmer/minmer_sketch/bin/split-coverages.py create mode 100755 modules/minmer/minmer_sketch/bin/update-conda.sh create mode 100755 modules/minmer/minmer_sketch/bin/update-docker.sh create mode 100755 modules/minmer/minmer_sketch/bin/update-tools.sh create mode 100755 modules/minmer/minmer_sketch/bin/update-version.sh create mode 100644 modules/minmer/minmer_sketch/minmer_sketch.nf create mode 100644 modules/minmer/minmer_sketch/nextflow.config create mode 100644 modules/minmer/minmer_sketch/templates/minmer_sketch.sh create mode 100644 modules/minmer/minmer_sketch/test_params.yaml create mode 100644 modules/prokka/annotate_genome/README.md create mode 100644 modules/prokka/annotate_genome/annotate_genome.nf create mode 100755 modules/prokka/annotate_genome/bin/build-containers.sh create mode 100755 modules/prokka/annotate_genome/bin/check-assembly-accession.py create mode 100755 modules/prokka/annotate_genome/bin/check-fastqs.py create mode 100755 modules/prokka/annotate_genome/bin/check-staging.py create mode 100755 modules/prokka/annotate_genome/bin/cleanup-coverage.py create mode 100755 modules/prokka/annotate_genome/bin/create-tool.sh create mode 100755 modules/prokka/annotate_genome/bin/gh-actions/free-disk-space.sh create mode 100755 modules/prokka/annotate_genome/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/prokka/annotate_genome/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/prokka/annotate_genome/bin/helpers/bactopia-build.py create mode 100755 modules/prokka/annotate_genome/bin/helpers/bactopia-citations.py create mode 100755 
modules/prokka/annotate_genome/bin/helpers/bactopia-datasets.py create mode 100755 modules/prokka/annotate_genome/bin/helpers/bactopia-prepare.py create mode 100755 modules/prokka/annotate_genome/bin/helpers/bactopia-pull.py create mode 100755 modules/prokka/annotate_genome/bin/helpers/bactopia-search.py create mode 100755 modules/prokka/annotate_genome/bin/helpers/bactopia-summary.py create mode 100755 modules/prokka/annotate_genome/bin/helpers/bactopia-tools.py create mode 100755 modules/prokka/annotate_genome/bin/helpers/bactopia-versions.py create mode 100755 modules/prokka/annotate_genome/bin/mask-consensus.py create mode 100755 modules/prokka/annotate_genome/bin/merge-blast-json.py create mode 100755 modules/prokka/annotate_genome/bin/mlst-blast.py create mode 100755 modules/prokka/annotate_genome/bin/select-references.py create mode 100755 modules/prokka/annotate_genome/bin/split-coverages.py create mode 100755 modules/prokka/annotate_genome/bin/update-conda.sh create mode 100755 modules/prokka/annotate_genome/bin/update-docker.sh create mode 100755 modules/prokka/annotate_genome/bin/update-tools.sh create mode 100755 modules/prokka/annotate_genome/bin/update-version.sh create mode 100644 modules/prokka/annotate_genome/nextflow.config create mode 100644 modules/prokka/annotate_genome/templates/annotate_genome.sh create mode 100644 modules/prokka/annotate_genome/test_params.yaml create mode 100644 modules/shovill/assemble_genome/README.md create mode 100644 modules/shovill/assemble_genome/assemble_genome.nf create mode 100755 modules/shovill/assemble_genome/bin/build-containers.sh create mode 100755 modules/shovill/assemble_genome/bin/check-assembly-accession.py create mode 100755 modules/shovill/assemble_genome/bin/check-fastqs.py create mode 100755 modules/shovill/assemble_genome/bin/check-staging.py create mode 100755 modules/shovill/assemble_genome/bin/cleanup-coverage.py create mode 100755 modules/shovill/assemble_genome/bin/create-tool.sh create mode 100755 modules/shovill/assemble_genome/bin/gh-actions/free-disk-space.sh create mode 100755 modules/shovill/assemble_genome/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/shovill/assemble_genome/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/shovill/assemble_genome/bin/helpers/bactopia-build.py create mode 100755 modules/shovill/assemble_genome/bin/helpers/bactopia-citations.py create mode 100755 modules/shovill/assemble_genome/bin/helpers/bactopia-datasets.py create mode 100755 modules/shovill/assemble_genome/bin/helpers/bactopia-prepare.py create mode 100755 modules/shovill/assemble_genome/bin/helpers/bactopia-pull.py create mode 100755 modules/shovill/assemble_genome/bin/helpers/bactopia-search.py create mode 100755 modules/shovill/assemble_genome/bin/helpers/bactopia-summary.py create mode 100755 modules/shovill/assemble_genome/bin/helpers/bactopia-tools.py create mode 100755 modules/shovill/assemble_genome/bin/helpers/bactopia-versions.py create mode 100755 modules/shovill/assemble_genome/bin/mask-consensus.py create mode 100755 modules/shovill/assemble_genome/bin/merge-blast-json.py create mode 100755 modules/shovill/assemble_genome/bin/mlst-blast.py create mode 100755 modules/shovill/assemble_genome/bin/select-references.py create mode 100755 modules/shovill/assemble_genome/bin/split-coverages.py create mode 100755 modules/shovill/assemble_genome/bin/update-conda.sh create mode 100755 modules/shovill/assemble_genome/bin/update-docker.sh create mode 100755 
modules/shovill/assemble_genome/bin/update-tools.sh create mode 100755 modules/shovill/assemble_genome/bin/update-version.sh create mode 100644 modules/shovill/assemble_genome/nextflow.config create mode 100755 modules/shovill/assemble_genome/templates/assemble_genome.sh create mode 100644 modules/shovill/assemble_genome/test_params.yaml create mode 100644 modules/utilities/download_references/README.md create mode 100755 modules/utilities/download_references/bin/build-containers.sh create mode 100755 modules/utilities/download_references/bin/check-assembly-accession.py create mode 100755 modules/utilities/download_references/bin/check-fastqs.py create mode 100755 modules/utilities/download_references/bin/check-staging.py create mode 100755 modules/utilities/download_references/bin/cleanup-coverage.py create mode 100755 modules/utilities/download_references/bin/create-tool.sh create mode 100755 modules/utilities/download_references/bin/gh-actions/free-disk-space.sh create mode 100755 modules/utilities/download_references/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/utilities/download_references/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/utilities/download_references/bin/helpers/bactopia-build.py create mode 100755 modules/utilities/download_references/bin/helpers/bactopia-citations.py create mode 100755 modules/utilities/download_references/bin/helpers/bactopia-datasets.py create mode 100755 modules/utilities/download_references/bin/helpers/bactopia-prepare.py create mode 100755 modules/utilities/download_references/bin/helpers/bactopia-pull.py create mode 100755 modules/utilities/download_references/bin/helpers/bactopia-search.py create mode 100755 modules/utilities/download_references/bin/helpers/bactopia-summary.py create mode 100755 modules/utilities/download_references/bin/helpers/bactopia-tools.py create mode 100755 modules/utilities/download_references/bin/helpers/bactopia-versions.py create mode 100755 modules/utilities/download_references/bin/mask-consensus.py create mode 100755 modules/utilities/download_references/bin/merge-blast-json.py create mode 100755 modules/utilities/download_references/bin/mlst-blast.py create mode 100755 modules/utilities/download_references/bin/select-references.py create mode 100755 modules/utilities/download_references/bin/split-coverages.py create mode 100755 modules/utilities/download_references/bin/update-conda.sh create mode 100755 modules/utilities/download_references/bin/update-docker.sh create mode 100755 modules/utilities/download_references/bin/update-tools.sh create mode 100755 modules/utilities/download_references/bin/update-version.sh create mode 100644 modules/utilities/download_references/download_references.nf create mode 100644 modules/utilities/download_references/nextflow.config create mode 100644 modules/utilities/download_references/templates/download_references.sh create mode 100644 modules/utilities/download_references/test_params.yaml create mode 100644 modules/utilities/fastq_status/README.md create mode 100755 modules/utilities/fastq_status/bin/build-containers.sh create mode 100755 modules/utilities/fastq_status/bin/check-assembly-accession.py create mode 100755 modules/utilities/fastq_status/bin/check-fastqs.py create mode 100755 modules/utilities/fastq_status/bin/check-staging.py create mode 100755 modules/utilities/fastq_status/bin/cleanup-coverage.py create mode 100755 modules/utilities/fastq_status/bin/create-tool.sh create mode 100755 
modules/utilities/fastq_status/bin/gh-actions/free-disk-space.sh create mode 100755 modules/utilities/fastq_status/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/utilities/fastq_status/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/utilities/fastq_status/bin/helpers/bactopia-build.py create mode 100755 modules/utilities/fastq_status/bin/helpers/bactopia-citations.py create mode 100755 modules/utilities/fastq_status/bin/helpers/bactopia-datasets.py create mode 100755 modules/utilities/fastq_status/bin/helpers/bactopia-prepare.py create mode 100755 modules/utilities/fastq_status/bin/helpers/bactopia-pull.py create mode 100755 modules/utilities/fastq_status/bin/helpers/bactopia-search.py create mode 100755 modules/utilities/fastq_status/bin/helpers/bactopia-summary.py create mode 100755 modules/utilities/fastq_status/bin/helpers/bactopia-tools.py create mode 100755 modules/utilities/fastq_status/bin/helpers/bactopia-versions.py create mode 100755 modules/utilities/fastq_status/bin/mask-consensus.py create mode 100755 modules/utilities/fastq_status/bin/merge-blast-json.py create mode 100755 modules/utilities/fastq_status/bin/mlst-blast.py create mode 100755 modules/utilities/fastq_status/bin/select-references.py create mode 100755 modules/utilities/fastq_status/bin/split-coverages.py create mode 100755 modules/utilities/fastq_status/bin/update-conda.sh create mode 100755 modules/utilities/fastq_status/bin/update-docker.sh create mode 100755 modules/utilities/fastq_status/bin/update-tools.sh create mode 100755 modules/utilities/fastq_status/bin/update-version.sh create mode 100644 modules/utilities/fastq_status/fastq_status.nf create mode 100644 modules/utilities/fastq_status/nextflow.config create mode 100644 modules/utilities/fastq_status/templates/fastq_status.sh create mode 100644 modules/utilities/fastq_status/test_params.yaml create mode 100644 modules/utilities/gather_fastqs/README.md create mode 100755 modules/utilities/gather_fastqs/bin/build-containers.sh create mode 100755 modules/utilities/gather_fastqs/bin/check-assembly-accession.py create mode 100755 modules/utilities/gather_fastqs/bin/check-fastqs.py create mode 100755 modules/utilities/gather_fastqs/bin/check-staging.py create mode 100755 modules/utilities/gather_fastqs/bin/cleanup-coverage.py create mode 100755 modules/utilities/gather_fastqs/bin/create-tool.sh create mode 100755 modules/utilities/gather_fastqs/bin/gh-actions/free-disk-space.sh create mode 100755 modules/utilities/gather_fastqs/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/utilities/gather_fastqs/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/utilities/gather_fastqs/bin/helpers/bactopia-build.py create mode 100755 modules/utilities/gather_fastqs/bin/helpers/bactopia-citations.py create mode 100755 modules/utilities/gather_fastqs/bin/helpers/bactopia-datasets.py create mode 100755 modules/utilities/gather_fastqs/bin/helpers/bactopia-prepare.py create mode 100755 modules/utilities/gather_fastqs/bin/helpers/bactopia-pull.py create mode 100755 modules/utilities/gather_fastqs/bin/helpers/bactopia-search.py create mode 100755 modules/utilities/gather_fastqs/bin/helpers/bactopia-summary.py create mode 100755 modules/utilities/gather_fastqs/bin/helpers/bactopia-tools.py create mode 100755 modules/utilities/gather_fastqs/bin/helpers/bactopia-versions.py create mode 100755 modules/utilities/gather_fastqs/bin/mask-consensus.py create mode 100755 modules/utilities/gather_fastqs/bin/merge-blast-json.py 
create mode 100755 modules/utilities/gather_fastqs/bin/mlst-blast.py create mode 100755 modules/utilities/gather_fastqs/bin/select-references.py create mode 100755 modules/utilities/gather_fastqs/bin/split-coverages.py create mode 100755 modules/utilities/gather_fastqs/bin/update-conda.sh create mode 100755 modules/utilities/gather_fastqs/bin/update-docker.sh create mode 100755 modules/utilities/gather_fastqs/bin/update-tools.sh create mode 100755 modules/utilities/gather_fastqs/bin/update-version.sh create mode 100644 modules/utilities/gather_fastqs/gather_fastqs.nf create mode 100644 modules/utilities/gather_fastqs/nextflow.config create mode 100644 modules/utilities/gather_fastqs/templates/gather_fastqs.sh create mode 100644 modules/utilities/gather_fastqs/test_params.yaml create mode 100644 modules/utilities/quality_control/assembly_qc/README.md create mode 100644 modules/utilities/quality_control/assembly_qc/assembly_qc.nf create mode 100755 modules/utilities/quality_control/assembly_qc/bin/build-containers.sh create mode 100755 modules/utilities/quality_control/assembly_qc/bin/check-assembly-accession.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/check-fastqs.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/check-staging.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/cleanup-coverage.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/create-tool.sh create mode 100755 modules/utilities/quality_control/assembly_qc/bin/gh-actions/free-disk-space.sh create mode 100755 modules/utilities/quality_control/assembly_qc/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/utilities/quality_control/assembly_qc/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-build.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-citations.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-datasets.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-prepare.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-pull.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-search.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-summary.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-tools.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-versions.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/mask-consensus.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/merge-blast-json.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/mlst-blast.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/select-references.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/split-coverages.py create mode 100755 modules/utilities/quality_control/assembly_qc/bin/update-conda.sh create mode 100755 modules/utilities/quality_control/assembly_qc/bin/update-docker.sh create mode 100755 modules/utilities/quality_control/assembly_qc/bin/update-tools.sh create mode 100755 modules/utilities/quality_control/assembly_qc/bin/update-version.sh create mode 100644 modules/utilities/quality_control/assembly_qc/nextflow.config create mode 100644 
modules/utilities/quality_control/assembly_qc/templates/assembly_qc.sh create mode 100644 modules/utilities/quality_control/assembly_qc/test_params.yaml create mode 100644 modules/utilities/quality_control/qc_final_summary/README.md create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/build-containers.sh create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/check-assembly-accession.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/check-fastqs.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/check-staging.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/cleanup-coverage.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/create-tool.sh create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/gh-actions/free-disk-space.sh create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-build.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-citations.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-datasets.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-prepare.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-pull.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-search.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-summary.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-tools.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-versions.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/mask-consensus.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/merge-blast-json.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/mlst-blast.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/select-references.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/split-coverages.py create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/update-conda.sh create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/update-docker.sh create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/update-tools.sh create mode 100755 modules/utilities/quality_control/qc_final_summary/bin/update-version.sh create mode 100644 modules/utilities/quality_control/qc_final_summary/nextflow.config create mode 100644 modules/utilities/quality_control/qc_final_summary/qc_final_summary.nf create mode 100644 modules/utilities/quality_control/qc_final_summary/templates/qc_final_summary.sh create mode 100644 modules/utilities/quality_control/qc_final_summary/test_params.yaml create mode 100644 modules/utilities/quality_control/qc_original_summary/README.md create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/build-containers.sh create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/check-assembly-accession.py create mode 100755 
modules/utilities/quality_control/qc_original_summary/bin/check-fastqs.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/check-staging.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/cleanup-coverage.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/create-tool.sh create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/gh-actions/free-disk-space.sh create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-build.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-citations.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-datasets.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-prepare.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-pull.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-search.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-summary.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-tools.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-versions.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/mask-consensus.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/merge-blast-json.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/mlst-blast.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/select-references.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/split-coverages.py create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/update-conda.sh create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/update-docker.sh create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/update-tools.sh create mode 100755 modules/utilities/quality_control/qc_original_summary/bin/update-version.sh create mode 100644 modules/utilities/quality_control/qc_original_summary/nextflow.config create mode 100644 modules/utilities/quality_control/qc_original_summary/qc_original_summary.nf create mode 100644 modules/utilities/quality_control/qc_original_summary/templates/qc_original_summary.sh create mode 100644 modules/utilities/quality_control/qc_original_summary/test_params.yaml create mode 100644 modules/utilities/quality_control/qc_reads/README.md create mode 100755 modules/utilities/quality_control/qc_reads/bin/build-containers.sh create mode 100755 modules/utilities/quality_control/qc_reads/bin/check-assembly-accession.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/check-fastqs.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/check-staging.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/cleanup-coverage.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/create-tool.sh create mode 100755 modules/utilities/quality_control/qc_reads/bin/gh-actions/free-disk-space.sh create mode 
100755 modules/utilities/quality_control/qc_reads/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/utilities/quality_control/qc_reads/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-build.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-citations.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-datasets.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-prepare.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-pull.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-search.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-summary.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-tools.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-versions.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/mask-consensus.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/merge-blast-json.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/mlst-blast.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/select-references.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/split-coverages.py create mode 100755 modules/utilities/quality_control/qc_reads/bin/update-conda.sh create mode 100755 modules/utilities/quality_control/qc_reads/bin/update-docker.sh create mode 100755 modules/utilities/quality_control/qc_reads/bin/update-tools.sh create mode 100755 modules/utilities/quality_control/qc_reads/bin/update-version.sh create mode 100644 modules/utilities/quality_control/qc_reads/nextflow.config create mode 100644 modules/utilities/quality_control/qc_reads/qc_reads.nf create mode 100755 modules/utilities/quality_control/qc_reads/templates/qc_reads.sh create mode 100644 modules/utilities/quality_control/qc_reads/test_params.yaml create mode 100644 modules/utilities/sequence_type/README.md create mode 100755 modules/utilities/sequence_type/bin/build-containers.sh create mode 100755 modules/utilities/sequence_type/bin/check-assembly-accession.py create mode 100755 modules/utilities/sequence_type/bin/check-fastqs.py create mode 100755 modules/utilities/sequence_type/bin/check-staging.py create mode 100755 modules/utilities/sequence_type/bin/cleanup-coverage.py create mode 100755 modules/utilities/sequence_type/bin/create-tool.sh create mode 100755 modules/utilities/sequence_type/bin/gh-actions/free-disk-space.sh create mode 100755 modules/utilities/sequence_type/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/utilities/sequence_type/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/utilities/sequence_type/bin/helpers/bactopia-build.py create mode 100755 modules/utilities/sequence_type/bin/helpers/bactopia-citations.py create mode 100755 modules/utilities/sequence_type/bin/helpers/bactopia-datasets.py create mode 100755 modules/utilities/sequence_type/bin/helpers/bactopia-prepare.py create mode 100755 modules/utilities/sequence_type/bin/helpers/bactopia-pull.py create mode 100755 modules/utilities/sequence_type/bin/helpers/bactopia-search.py create mode 100755 modules/utilities/sequence_type/bin/helpers/bactopia-summary.py create mode 100755 modules/utilities/sequence_type/bin/helpers/bactopia-tools.py create mode 100755 
modules/utilities/sequence_type/bin/helpers/bactopia-versions.py create mode 100755 modules/utilities/sequence_type/bin/mask-consensus.py create mode 100755 modules/utilities/sequence_type/bin/merge-blast-json.py create mode 100755 modules/utilities/sequence_type/bin/mlst-blast.py create mode 100755 modules/utilities/sequence_type/bin/select-references.py create mode 100755 modules/utilities/sequence_type/bin/split-coverages.py create mode 100755 modules/utilities/sequence_type/bin/update-conda.sh create mode 100755 modules/utilities/sequence_type/bin/update-docker.sh create mode 100755 modules/utilities/sequence_type/bin/update-tools.sh create mode 100755 modules/utilities/sequence_type/bin/update-version.sh create mode 100644 modules/utilities/sequence_type/nextflow.config create mode 100644 modules/utilities/sequence_type/sequence_type.nf create mode 100644 modules/utilities/sequence_type/templates/sequence_type.sh create mode 100644 modules/utilities/sequence_type/test_params.yaml create mode 100644 modules/variant_calling/call_variants/README.md create mode 100755 modules/variant_calling/call_variants/bin/build-containers.sh create mode 100755 modules/variant_calling/call_variants/bin/check-assembly-accession.py create mode 100755 modules/variant_calling/call_variants/bin/check-fastqs.py create mode 100755 modules/variant_calling/call_variants/bin/check-staging.py create mode 100755 modules/variant_calling/call_variants/bin/cleanup-coverage.py create mode 100755 modules/variant_calling/call_variants/bin/create-tool.sh create mode 100755 modules/variant_calling/call_variants/bin/gh-actions/free-disk-space.sh create mode 100755 modules/variant_calling/call_variants/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/variant_calling/call_variants/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/variant_calling/call_variants/bin/helpers/bactopia-build.py create mode 100755 modules/variant_calling/call_variants/bin/helpers/bactopia-citations.py create mode 100755 modules/variant_calling/call_variants/bin/helpers/bactopia-datasets.py create mode 100755 modules/variant_calling/call_variants/bin/helpers/bactopia-prepare.py create mode 100755 modules/variant_calling/call_variants/bin/helpers/bactopia-pull.py create mode 100755 modules/variant_calling/call_variants/bin/helpers/bactopia-search.py create mode 100755 modules/variant_calling/call_variants/bin/helpers/bactopia-summary.py create mode 100755 modules/variant_calling/call_variants/bin/helpers/bactopia-tools.py create mode 100755 modules/variant_calling/call_variants/bin/helpers/bactopia-versions.py create mode 100755 modules/variant_calling/call_variants/bin/mask-consensus.py create mode 100755 modules/variant_calling/call_variants/bin/merge-blast-json.py create mode 100755 modules/variant_calling/call_variants/bin/mlst-blast.py create mode 100755 modules/variant_calling/call_variants/bin/select-references.py create mode 100755 modules/variant_calling/call_variants/bin/split-coverages.py create mode 100755 modules/variant_calling/call_variants/bin/update-conda.sh create mode 100755 modules/variant_calling/call_variants/bin/update-docker.sh create mode 100755 modules/variant_calling/call_variants/bin/update-tools.sh create mode 100755 modules/variant_calling/call_variants/bin/update-version.sh create mode 100644 modules/variant_calling/call_variants/call_variants.nf create mode 100644 modules/variant_calling/call_variants/nextflow.config create mode 100644 
modules/variant_calling/call_variants/templates/call_variants.sh create mode 100644 modules/variant_calling/call_variants/test_params.yaml create mode 100644 modules/variant_calling/call_variants_auto/README.md create mode 100755 modules/variant_calling/call_variants_auto/bin/build-containers.sh create mode 100755 modules/variant_calling/call_variants_auto/bin/check-assembly-accession.py create mode 100755 modules/variant_calling/call_variants_auto/bin/check-fastqs.py create mode 100755 modules/variant_calling/call_variants_auto/bin/check-staging.py create mode 100755 modules/variant_calling/call_variants_auto/bin/cleanup-coverage.py create mode 100755 modules/variant_calling/call_variants_auto/bin/create-tool.sh create mode 100755 modules/variant_calling/call_variants_auto/bin/gh-actions/free-disk-space.sh create mode 100755 modules/variant_calling/call_variants_auto/bin/gh-actions/setup-bactopia-env.sh create mode 100755 modules/variant_calling/call_variants_auto/bin/gh-actions/setup-docker-builds.py create mode 100755 modules/variant_calling/call_variants_auto/bin/helpers/bactopia-build.py create mode 100755 modules/variant_calling/call_variants_auto/bin/helpers/bactopia-citations.py create mode 100755 modules/variant_calling/call_variants_auto/bin/helpers/bactopia-datasets.py create mode 100755 modules/variant_calling/call_variants_auto/bin/helpers/bactopia-prepare.py create mode 100755 modules/variant_calling/call_variants_auto/bin/helpers/bactopia-pull.py create mode 100755 modules/variant_calling/call_variants_auto/bin/helpers/bactopia-search.py create mode 100755 modules/variant_calling/call_variants_auto/bin/helpers/bactopia-summary.py create mode 100755 modules/variant_calling/call_variants_auto/bin/helpers/bactopia-tools.py create mode 100755 modules/variant_calling/call_variants_auto/bin/helpers/bactopia-versions.py create mode 100755 modules/variant_calling/call_variants_auto/bin/mask-consensus.py create mode 100755 modules/variant_calling/call_variants_auto/bin/merge-blast-json.py create mode 100755 modules/variant_calling/call_variants_auto/bin/mlst-blast.py create mode 100755 modules/variant_calling/call_variants_auto/bin/select-references.py create mode 100755 modules/variant_calling/call_variants_auto/bin/split-coverages.py create mode 100755 modules/variant_calling/call_variants_auto/bin/update-conda.sh create mode 100755 modules/variant_calling/call_variants_auto/bin/update-docker.sh create mode 100755 modules/variant_calling/call_variants_auto/bin/update-tools.sh create mode 100755 modules/variant_calling/call_variants_auto/bin/update-version.sh create mode 100644 modules/variant_calling/call_variants_auto/call_variants_auto.nf create mode 100644 modules/variant_calling/call_variants_auto/nextflow.config create mode 100644 modules/variant_calling/call_variants_auto/templates/call_variants_auto.sh create mode 100644 modules/variant_calling/call_variants_auto/test_params.yaml diff --git a/.gitignore b/.gitignore index d1d3818a7..f37b9062c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /datasets/ +test_data /conda/envs /conf/aws.config .nextflow* diff --git a/main.nf b/main.nf old mode 100755 new mode 100644 index beb688481..8f907d218 --- a/main.nf +++ b/main.nf @@ -52,7 +52,7 @@ REFSEQ_SKETCH = [] REFSEQ_SKETCH_FOUND = false SPECIES = format_species(params.species) SPECIES_GENOME_SIZE = null -print_efficiency() +print_efficiency() setup_datasets() @@ -85,7 +85,7 @@ process gather_fastqs { if (task.attempt >= 4) { if (use_ena) { // Try SRA - use_ena = false + 
use_ena = false } else { // Try ENA use_ena = true @@ -120,7 +120,7 @@ process fastq_status { output: file "*-error.txt" optional true - set val(sample), val(sample_type), val(single_end), + set val(sample), val(sample_type), val(single_end), file("fastqs/${sample}*.fastq.gz"), file(extra) optional true into ESTIMATE_GENOME_SIZE file "${task.process}/*" optional true @@ -143,7 +143,7 @@ process estimate_genome_size { output: file "${sample}-genome-size-error.txt" optional true file("${sample}-genome-size.txt") optional true - set val(sample), val(sample_type), val(single_end), + set val(sample), val(sample_type), val(single_end), file("fastqs/${sample}*.fastq.gz"), file(extra), file("${sample}-genome-size.txt") optional true into QC_READS, QC_ORIGINAL_SUMMARY file "${task.process}/*" optional true @@ -876,7 +876,7 @@ def get_max_cpus(requested) { log.warn "Maximum CPUs (${requested}) was adjusted to fit your system (${available})" return available } - + return requested } @@ -946,8 +946,8 @@ def setup_datasets() { species_db = available_datasets['species-specific'][SPECIES] if (species_db.containsKey('genome_size')) { genome_size = species_db['genome_size'] - } - + } + if (params.genome_size) { if (['min', 'median', 'mean', 'max'].contains(params.genome_size)) { SPECIES_GENOME_SIZE = genome_size[params.genome_size] @@ -1019,7 +1019,7 @@ def setup_datasets() { } print_dataset_info(REFERENCES, "reference genomes") } - + if (species_db['optional'].containsKey('mapping-sequences')) { file("${dataset_path}/${species_db['optional']['mapping-sequences']}").list().each() { if (dataset_exists("${dataset_path}/${species_db['optional']['mapping-sequences']}/${it}")) { @@ -1211,10 +1211,10 @@ def check_input_params() { ### For Downloading from SRA/ENA or NCBI Assembly **Note: Assemblies will have error free Illumina reads simulated for processing.** - --accessions An input file containing ENA/SRA Experiment accessions or + --accessions An input file containing ENA/SRA Experiment accessions or NCBI Assembly accessions to be processed - --accession A single ENA/SRA Experiment accession or NCBI Assembly accession + --accession A single ENA/SRA Experiment accession or NCBI Assembly accession to be processed ### For Processing an Assembly @@ -1238,7 +1238,7 @@ def check_input_params() { if (params.max_downloads >= 10) { log.warn "Please be aware the value you have set for --max_downloads (${params.max_downloads}) may cause NCBI " + - "to temporarily block your IP address due to too many queries at once." + "to temporarily block your IP address due to too many queries at once." } if (params.genome_size) { @@ -1299,7 +1299,7 @@ def check_input_params() { def handle_multiple_fqs(read_set) { def fqs = [] def String[] reads = read_set.split(","); - reads.each { fq -> + reads.each { fq -> fqs << file(fq) } return fqs @@ -1429,7 +1429,7 @@ def check_input_fastqs(run_type) { } count = count + 1 } - if (count > 1) { + if (count > 1) { USING_MERGE = true } } @@ -1519,7 +1519,7 @@ def print_efficiency() { tasks = total_cpus / MAX_CPUS log.info "" log.info """ - Each task will use ${MAX_CPUS} CPUs out of the available ${total_cpus} CPUs. At most ${tasks} task(s) will be run at + Each task will use ${MAX_CPUS} CPUs out of the available ${total_cpus} CPUs. At most ${tasks} task(s) will be run at a time, this can affect the efficiency of Bactopia. 
""".stripIndent() log.info "" @@ -1571,10 +1571,10 @@ def basic_help() { ### For Downloading from SRA/ENA or NCBI Assembly **Note: Assemblies will have error free Illumina reads simulated for processing.** - --accessions An input file containing ENA/SRA Experiment accessions or + --accessions An input file containing ENA/SRA Experiment accessions or NCBI Assembly accessions to be processed - --accession A single ENA/SRA Experiment accession or NCBI Assembly accession + --accession A single ENA/SRA Experiment accession or NCBI Assembly accession to be processed ### For Processing an Assembly @@ -1608,12 +1608,12 @@ def basic_help() { Default: ${params.outdir} Nextflow Queue Parameters: - At execution, Nextflow creates a queue and the number of slots in the queue is determined by the total number - of cores on the system. When a task is submitted to the queue, the total number of slots it occupies is - determined by the value set by "--cpus". + At execution, Nextflow creates a queue and the number of slots in the queue is determined by the total number + of cores on the system. When a task is submitted to the queue, the total number of slots it occupies is + determined by the value set by "--cpus". - This can have a significant effect on the efficiency of the Nextflow's queue system. If "--cpus" is set to a - value that is equal to the number of cores availabe, in most cases only a single task will be able to run + This can have a significant effect on the efficiency of the Nextflow's queue system. If "--cpus" is set to a + value that is equal to the number of cores availabe, in most cases only a single task will be able to run because its occupying all available slots. When in doubt, "--cpus 4" is a safe bet, it is also the default value if you don't use "--cpus". @@ -1630,10 +1630,10 @@ def basic_help() { --max_memory INT The maximum amount of memory (Gb) allowed to a single task. Default: ${params.max_memory} Gb - --cpus INT Number of processors made available to a single task. + --cpus INT Number of processors made available to a single task. Default: ${params.cpus} - -qs INT Nextflow queue size. This parameter is very useful to limit the total number of + -qs INT Nextflow queue size. This parameter is very useful to limit the total number of processors used on desktops, laptops or shared resources. Default: Nextflow defaults to the total number of processors on your system. @@ -1660,9 +1660,9 @@ def basic_help() { --disable_scratch All intermediate files created on worker nodes of will be transferred to the head node. Default: Only result files are transferred back - --nfconfig STR A Nextflow compatible config file for custom profiles. This allows + --nfconfig STR A Nextflow compatible config file for custom profiles. This allows you to create profiles specific to your environment (e.g. SGE, - AWS, SLURM, etc...). This config file is loaded last and will + AWS, SLURM, etc...). This config file is loaded last and will overwrite existing variables if set. Default: Bactopia's default configs @@ -1678,16 +1678,16 @@ def basic_help() { --publish_mode Set Nextflow's method for publishing output files. Allowed methods are: 'copy' (default) Copies the output files into the published directory. - 'copyNoFollow' Copies the output files into the published directory + 'copyNoFollow' Copies the output files into the published directory without following symlinks ie. copies the links themselves. 
- 'link' Creates a hard link in the published directory for each + 'link' Creates a hard link in the published directory for each process output file. 'rellink' Creates a relative symbolic link in the published directory for each process output file. - 'symlink' Creates an absolute symbolic link in the published directory + 'symlink' Creates an absolute symbolic link in the published directory for each process output file. Default: ${params.publish_mode} @@ -1695,7 +1695,7 @@ def basic_help() { --force Nextflow will overwrite existing output files. Default: ${params.force} - -resume Nextflow will attempt to resume a previous run. Please notice it is + -resume Nextflow will attempt to resume a previous run. Please notice it is only a single '-' --cleanup_workdir After Bactopia is successfully executed, the work directory will be deleted. @@ -1767,7 +1767,7 @@ def full_help() { Default: ${params.aws_max_retry} --aws_ecr_registry STR The ECR registry containing Bactopia related containers. - Default: Use the registry given by --registry + Default: Use the registry given by --registry ENA Download Parameters: --max_downloads INT Maximum number of FASTQs to download at once. @@ -1794,16 +1794,16 @@ def full_help() { to continue downstream analyses. Default: ${params.min_reads} - --min_proportion FLOAT The minimum proportion of basepairs for paired-end reads to continue - downstream analyses. Example: If set to 0.75 the R1 and R2 must - have > 75% proportion of reads (e.g. R1 100bp, R2 75bp, not + --min_proportion FLOAT The minimum proportion of basepairs for paired-end reads to continue + downstream analyses. Example: If set to 0.75 the R1 and R2 must + have > 75% proportion of reads (e.g. R1 100bp, R2 75bp, not R1 100bp, R2 50bp) Default: ${params.min_proportion} --skip_fastq_check The input FASTQs will not be check to verify they meet the - minimum requirements to be processed. This parameter - is useful if you are confident your sequences will - pass the minimum requirements. + minimum requirements to be processed. This parameter + is useful if you are confident your sequences will + pass the minimum requirements. Estimate Genome Size Parameters: Only applied if the genome size is estimated. 
@@ -1950,54 +1950,54 @@ def full_help() { Default: ${params.unicycler_ram} GB --unicycler_mode STR Bridging mode used by Unicycler, choices are: - conservative = smaller contigs, lowest + conservative = smaller contigs, lowest misassembly rate - normal = moderate contig size and + normal = moderate contig size and misassembly rate (Default) - bold = longest contigs, higher misassembly + bold = longest contigs, higher misassembly rate - --min_polish_size INT Contigs shorter than this value (bp) will not be + --min_polish_size INT Contigs shorter than this value (bp) will not be polished using Pilon Default: ${params.min_polish_size} --min_component_size INT - Graph components smaller than this size (bp) will + Graph components smaller than this size (bp) will be removed from the final graph Default: ${params.min_component_size} - --min_dead_end_size INT - Graph dead ends smaller than this size (bp) will + --min_dead_end_size INT + Graph dead ends smaller than this size (bp) will be removed from the final graph Default: ${params.min_dead_end_size} - --no_miniasm Skip miniasm+Racon bridging + --no_miniasm Skip miniasm+Racon bridging Default: Produce long-read bridges - --no_rotate Do not rotate completed replicons to start at a + --no_rotate Do not rotate completed replicons to start at a standard gene - --no_pilon Do not use Pilon to polish the final assembly + --no_pilon Do not use Pilon to polish the final assembly Assembly Quality Control Parameters: - --skip_checkm CheckM analysis will be skipped. This is useful for systems + --skip_checkm CheckM analysis will be skipped. This is useful for systems with less than 8GB of memory. - --checkm_unique INT Minimum number of unique phylogenetic markers required + --checkm_unique INT Minimum number of unique phylogenetic markers required to use lineage-specific marker set. Default: ${params.checkm_unique} - + --checkm_multi INT Maximum number of multi-copy phylogenetic markers before defaulting to domain-level marker set. Default: ${params.checkm_multi} - + --aai_strain FLOAT AAI threshold used to identify strain heterogeneity Default: ${params.aai_strain} - + --checkm_length FLOAT Percent overlap between target and query Default: ${params.checkm_length} - --full_tree Use the full tree (requires ~40GB of memory) for determining + --full_tree Use the full tree (requires ~40GB of memory) for determining lineage of each bin. Default: Use reduced tree (<16gb memory) @@ -2014,17 +2014,17 @@ def full_help() { --no_refinement Do not perform lineage-specific marker set refinement - --individual_markers Treat marker as independent (i.e., ignore co-located + --individual_markers Treat marker as independent (i.e., ignore co-located set structure. - --skip_adj_correction Do not exclude adjacent marker genes when estimating + --skip_adj_correction Do not exclude adjacent marker genes when estimating contamination --contig_thresholds STR Comma-separated list of contig length thresholds Default: ${params.contig_thresholds} --plots_format STR Save plots in specified format. 
-                                Supported formats: emf, eps, pdf, png, ps, raw,
+                                Supported formats: emf, eps, pdf, png, ps, raw,
                                 rgba, svg, svgz
                                 Default: ${params.plots_format}
@@ -2055,7 +2055,7 @@ def full_help() {
                                 Default: ${params.prokka_coverage}

         --nogenes               Do not add 'gene' features for each 'CDS' feature
-
+
         --norrna                Don't run rRNA search

         --notrna                Don't run tRNA search
@@ -2213,7 +2213,7 @@ def full_help() {
                                 Default: ${params.bwa_n}

     Antimicrobial Resistance Parameters:
-        --skip_amr              AMRFinder+ analysis will be skipped. This is useful
+        --skip_amr              AMRFinder+ analysis will be skipped. This is useful
                                 if the AMRFinder+ software and database versions
                                 are no longer compatible.
@@ -2235,6 +2235,6 @@
         --amr_plus              Add the plus genes to the report

         --amr_report_common     Suppress proteins common to a taxonomy group
-
+
     """
 }
diff --git a/modules/ariba/ariba_analysis/README.md b/modules/ariba/ariba_analysis/README.md
new file mode 100644
index 000000000..1ad561c35
--- /dev/null
+++ b/modules/ariba/ariba_analysis/README.md
@@ -0,0 +1,16 @@
+# ariba_analysis process testing:
+
+This process runs reads against all available (if any) ARIBA datasets.
+## About testing this process:
+
+Using DSL2, each module can be tested separately with a test workflow inside the process `.nf` file. Testing requires 3 items:
+- the local files in `test_data`
+- params in `test_params.yaml`
+- `test` profile in `nextflow.config`
+
+## How to test it:
+
+$ nextflow run ariba_analysis.nf -params-file test_params.yaml -profile test,docker -entry test
+
+
+If you've used `bactopia conda activate`, you can also replace `docker` with `conda` to test with Conda.
diff --git a/modules/ariba/ariba_analysis/ariba_analysis.nf b/modules/ariba/ariba_analysis/ariba_analysis.nf
new file mode 100644
index 000000000..3a5b9346a
--- /dev/null
+++ b/modules/ariba/ariba_analysis/ariba_analysis.nf
@@ -0,0 +1,51 @@
+nextflow.enable.dsl = 2
+
+process ARIBA_ANALYSIS {
+    /* Run reads against all available (if any) ARIBA datasets */
+    tag "${sample} - ${dataset_name}"
+
+    publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*"
+    publishDir "${outdir}/${sample}/ariba", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${dataset_name}/*"
+
+    input:
+    tuple val(sample), val(single_end), path(fq)
+    each path(dataset)
+
+    output:
+    file "${dataset_name}/*"
+    file "${task.process}/*" optional true
+
+    when:
+    single_end == false && ARIBA_DATABASES.isEmpty() == false
+
+    shell:
+    dataset_tarball = file(dataset).getName()
+    dataset_name = dataset_tarball.replace('.tar.gz', '')
+    spades_options = params.spades_options ? "--spades_options '${params.spades_options}'" : ""
+    noclean = params.ariba_no_clean ? "--noclean" : ""
+
+    template "ariba_analysis.sh"
+    stub:
+    dataset_tarball = file(dataset).getName()
+    dataset_name = dataset_tarball.replace('.tar.gz', '')
+    """
+    mkdir ${dataset_name}
+    mkdir ${task.process}
+    touch ${dataset_name}/${sample}
+    touch ${task.process}/${sample}
+    """
+}
+
+//###############
+//Module testing
+//###############
+
+workflow test {
+    TEST_PARAMS_CH = Channel.of([
+        params.sample,
+        params.single_end,
+        file(params.fq)
+        ])
+    TEST_PARAMS_CH2 = Channel.of(file(params.card), file(params.vfdb))
+    ARIBA_ANALYSIS(TEST_PARAMS_CH, TEST_PARAMS_CH2.collect())
+}
diff --git a/modules/ariba/ariba_analysis/bin/build-containers.sh b/modules/ariba/ariba_analysis/bin/build-containers.sh
new file mode 100755
index 000000000..b5a900295
--- /dev/null
+++ b/modules/ariba/ariba_analysis/bin/build-containers.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+# build-containers
+#
+# Automate the building of Bactopia related containers
+VERSION=1.6.0
+CONTAINER_VERSION="${VERSION%.*}.x"
+
+function singularity_build {
+    recipe=$1
+    name=$2
+    image=$3
+    version=$4
+    latest=${5:-0}
+
+    echo "Working on ${recipe}"
+    singularity build -F ${image} ${recipe}
+    singularity sign ${image}
+    singularity push ${image} library://rpetit3/bactopia/${name}:${version}
+
+    if [[ "${latest}" == "1" ]]; then
+        singularity push ${image} library://rpetit3/bactopia/${name}:latest
+    fi
+}
+
+function docker_build {
+    recipe=$1
+    image=$2
+    latest=${3:-0}
+
+    echo "Working on ${recipe}"
+    docker build --rm -t ${image} -f ${recipe} .
+    docker push ${image}
+
+    if [[ "${latest}" != "0" ]]; then
+        docker tag ${image} ${latest}
+        docker push ${latest}
+    fi
+}
+
+
+if [[ $# == 0 ]]; then
+    echo ""
+    echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR"
+    echo ""
+    echo "Example Command"
+    echo "build-containers.sh /home/bactopia/bactopia container-images/ "
+    echo ""
+    exit
+fi
+
+BACTOPIA_DIR=$1
+OUTPUT_DIR=${2:-"./"}
+if [ -z ${BACTOPIA_DIR} ]; then
+    echo "Got ${#} arguments"
+    echo "Must give the path to Bactopia repository"
+    exit 1
+fi
+MAJOR_VERSION=${3:-"0"}
+
+mkdir -p ${OUTPUT_DIR}
+
+# Build Bactopia containers
+#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1
+#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest
+
+if [ "${MAJOR_VERSION}" == "1" ]; then
+    # Build Singularity
+    for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do
+        recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}"
+        recipe_name=$(echo ${recipe} | sed 's/.Singularity//')
+        recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg"
+        singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION}
+    done
+
+    # Build Docker
+    docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest
+    for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do
+        recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}"
+        recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//')
+        recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}"
+        #docker_build ${recipe_path} ${recipe_image}
+    done
+
+    # Build Bactopia Tools containers
+    for tool in $(ls "${BACTOPIA_DIR}/tools"); do
+        recipe_path="${BACTOPIA_DIR}/tools/${tool}"
+        docker_file="${recipe_path}/Dockerfile"
+        docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}"
+        #docker_build ${docker_file} ${docker_image}
+
+        singularity_file="${recipe_path}/Singularity"
singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/ariba/ariba_analysis/bin/check-assembly-accession.py b/modules/ariba/ariba_analysis/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/ariba/ariba_analysis/bin/check-fastqs.py b/modules/ariba/ariba_analysis/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. 
+""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/ariba/ariba_analysis/bin/check-staging.py b/modules/ariba/ariba_analysis/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='R2 Fastq of a read pair.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # Paired-end input, but the R2 FASTQ is not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/ariba/ariba_analysis/bin/cleanup-coverage.py b/modules/ariba/ariba_analysis/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Per-base coverage output from genomeCoverageBed + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # Header lines look like: ##contig=<ID=ACCESSION,length=LENGTH> + contig = re.search(r'contig=<ID=(.*),length=(.*)>', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Reduce redundancy in per-base coverage.'
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage output from genomeCoverageBed') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=<ID={accession},length={vals["length"]}>') + for cov in vals['positions']: + print(cov) diff --git a/modules/ariba/ariba_analysis/bin/create-tool.sh b/modules/ariba/ariba_analysis/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# create-tool +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguments" + echo "Must give a path to the Bactopia repository, a tool name, and a tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/ariba/ariba_analysis/bin/gh-actions/free-disk-space.sh b/modules/ariba/ariba_analysis/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures.
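+# Bactopia reuses this script in its GitHub Actions CI to free up runner disk space.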
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/ariba/ariba_analysis/bin/gh-actions/setup-bactopia-env.sh b/modules/ariba/ariba_analysis/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/ariba/ariba_analysis/bin/gh-actions/setup-docker-builds.py b/modules/ariba/ariba_analysis/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. 
+ --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker 
build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + 
logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/ariba/ariba_analysis/bin/helpers/bactopia-build.py b/modules/ariba/ariba_analysis/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + 
stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = 
f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/ariba/ariba_analysis/bin/helpers/bactopia-citations.py b/modules/ariba/ariba_analysis/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/ariba/ariba_analysis/bin/helpers/bactopia-datasets.py b/modules/ariba/ariba_analysis/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
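+ # Dry-run ncbi-genome-download first to list candidate accessions; the output is parsed below to build the accession list that is actually downloaded.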
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
+ ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' + ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + 
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/ariba/ariba_analysis/bin/helpers/bactopia-prepare.py b/modules/ariba/ariba_analysis/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. 
+ Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path'
+    )
+
+    parser.add_argument('--version', action='version',
+                        version=f'{PROGRAM} {VERSION}')
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(0)
+
+    args = parser.parse_args()
+
+    # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob
+    abspath = os.path.abspath(args.path)
+    SAMPLES = {}
+
+    # Match FASTQS
+    for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive):
+        fastq_name = fastq.name.replace(args.fastq_ext, "")
+        # Split the fastq file name on separator
+        # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE)
+        # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE)
+        split_vals = fastq_name.rsplit(args.fastq_seperator, 1)
+        sample_name = split_vals[0]
+        if sample_name not in SAMPLES:
+            SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []}
+
+        if len(split_vals) == 1:
+            # single-end
+            SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix))
+        else:
+            # paired-end
+            pe1 = re.compile(args.pe1_pattern)
+            pe2 = re.compile(args.pe2_pattern)
+            if pe1.match(split_vals[1]):
+                SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix))
+            elif pe2.match(split_vals[1]):
+                SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix))
+            else:
+                print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr)
+                print(f'ERROR: Found {split_vals[1]}, expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr)
+                print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr)
+                sys.exit(1)
+
+    # Match assemblies
+    for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive):
+        sample_name = os.path.basename(assembly).replace(args.assembly_ext, "")
+        if sample_name not in SAMPLES:
+            # Use the same 'pe' structure as FASTQ samples so the validation loop below can read it
+            SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []}
+        SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix))
+
+    FOFN = []
+    for sample, vals in sorted(SAMPLES.items()):
+        r1_reads = vals['pe']['r1']
+        r2_reads = vals['pe']['r2']
+        se_reads = vals['se']
+        assembly = vals['assembly']
+        errors = []
+        is_single_end = False
+        multiple_read_sets = False
+        pe_count = len(r1_reads) + len(r2_reads)
+
+        # Validate everything
+        if len(assembly) > 1:
+            # Can't have multiple assemblies for the same sample
+            errors.append(f'ERROR: "{sample}" cannot have more than one assembly FASTA, please check.')
+        elif len(assembly) == 1 and (pe_count or len(se_reads)):
+            # Can't have an assembly and reads for a sample
+            errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.')
+
+        if len(r1_reads) != len(r2_reads):
+            # PE reads must be a pair
+            errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}), please check.')
+        elif pe_count > 2:
+            # PE reads must be a pair
+            if args.merge:
+                multiple_read_sets = True
+            else:
+                errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.')
+
+        if args.long_reads:
+            if not pe_count and len(se_reads):
+                # Long reads must also have short PE reads
+                print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr)
+                is_single_end = True
+
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/ariba/ariba_analysis/bin/helpers/bactopia-pull.py b/modules/ariba/ariba_analysis/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/ariba/ariba_analysis/bin/helpers/bactopia-search.py b/modules/ariba/ariba_analysis/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + filtered = 
{'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...",
+                  file=sys.stderr)
+            sys.exit(1)
+        else:
+            min_base_count = args.min_coverage * args.genome_size
+    elif args.min_coverage or args.genome_size:
+        print("--min_coverage and --genome_size must be used together. Exiting...",
+              file=sys.stderr)
+        sys.exit(1)
+
+    if args.biosample_subset > 0:
+        if not is_biosample(args.query):
+            print(f"--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...",
+                  file=sys.stderr)
+            sys.exit(1)
+
+    today = datetime.datetime.now().replace(microsecond=0).isoformat()
+    results = []
+    result_header = None
+    accessions = []
+    filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}}
+    summary = []
+    queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon)
+    i = 1
+    results_file = f'{args.outdir}/{args.prefix}-results.txt'
+    accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt'
+    filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt'
+    for query_type, query in queries:
+        is_accession = True if query_type == 'accession' else False
+        query_header, query_results = ena_search(query, is_accession, limit=args.limit)
+        results = list(set(results + query_results))
+        if not result_header:
+            result_header = query_header
+        query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length,
+                                                            min_base_count=min_base_count)
+        if len(query_accessions):
+            WARNING_MESSAGE = None
+            if query_type == 'biosample' and args.biosample_subset > 0:
+                if len(query_accessions) > args.biosample_subset:
+                    WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}'
+                    query_accessions = random.sample(query_accessions, args.biosample_subset)
+            accessions = list(set(accessions + query_accessions))
+            filtered['min_base_count'] += query_filtered['min_base_count']
+            filtered['min_read_length'] += query_filtered['min_read_length']
+            filtered['technical'] += query_filtered['technical']
+            for filtered_sample in query_filtered['filtered']:
+                filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason']
+        else:
+            if query_results:
+                WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.'
+            else:
+                WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.'
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/ariba/ariba_analysis/bin/helpers/bactopia-summary.py b/modules/ariba/ariba_analysis/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/ariba/ariba_analysis/bin/helpers/bactopia-tools.py b/modules/ariba/ariba_analysis/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/ariba/ariba_analysis/bin/helpers/bactopia-versions.py b/modules/ariba/ariba_analysis/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/ariba/ariba_analysis/bin/mask-consensus.py b/modules/ariba/ariba_analysis/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/ariba/ariba_analysis/bin/merge-blast-json.py b/modules/ariba/ariba_analysis/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/ariba/ariba_analysis/bin/mlst-blast.py b/modules/ariba/ariba_analysis/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
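+
+example (illustrative file names only):
+  mlst-blast.py assembly.fna.gz mlst-blastdb/ mlst-results.json --cpu 4 --compressed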
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/ariba/ariba_analysis/bin/select-references.py b/modules/ariba/ariba_analysis/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/ariba/ariba_analysis/bin/split-coverages.py b/modules/ariba/ariba_analysis/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/ariba/ariba_analysis/bin/update-conda.sh b/modules/ariba/ariba_analysis/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/ariba/ariba_analysis/bin/update-docker.sh b/modules/ariba/ariba_analysis/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/ariba/ariba_analysis/bin/update-tools.sh b/modules/ariba/ariba_analysis/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/ariba/ariba_analysis/bin/update-version.sh b/modules/ariba/ariba_analysis/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/ariba/ariba_analysis/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/ariba/ariba_analysis/nextflow.config b/modules/ariba/ariba_analysis/nextflow.config new file mode 100644 index 000000000..d8d234015 --- /dev/null +++ b/modules/ariba/ariba_analysis/nextflow.config @@ -0,0 +1,40 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' + mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + conda { + process { + withName: ariba_analysis { + conda = "${baseDir}/../../../conda/envs/ariba_analysis-1.7.x"} + } + } + + docker { + process { + withName: ariba_analysis { + container = "ghcr.io/bactopia/ariba_analysis:1.6.0"} + + } + } + test { + env { + container_version = "1.6.x" + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = false + run_type = "fastqs" + ARIBA_DATABASES = ["card", "vfdb_core"] + } + + } +} diff --git a/modules/ariba/ariba_analysis/templates/ariba_analysis.sh b/modules/ariba/ariba_analysis/templates/ariba_analysis.sh new file mode 100644 index 000000000..caac74d2a --- /dev/null +++ b/modules/ariba/ariba_analysis/templates/ariba_analysis.sh @@ -0,0 +1,61 @@ +#!/bin/bash +set -e +set -u +LOG_DIR="!{task.process}" +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions + +# Print captured STDERR incase of exit +function print_stderr { + cat .command.err 1>&2 + ls ${LOG_DIR}/ | grep ".err" | xargs -I {} cat ${LOG_DIR}/{} 1>&2 +} +trap print_stderr EXIT + +# Verify AWS files were staged +if [[ ! 
-L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} + fi +fi + +tar -xzvf !{dataset_tarball} +mv !{dataset_name} !{dataset_name}db +# ariba Version +echo "# Ariba Version" >> ${LOG_DIR}/!{task.process}.versions +ariba version >> ${LOG_DIR}/!{task.process}.versions 2>&1 +ariba run !{dataset_name}db !{fq} !{dataset_name} \ + --nucmer_min_id !{params.nucmer_min_id} \ + --nucmer_min_len !{params.nucmer_min_len} \ + --nucmer_breaklen !{params.nucmer_breaklen} \ + --assembly_cov !{params.assembly_cov} \ + --min_scaff_depth !{params.min_scaff_depth} \ + --assembled_threshold !{params.assembled_threshold} \ + --gene_nt_extend !{params.gene_nt_extend} \ + --unique_threshold !{params.unique_threshold} \ + --threads !{task.cpus} \ + --force \ + --verbose !{noclean} !{spades_options} > ${LOG_DIR}/ariba.out 2> ${LOG_DIR}/ariba.err + +ariba summary !{dataset_name}/summary !{dataset_name}/report.tsv \ + --cluster_cols assembled,match,known_var,pct_id,ctg_cov,novel_var \ + --col_filter n --row_filter n > ${LOG_DIR}/ariba-summary.out 2> ${LOG_DIR}/ariba-summary.err + +rm -rf ariba.tmp* + +if [ "!{params.keep_all_files}" == "false" ]; then + # Remove Ariba DB that was untarred + rm -rf !{dataset_name}db +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/ariba/ariba_analysis/test_params.yaml b/modules/ariba/ariba_analysis/test_params.yaml new file mode 100644 index 000000000..64809d5cf --- /dev/null +++ b/modules/ariba/ariba_analysis/test_params.yaml @@ -0,0 +1,68 @@ +outdir: + "test_output" + +sample + "TEST_SAMPLE" + +sample_type: + "paired-end" + +single_end: + false + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" + +card: + "test_data/card.tar.gz" + +vfdb: + "test_data/vfdb_core.tar.gz" + + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + "some_value" +keep_all_files: + false + +skip_logs: + false + +nucmer_min_id: + 90 + +nucmer_min_len: + 20 + +nucmer_breaklen: + 200 + +assembly_cov: + 50 + +min_scaff_depth: + 10 + +spades_options: + null + +assembled_threshold: + 0.95 + +gene_nt_extend: + 30 + +unique_threshold: + 0.03 + +ariba_no_clean: + false diff --git a/modules/blast/blast_genes/README.md b/modules/blast/blast_genes/README.md new file mode 100644 index 000000000..3815bd254 --- /dev/null +++ b/modules/blast/blast_genes/README.md @@ -0,0 +1,17 @@ +# blast_genes process testing: + +This process queries gene FASTA files against annotated assembly using BLAST + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run blast_genes.nf -params-file test_params.yaml -profile test,docker -entry test + + +if you've used `bactopia conda activate` you can also trade `docker` by conda to test with conda. 
diff --git a/modules/blast/blast_genes/bin/build-containers.sh b/modules/blast/blast_genes/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/blast/blast_genes/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/blast/blast_genes/bin/check-assembly-accession.py b/modules/blast/blast_genes/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/blast/blast_genes/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/blast/blast_genes/bin/check-fastqs.py b/modules/blast/blast_genes/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/blast/blast_genes/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. 
Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/blast/blast_genes/bin/check-staging.py b/modules/blast/blast_genes/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/blast/blast_genes/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/blast/blast_genes/bin/cleanup-coverage.py b/modules/blast/blast_genes/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/blast/blast_genes/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/modules/blast/blast_genes/bin/create-tool.sh b/modules/blast/blast_genes/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/blast/blast_genes/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# build-containers +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguement" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/blast/blast_genes/bin/gh-actions/free-disk-space.sh b/modules/blast/blast_genes/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/blast/blast_genes/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. 
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/blast/blast_genes/bin/gh-actions/setup-bactopia-env.sh b/modules/blast/blast_genes/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/blast/blast_genes/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/blast/blast_genes/bin/gh-actions/setup-docker-builds.py b/modules/blast/blast_genes/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/blast/blast_genes/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. 
+ --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker 
build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + 
logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/blast/blast_genes/bin/helpers/bactopia-build.py b/modules/blast/blast_genes/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/blast/blast_genes/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + 
stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = 
f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/blast/blast_genes/bin/helpers/bactopia-citations.py b/modules/blast/blast_genes/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/blast/blast_genes/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/blast/blast_genes/bin/helpers/bactopia-datasets.py b/modules/blast/blast_genes/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/blast/blast_genes/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
+                                )
+
+            total_genome = len(genome_sizes)
+            if not skip_genome_size:
+                median_genome = int(median(genome_sizes))
+                logging.info(
+                    f'Median genome size: {median_genome} (n={total_genome})'
+                )
+            cdhit_cds = f'{prokka_dir}/proteins.faa'
+            logging.info(f'Running CD-HIT on {count} proteins')
+            g = 0 if fast_cluster else 1
+            execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} '
+                     f'-g {g} -c {identity} -T {cpus} -M {max_memory}'))
+
+            # Make sketch/signatures
+            execute(
+                f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn',
+                directory=minmer_dir
+            )
+
+            # Finish up
+            with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh:
+                gs_dict = {
+                    'min': 0, 'median': 0, 'mean': 0, 'max': 0, 'total': 0,
+                    'description': 'No available completed genomes.'
+                }
+                if not skip_genome_size:
+                    gs_dict = {
+                        'min': min(genome_sizes),
+                        'median': int(median(genome_sizes)),
+                        'mean': int(mean(genome_sizes)),
+                        'max': max(genome_sizes),
+                        'total': total_genome,
+                        'description': (
+                            f'Genome size values are based on {total_genome} '
+                            'completed genomes (RefSeq).'
+                        )
+                    }
+                json.dump(gs_dict, genome_size_fh, indent=4)
+            execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt',
+                    directory=prokka_dir)
+            execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt',
+                    directory=prokka_dir)
+            execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt',
+                    directory=prokka_dir)
+            execute(
+                f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt',
+                directory=prokka_dir
+            )
+            execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt',
+                    directory=minmer_dir)
+
+            # Clean up
+            if not keep_files:
+                execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/')
+
+    else:
+        logging.info("No valid species to setup, skipping")
+
+
+def setup_amr(outdir, force=False):
+    """Download the latest antimicrobial resistance datasets."""
+    datasets = ['amrfinder']
+    amr_dir = f'{outdir}/antimicrobial-resistance'
+    update_timestamp = False
+    execute(f'mkdir -p {amr_dir}')
+
+    for dataset in datasets:
+        dataset_file = f'{amr_dir}/{dataset}.tar.gz'
+        if os.path.exists(dataset_file):
+            if force:
+                logging.info(f'--force, removing existing {dataset_file} setup')
+                execute(f'rm -f {dataset_file}')
+                update_timestamp = True
+            else:
+                logging.info(f'{dataset_file} exists, skipping')
+                continue
+
+        if dataset == 'amrfinder':
+            logging.info(f'Setting up latest AMRFinder+ database')
+            prefix = 'amrfinderdb'
+            execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir)
+            execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir)
+            execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir)
+            latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest')
+            execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir)
+            execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir)
+            execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir)
+            execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir)
+            logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz')
+
+
+def setup_minmer(outdir, force=False):
+    """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets."""
+    datasets = {
+        # Last updated: 2019-03-04
+        'genbank-k21.json.gz': 'https://osf.io/d7rv8/download',
+        'genbank-k31.json.gz': 'https://osf.io/4f8n3/download',
+        'genbank-k51.json.gz': 'https://osf.io/nemkw/download',
+        'refseq-k21-s1000.msh': (
+            'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh'
+        )
+    }
+
+    minmer_dir = f'{outdir}/minmer'
+    update_timestamp = False
+    if force:
+
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)')
+    )
+    group3.add_argument(
+        '--overlap', metavar="FLOAT", type=float, default=0.8,
+        help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)')
+    )
+    group3.add_argument(
+        '--max_memory', metavar="INT", type=int, default=0,
+        help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited)')
+    )
+    group3.add_argument(
+        '--fast_cluster', action='store_true',
+        help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the "
+              "accurate but slow algorithm.")
+    )
+
+
+    group4 = parser.add_argument_group('Minmer Datasets')
+    group4.add_argument(
+        '--skip_minmer', action='store_true',
+        help='Skip download of pre-computed minmer datasets (mash, sourmash)'
+    )
+
+    group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch')
+    group5.add_argument(
+        '--skip_plsdb', action='store_true',
+        help='Skip download of pre-computed PLSDB databases (blast, mash)'
+    )
+
+    group6 = parser.add_argument_group('Antimicrobial Resistance Datasets')
+    group6.add_argument(
+        '--skip_amr', action='store_true',
+        help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)'
+    )
+
+    group7 = parser.add_argument_group('Optional User Provided Datasets')
+    group7.add_argument(
+        '--prodigal_tf', metavar="STR", type=str,
+        help=("A pre-built Prodigal training file to add to the species "
+              "annotation folder. Requires a single species (--species) and "
+              "will replace existing training files.")
+    )
+
+    group7.add_argument(
+        '--reference', metavar="STR", type=str,
+        help=("A reference genome (FASTA/GenBank (preferred)) file or directory "
+              "to be added to the optional folder for variant calling. Requires "
+              "a single species (--species).")
+    )
+    group7.add_argument(
+        '--mapping', metavar="STR", type=str,
+        help=("A reference sequence (FASTA) file or directory to be added to the "
+              "optional folder for mapping. Requires a single species (--species).")
+    )
+    group7.add_argument(
+        '--genes', metavar="STR", type=str,
+        help=("A gene sequence (FASTA) file or directory to be added to the "
+              "optional folder for BLAST. Requires a single species (--species).")
+    )
+    group7.add_argument(
+        '--proteins', metavar="STR", type=str,
+        help=("A protein sequence (FASTA) file or directory to be added to the "
+              "optional folder for BLAST. Requires a single species (--species).")
+    )
+    group7.add_argument(
+        '--primers', metavar="STR", type=str,
+        help=("A primer sequence (FASTA) file or directory to be added to the "
+              "optional folder for BLAST. Requires a single species (--species).")
+    )
+    group7.add_argument(
+        '--force_optional', action='store_true',
+        help='Overwrite any existing files in the optional folders'
+    )
+
+    group8 = parser.add_argument_group('Custom Options')
+    group8.add_argument(
+        '--cpus', metavar="INT", type=int, default=1,
+        help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/blast/blast_genes/bin/helpers/bactopia-prepare.py b/modules/blast/blast_genes/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/blast/blast_genes/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. 
+ Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + 
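+        # NOTE (editorial example, not part of the original patch): with --long_reads, a
+        # sample that has R1/R2 FASTQs plus a single-end FASTQ is emitted further below as
+        # runtype 'hybrid', e.g. a hypothetical FOFN row (paths are made up):
+        #   sample01    hybrid    /data/sample01_R1.fastq.gz    /data/sample01_R2.fastq.gz    /data/sample01.fastq.gz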
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/blast/blast_genes/bin/helpers/bactopia-pull.py b/modules/blast/blast_genes/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/blast/blast_genes/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/blast/blast_genes/bin/helpers/bactopia-search.py b/modules/blast/blast_genes/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/blast/blast_genes/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + filtered = 
{'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
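+        # NOTE (editorial example, not part of the original patch): the summary block built
+        # below is produced per query; with the default --outdir/--prefix a hypothetical run
+        # could print something like:
+        #   QUERY: tax_tree(1280)
+        #   DATE: 2021-02-18T15:49:18
+        #   LIMIT: 1000000
+        #   RESULTS: 5000 (./ena-results.txt)
+        #   ILLUMINA ACCESSIONS: 4800 (./ena-accessions.txt)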
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/blast/blast_genes/bin/helpers/bactopia-summary.py b/modules/blast/blast_genes/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/blast/blast_genes/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/blast/blast_genes/bin/helpers/bactopia-tools.py b/modules/blast/blast_genes/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/blast/blast_genes/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/blast/blast_genes/bin/helpers/bactopia-versions.py b/modules/blast/blast_genes/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/blast/blast_genes/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/blast/blast_genes/bin/mask-consensus.py b/modules/blast/blast_genes/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/blast/blast_genes/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+  --version     show program's version number and exit
+"""
+PROGRAM = "mask-consensus"
+VERSION = "1.6.0"
+import sys
+
+
+def read_coverage(coverage):
+    """Read the per-base coverage input."""
+    import re
+    accession = None
+    length = None
+    first_line = True
+    coverages = {}
+    with open(coverage, 'rt') as coverage_fh:
+        for line in coverage_fh:
+            line = line.rstrip()
+            if line.startswith('##'):
+                # ##contig=<ID=ACCESSION,length=LENGTH>
+                contig = re.search(r'contig=<ID=(.*),length=([0-9]+)>', line)
+                if contig:
+                    accession = contig.group(1)
+                    length = contig.group(2)
+                    coverages[accession] = {'length':int(length), 'positions': []}
+                else:
+                    print(f'{line} is an unexpected format.', file=sys.stderr)
+                    sys.exit(1)
+            else:
+                if line:
+                    coverages[accession]['positions'].append(int(line))
+
+    for accession, vals in coverages.items():
+        if len(vals['positions']) != vals['length']:
+            print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr)
+            sys.exit(1)
+
+    return coverages
+
+
+def read_vcf(vcf):
+    """Get positions with a substitution."""
+    subs = {}
+    with open(vcf, 'rt') as vcf_fh:
+        for line in vcf_fh:
+            if not line.startswith("#"):
+                line = line.split('\t')
+                # 0 = accession, 1 = position
+                if line[0] not in subs:
+                    subs[line[0]] = {}
+                subs[line[0]][line[1]] = True
+    return subs
+
+
+def read_fasta(fasta):
+    """Parse the input FASTA file."""
+    from Bio import SeqIO
+    seqs = {}
+    with open(fasta, 'r') as fasta_fh:
+        for record in SeqIO.parse(fasta_fh,'fasta'):
+            seqs[record.name] = str(record.seq)
+    return seqs
+
+
+def mask_sequence(sequence, coverages, subs, mincov):
+    """Mask positions with low or no coverage in the input FASTA."""
+    masked_seqs = {}
+
+    for accession, vals in coverages.items():
+        bases = []
+        coverage = vals['positions']
+        for i, cov in enumerate(coverage):
+            if cov >= mincov:
+                # Passes
+                if accession in subs:
+                    if str(i+1) in subs[accession]:
+                        # Substitution
+                        bases.append(sequence[accession][i].lower())
+                    else:
+                        # Same as reference
+                        bases.append(sequence[accession][i])
+                else:
+                    # No SNPs, Same as reference
+                    bases.append(sequence[accession][i])
+            elif cov:
+                # Low coverage
+                bases.append("N")
+            else:
+                # 0 coverage
+                bases.append('n')
+
+        if len(bases) != len(sequence[accession]):
+            print(f'Masked sequence ({len(bases)}) for {accession} not expected length ({len(sequence[accession])}).',
+                  file=sys.stderr)
+            sys.exit(1)
+        else:
+            masked_seqs[accession] = bases
+
+    return masked_seqs
+
+
+def format_header(sample, reference, accession, length):
+    """Return a newly formatted header."""
+    title = f'Pseudo-seq with called substitutions and low coverage masked'
+    return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]'
+
+
+def chunks(s, n):
+    """
+    Produce `n`-character chunks from `s`.
+    https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters
+    """
+    for start in range(0, len(s), n):
+        yield s[start:start+n]
+
+
+if __name__ == '__main__':
+    import argparse as ap
+    import sys
+
+    parser = ap.ArgumentParser(
+        prog=PROGRAM,
+        conflict_handler='resolve',
+        description=(
+            f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.'
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/blast/blast_genes/bin/merge-blast-json.py b/modules/blast/blast_genes/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/blast/blast_genes/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/blast/blast_genes/bin/mlst-blast.py b/modules/blast/blast_genes/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/blast/blast_genes/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
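+            # NOTE (editorial, not part of the original patch): 'pattern' is the sorted,
+            # ';'-joined list of perfectly matched allele hits (e.g. a hypothetical
+            # 'arcC.3;aroE.3;glpF.1;gmk.1;pta.4;tpi.4;yqiL.3'); if it matches a key built
+            # from profile.txt the corresponding ST is assigned, otherwise it is 'Novel'.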
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/blast/blast_genes/bin/select-references.py b/modules/blast/blast_genes/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/blast/blast_genes/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/blast/blast_genes/bin/split-coverages.py b/modules/blast/blast_genes/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/blast/blast_genes/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/blast/blast_genes/bin/update-conda.sh b/modules/blast/blast_genes/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/blast/blast_genes/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/blast/blast_genes/bin/update-docker.sh b/modules/blast/blast_genes/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/blast/blast_genes/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
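update-conda.sh above writes an .md5 fingerprint next to every exported environment YAML (md5sum on Linux, md5 -r on macOS) and injects it into the matching Dockerfile as the conda.md5 label; setup-docker-builds.py later in this patch compares that fingerprint against the label on the previously published container to decide whether a rebuild is actually needed. The same fingerprint computed in Python, as an illustrative sketch with a hypothetical file name:

import hashlib
from pathlib import Path

def env_md5(yaml_path: str) -> str:
    """Return the md5 hex digest of an exported conda environment YAML."""
    return hashlib.md5(Path(yaml_path).read_bytes()).hexdigest()

digest = env_md5("annotate_genome.yml")             # hypothetical exported environment file
Path("annotate_genome.md5").write_text(digest + "\n")
print(digest)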
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/blast/blast_genes/bin/update-tools.sh b/modules/blast/blast_genes/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/blast/blast_genes/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
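docker_build in update-docker.sh above pushes each image to Docker Hub and then re-tags and pushes it for every extra registry listed in REPOSITORY; setup-docker-builds.py applies the same fan-out for ghcr.io and quay.io. A minimal Python sketch of that tag-and-push loop, assuming the docker CLI is on PATH and the image already exists locally (registries here are placeholders):

import subprocess

def push_everywhere(image: str, registries=("ghcr.io", "quay.io")):
    """Push an image, then re-tag and push it to each extra registry."""
    subprocess.run(["docker", "push", image], check=True)
    for registry in registries:
        target = f"{registry}/{image}"
        subprocess.run(["docker", "tag", image, target], check=True)
        subprocess.run(["docker", "push", target], check=True)

push_everywhere("bactopia/bactopia:1.6.0")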
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/blast/blast_genes/bin/update-version.sh b/modules/blast/blast_genes/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/blast/blast_genes/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/blast/blast_genes/blast_genes.nf b/modules/blast/blast_genes/blast_genes.nf new file mode 100644 index 000000000..de92a5bdf --- /dev/null +++ b/modules/blast/blast_genes/blast_genes.nf @@ -0,0 +1,50 @@ +nextflow.enable.dsl = 2 + +process BLAST_GENES { + /* + Query gene FASTA files against annotated assembly using BLAST + */ + tag "${sample}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}/blast", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "genes/*.{json,json.gz}" + + input: + tuple val(sample), path(blastdb) + path(query) + + output: + path("genes/*.{json,json.gz}") + file "${task.process}/*" optional true + + when: + BLAST_GENE_FASTAS.isEmpty() == false + + shell: + template "blast_genes.sh" + + stub: + """ + mkdir ${task.process} + mkdir genes + touch ${task.process}/${sample} + touch genes/${sample}.json + touch genes/${sample}.json.gz + """ +} + +//############### +//Module testing +//############### + +workflow test { + TEST_PARAMS_CH = Channel.of([ + params.sample, + path(params.blastdb), + ]) + TEST_PARAMS_CH2 = Channel.of( + path(params.query) + ) + + blast_genes(TEST_PARAMS_CH,TEST_PARAMS_CH2) +} diff --git a/modules/blast/blast_genes/nextflow.config b/modules/blast/blast_genes/nextflow.config new file mode 100644 index 000000000..5634611a5 --- /dev/null +++ b/modules/blast/blast_genes/nextflow.config @@ -0,0 +1,46 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' 
+ mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + conda { + process { + withName: blast_genes { + conda = "${baseDir}/../../../conda/envs/annotate_genome-1.7.x"} + } + } + + docker { + process { + withName: blast_genes { + container = "ghcr.io/bactopia/annotate_genome:1.6.0"} + + } + } + + test { + process { + withName: blast_genes { + cpus = 2 + queue = 'long' + } + } + env { + BLAST_GENE_FASTAS = ["genes"] + VERSION = "1.6.0" + outdir = "test_output" + sample = "SRR2838702" + final_sample_type = "paired-end" + single_end = false + run_type = "fastqs" + } + + } +} diff --git a/modules/blast/blast_genes/templates/blast_genes.sh b/modules/blast/blast_genes/templates/blast_genes.sh new file mode 100644 index 000000000..4357edc36 --- /dev/null +++ b/modules/blast/blast_genes/templates/blast_genes.sh @@ -0,0 +1,45 @@ +#!/bin/bash +set -e +set -u + +LOG_DIR="!{task.process}" +OUTDIR=genes +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions +echo "# blastn Version" >> ${LOG_DIR}/!{task.process}.versions +blastn -version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + +echo "# Parallel Version" >> ${LOG_DIR}/!{task.process}.versions +parallel --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 +mkdir -p ${OUTDIR} +for fasta in *.fasta; do + type=`readlink -f ${fasta}` + name="${fasta%.*}" + mkdir -p temp_json + cat ${fasta} | sed -e 's/<[^>]*>//g' | + parallel --gnu --plain -j !{task.cpus} --recstart '>' -N 1 --pipe \ + blastn -db !{sample} \ + -outfmt 15 \ + -evalue 1 \ + -perc_identity !{params.perc_identity} \ + -qcov_hsp_perc !{params.qcov_hsp_perc} \ + -query - \ + -out temp_json/${name}_{#}.json + + merge-blast-json.py temp_json > ${OUTDIR}/${name}.json + rm -rf temp_json + + if [[ !{params.compress} == "true" ]]; then + pigz -n --best -p !{task.cpus} ${OUTDIR}/${name}.json + fi +done + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/blast/blast_genes/test_params.yaml b/modules/blast/blast_genes/test_params.yaml new file mode 100644 index 000000000..c270c80a2 --- /dev/null +++ b/modules/blast/blast_genes/test_params.yaml @@ -0,0 +1,41 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +single_end: + false + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +blastdb: + "test_data/SRR2838702*" + +query: + "test_data/dumb-gene.fasta" + +overwrite: + false + +compress: + false + +skip_logs: + false + +perc_identity: + 50 + +qcov_hsp_perc: + 50 + +max_target_seqs: + 2000 diff --git a/modules/blast/blast_primers/README.md b/modules/blast/blast_primers/README.md new file mode 100644 index 000000000..046f71754 --- /dev/null +++ b/modules/blast/blast_primers/README.md @@ -0,0 +1,17 @@ +# blast_primers process testing: + +This process queries primer FASTA files against annotated assembly using BLAST + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run blast_primers.nf -params-file test_params.yaml -profile test,docker -entry test + + +if 
you've used `bactopia conda activate` you can also trade `docker` by conda to test with conda. diff --git a/modules/blast/blast_primers/bin/build-containers.sh b/modules/blast/blast_primers/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/blast/blast_primers/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/blast/blast_primers/bin/check-assembly-accession.py b/modules/blast/blast_primers/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/blast/blast_primers/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/blast/blast_primers/bin/check-fastqs.py b/modules/blast/blast_primers/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/blast/blast_primers/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. 
Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/blast/blast_primers/bin/check-staging.py b/modules/blast/blast_primers/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/blast/blast_primers/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/blast/blast_primers/bin/cleanup-coverage.py b/modules/blast/blast_primers/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/blast/blast_primers/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/modules/blast/blast_primers/bin/create-tool.sh b/modules/blast/blast_primers/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/blast/blast_primers/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# build-containers +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguement" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/blast/blast_primers/bin/gh-actions/free-disk-space.sh b/modules/blast/blast_primers/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/blast/blast_primers/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. 
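The header above explains why roughly 28 GB has to be reclaimed on the CI machines; the script brackets its package removals with df -h so the freed space shows up in the CI log. The same before/after check from Python, illustrative only and not part of free-disk-space.sh:

import shutil

total, used, free = shutil.disk_usage("/")
print(f"total: {total / 1e9:.1f} GB, used: {used / 1e9:.1f} GB, free: {free / 1e9:.1f} GB")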
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/blast/blast_primers/bin/gh-actions/setup-bactopia-env.sh b/modules/blast/blast_primers/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/blast/blast_primers/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/blast/blast_primers/bin/gh-actions/setup-docker-builds.py b/modules/blast/blast_primers/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/blast/blast_primers/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. 
+ --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker 
build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + 
logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/blast/blast_primers/bin/helpers/bactopia-build.py b/modules/blast/blast_primers/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/blast/blast_primers/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + 
stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = 
f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/blast/blast_primers/bin/helpers/bactopia-citations.py b/modules/blast/blast_primers/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/blast/blast_primers/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/blast/blast_primers/bin/helpers/bactopia-datasets.py b/modules/blast/blast_primers/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/blast/blast_primers/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
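For the citations helper above: validate_args() reads data/bactopia-datasets-software.txt from the Bactopia repository, skips the header line beginning with 'name', and splits every remaining tab-separated row into a name, a reference type, and a citation string. A file in that shape would look roughly like the sample below (column names other than 'name' and all values are placeholders, not real entries):

    name          type      citation
    example-tool  software  Author A, et al. Example tool paper. Example Journal (2020)
    example-db    dataset   Author B, et al. Example database paper. Example Journal (2019)

The reference type becomes the grouping header in the printed output, and each citation is wrapped to 100 columns with textwrap.fill().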
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
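# Like the Ariba bundle built above, the BLAST databases are shipped as a tarball, so each
# schema ends up as <schema>-ariba.tar.gz plus <schema>-blastdb.tar.gz under
# <outdir>/<species>/mlst/<schema>/.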
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
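# The --dry-run output parsed below is assumed to list one candidate genome per line as
#   <assembly accession><TAB><organism name>
# with lines starting with 'Considering' skipped, for example (values illustrative only):
#   GCF_000000000.1<TAB>Staphylococcus aureus strain EXAMPLE
# Accessions whose organism name starts with the requested species are tracked separately,
# so a random --limit subsample can be topped up with at least one genome of that species.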
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
+ ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' + ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + 
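# --force (or --force_minmer) removes the whole minmer directory first, so the
# pre-computed mash/sourmash files are downloaded again from scratch.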
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
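# Each setup step combines the global --force with its dataset-specific flag
# (here args.force_ariba), so either one triggers a rebuild of that dataset.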
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/blast/blast_primers/bin/helpers/bactopia-prepare.py b/modules/blast/blast_primers/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/blast/blast_primers/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. 
+ Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + 
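# Short-read handling below: more than one single-end FASTQ per sample is only accepted
# when --merge is given, and mixing paired-end with single-end FASTQs (without
# --long_reads) is reported as an error.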
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/blast/blast_primers/bin/helpers/bactopia-pull.py b/modules/blast/blast_primers/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/blast/blast_primers/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
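The prepare helper above prints its FOFN as tab-separated columns sample, runtype, r1, r2 and extra, where runtype is one of single-end, paired-end, merge-se, merge-pe, hybrid, hybrid-merge-pe or assembly, and unused columns are left empty. With placeholder paths, a paired-end sample, a single-end sample and an assembly-only sample would be reported roughly as:

    sample    runtype     r1                           r2                           extra
    sample01  paired-end  /data/sample01_R1.fastq.gz   /data/sample01_R2.fastq.gz
    sample02  single-end  /data/sample02.fastq.gz
    sample03  assembly                                                              /data/sample03.fna.gz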
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/blast/blast_primers/bin/helpers/bactopia-search.py b/modules/blast/blast_primers/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/blast/blast_primers/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + filtered = 
{'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/blast/blast_primers/bin/helpers/bactopia-summary.py b/modules/blast/blast_primers/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/blast/blast_primers/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/blast/blast_primers/bin/helpers/bactopia-tools.py b/modules/blast/blast_primers/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/blast/blast_primers/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
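`get_output_files()` is left unimplemented in this patch. A minimal sketch of one possible return value, assuming the per-sample directory layout shown in its docstring (the sample name is borrowed from the module test parameters):

```python
#!/usr/bin/env python3
# Sketch of what get_output_files() might return, assuming the per-sample
# layout in its docstring; not part of the patch.

def get_output_files(sample_name):
    """Map summary sections to the per-sample paths they would be read from."""
    dirs = ['annotation', 'antimicrobial_resistance', 'ariba', 'assembly',
            'blast', 'kmers', 'logs', 'mapping', 'minmers', 'mlst',
            'quality-control', 'variants']
    outputs = {d: f'{sample_name}/{d}' for d in dirs}
    outputs['genome_size'] = f'{sample_name}/{sample_name}-genome-size.txt'
    return outputs

print(get_output_files('SRR2838702'))
```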
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/blast/blast_primers/bin/helpers/bactopia-versions.py b/modules/blast/blast_primers/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/blast/blast_primers/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
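The Conda caching above hinges on comparing the md5 recorded when an environment was last built against the md5 shipped alongside the tool's environment YAML. A small sketch of that decision, using hypothetical paths:

```python
#!/usr/bin/env python3
# Sketch of the md5-gated rebuild check used above; the file paths are
# hypothetical and only illustrate the decision.
import os

def first_line(path):
    with open(path, 'r') as fh:
        return fh.readline().rstrip()

expected_md5 = 'tools/roary/environment-linux.md5'            # shipped with the repo
envbuilt_file = 'conda/envs/tools-roary-1.6.x/env-built.txt'  # written after a build

if not (os.path.exists(expected_md5) and os.path.exists(envbuilt_file)):
    print('no existing environment (or no recorded md5), build it')
elif first_line(expected_md5) == first_line(envbuilt_file):
    print('existing environment is current, skipping the build')
else:
    print('environment is out of sync, rebuilding it')
```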
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/blast/blast_primers/bin/mask-consensus.py b/modules/blast/blast_primers/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/blast/blast_primers/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
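The version table above is assembled by scraping `name=version=build` dependency lines out of the exported Conda environment YAMLs. A sketch of that reduction on a hypothetical YAML snippet:

```python
#!/usr/bin/env python3
# Sketch of how exported conda YAML dependency lines reduce to a
# {program: version} map; the YAML snippet is a hypothetical example.

yaml_snippet = """\
dependencies:
  - blast=2.10.1=pl526he19e7b1_3
  - prokka=1.14.6=pl526_0
  - python=3.8.6
"""

versions = {}
for line in yaml_snippet.splitlines():
    line = line.strip()
    if '=' in line:
        program, version = line.replace('- ', '').split('=')[0:2]
        versions[program] = version

print(versions)  # {'blast': '2.10.1', 'prokka': '1.14.6', 'python': '3.8.6'}
```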
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/blast/blast_primers/bin/merge-blast-json.py b/modules/blast/blast_primers/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/blast/blast_primers/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/blast/blast_primers/bin/mlst-blast.py b/modules/blast/blast_primers/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/blast/blast_primers/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
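The consensus masking applied by `mask-consensus.py` above follows one per-base rule: zero coverage becomes `n`, coverage below `--mincov` becomes `N`, called substitutions are kept in lowercase, and everything else is the reference base. A sketch with hypothetical coverages and substitution positions:

```python
#!/usr/bin/env python3
# Sketch of the per-base masking rule applied by mask-consensus above;
# the sequence, coverages and substitution positions are hypothetical.

def mask_base(base, cov, is_sub, mincov=10):
    if cov == 0:
        return 'n'            # no coverage at all
    if cov < mincov:
        return 'N'            # covered, but below --mincov
    return base.lower() if is_sub else base  # substitutions kept lowercase

sequence = 'ACGTACGTAC'
coverage = [0, 3, 25, 25, 25, 9, 40, 40, 40, 40]
subs = {7}  # 1-based positions called as substitutions

masked = ''.join(mask_base(b, c, (i + 1) in subs)
                 for i, (b, c) in enumerate(zip(sequence, coverage)))
print(masked)  # nNGTANgTAC
```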
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/blast/blast_primers/bin/select-references.py b/modules/blast/blast_primers/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/blast/blast_primers/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
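Once every locus has a perfect BLAST match, the sequence type assignment above is a dictionary lookup keyed on the sorted, semicolon-joined allele calls, falling back to `Novel` when the pattern is absent from `profile.txt`. A sketch with a hypothetical seven-locus profile:

```python
#!/usr/bin/env python3
# Sketch of the ST lookup performed above once every locus has a perfect
# BLAST match; the profile and allele calls are hypothetical.

# profile.txt maps a sorted, ';'-joined allele pattern to a sequence type
profile = {
    'arcC.3;aroE.3;glpF.1;gmk.1;pta.4;tpi.4;yqiL.3': '8',
    'arcC.1;aroE.4;glpF.1;gmk.4;pta.12;tpi.1;yqiL.10': '5',
}

perfect_matches = ['arcC.3', 'aroE.3', 'glpF.1', 'gmk.1', 'pta.4', 'tpi.4', 'yqiL.3']
total_loci = 7

st = 'ND'  # not determined, e.g. a locus lacked a perfect match
if len(perfect_matches) == total_loci:
    pattern = ';'.join(sorted(perfect_matches))
    st = profile.get(pattern, 'Novel')  # complete match set not in the profile => novel ST

print(f'ST: {st}')
```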
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/blast/blast_primers/bin/split-coverages.py b/modules/blast/blast_primers/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/blast/blast_primers/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/blast/blast_primers/bin/update-conda.sh b/modules/blast/blast_primers/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/blast/blast_primers/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/blast/blast_primers/bin/update-docker.sh b/modules/blast/blast_primers/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/blast/blast_primers/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/blast/blast_primers/bin/update-tools.sh b/modules/blast/blast_primers/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/blast/blast_primers/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/blast/blast_primers/bin/update-version.sh b/modules/blast/blast_primers/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/blast/blast_primers/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/blast/blast_primers/blast_primers.nf b/modules/blast/blast_primers/blast_primers.nf new file mode 100644 index 000000000..d628bd617 --- /dev/null +++ b/modules/blast/blast_primers/blast_primers.nf @@ -0,0 +1,50 @@ +nextflow.enable.dsl = 2 + +process BLAST_PRIMERS { + /* + Query primer FASTA files against annotated assembly using BLAST + */ + tag "${sample}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}/blast", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "primers/*.{json,json.gz}" + + input: + tuple val(sample), path(blastdb) + path(query) + + output: + path("primers/*.{json,json.gz}") + file "${task.process}/*" optional true + + when: + BLAST_PRIMER_FASTAS.isEmpty() == false + + shell: + template "blast_primers.sh" + + stub: + """ + mkdir ${task.process} + mkdir primers + touch ${task.process}/${sample} + touch primers/${sample}.json + touch primers/${sample}.json.gz + """ +} + +//############### +//Module testing +//############### + +workflow test { + TEST_PARAMS_CH = Channel.of([ + params.sample, + path(params.blastdb), + ]) + TEST_PARAMS_CH2 = Channel.of( + path(params.query) + ) + + blast_primers(TEST_PARAMS_CH,TEST_PARAMS_CH2) +} diff --git a/modules/blast/blast_primers/nextflow.config b/modules/blast/blast_primers/nextflow.config new file mode 100644 index 000000000..ca26c7e8e --- /dev/null +++ b/modules/blast/blast_primers/nextflow.config @@ -0,0 +1,47 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' 
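+ // Minimal illustrative sketch (channel names and the include path are
+ // assumptions, not part of this module) of how the BLAST_PRIMERS process
+ // defined in blast_primers.nf above could be wired into a parent DSL2
+ // workflow; note its `when:` guard expects BLAST_PRIMER_FASTAS to be
+ // defined, as in the test profile further down:
+ //
+ //   include { BLAST_PRIMERS } from './modules/blast/blast_primers/blast_primers'
+ //
+ //   workflow {
+ //       sample_blastdb_ch = Channel.of([params.sample, file(params.blastdb)])
+ //       primer_ch = Channel.fromPath(params.query)
+ //       BLAST_PRIMERS(sample_blastdb_ch, primer_ch)
+ //   }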
+ mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + conda { + process { + withName: blast_primers { + conda = "${baseDir}/../../../conda/envs/annotate_genome-1.7.x"} + } + } + + docker { + process { + withName: blast_primers { + container = "ghcr.io/bactopia/annotate_genome:1.6.0"} + + } + } + + test { + process { + echo = true + withName: blast_primers { + cpus = 2 + queue = 'long' + } + } + env { + BLAST_PRIMER_FASTAS = ["primer"] + VERSION = "1.6.0" + outdir = "test_output" + sample = "SRR2838702" + final_sample_type = "paired-end" + single_end = false + run_type = "fastqs" + } + + } +} diff --git a/modules/blast/blast_primers/templates/blast_primers.sh b/modules/blast/blast_primers/templates/blast_primers.sh new file mode 100644 index 000000000..063537b8f --- /dev/null +++ b/modules/blast/blast_primers/templates/blast_primers.sh @@ -0,0 +1,46 @@ +#!/bin/bash +set -e +set -u + +LOG_DIR="!{task.process}" +OUTDIR=primers +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions +echo "# blastn Version" >> ${LOG_DIR}/!{task.process}.versions +blastn -version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + +echo "# Parallel Version" >> ${LOG_DIR}/!{task.process}.versions +parallel --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 +for fasta in *.fasta; do + type=`readlink -f ${fasta}` + name="${fasta%.*}" + mkdir -p ${OUTDIR} temp_json + cat ${fasta} | sed -e 's/<[^>]*>//g' | + parallel --gnu --plain -j !{task.cpus} --recstart '>' -N 1 --pipe \ + blastn -db !{sample} \ + -outfmt 15 \ + -task blastn \ + -dust no \ + -word_size 7 \ + -perc_identity !{params.perc_identity} \ + -evalue 1 \ + -query - \ + -out temp_json/${name}_{#}.json + + merge-blast-json.py temp_json > ${OUTDIR}/${name}.json + rm -rf temp_json + + if [[ !{params.compress} == "true" ]]; then + pigz -n --best -p !{task.cpus} ${OUTDIR}/${name}.json + fi +done + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/blast/blast_primers/test_params.yaml b/modules/blast/blast_primers/test_params.yaml new file mode 100644 index 000000000..58fe415bc --- /dev/null +++ b/modules/blast/blast_primers/test_params.yaml @@ -0,0 +1,42 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +single_end: + false + + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +blastdb: + "test_data/SRR2838702*" + +query: + "test_data/dumb-primer.fasta" + +overwrite: + false + +compress: + false + +skip_logs: + false + +perc_identity: + 50 + +qcov_hsp_perc: + 50 + +max_target_seqs: + 2000 diff --git a/modules/blast/blast_proteins/README.md b/modules/blast/blast_proteins/README.md new file mode 100644 index 000000000..da0cfc1ed --- /dev/null +++ b/modules/blast/blast_proteins/README.md @@ -0,0 +1,17 @@ +# blast_proteins process testing: + +This process queries protein FASTA files against annotated assembly using BLAST + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run blast_proteins.nf -params-file 
test_params.yaml -profile test,docker -entry test + + +if you've used `bactopia conda activate` you can also trade `docker` by conda to test with conda. diff --git a/modules/blast/blast_proteins/bin/build-containers.sh b/modules/blast/blast_proteins/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/blast/blast_proteins/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/blast/blast_proteins/bin/check-assembly-accession.py b/modules/blast/blast_proteins/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/blast/blast_proteins/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/blast/blast_proteins/bin/check-fastqs.py b/modules/blast/blast_proteins/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/blast/blast_proteins/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. 
Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/blast/blast_proteins/bin/check-staging.py b/modules/blast/blast_proteins/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/blast/blast_proteins/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
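+ # The staging checks below only test os.path.exists() on each provided input
+ # and exit with a distinct code per input (80-82 for the FASTQs, 90-92 for the
+ # extra file, genome size, and assembly) so a silent AWS stage-in failure can
+ # be traced back to the exact file that never arrived.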
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/blast/blast_proteins/bin/cleanup-coverage.py b/modules/blast/blast_proteins/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/blast/blast_proteins/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/modules/blast/blast_proteins/bin/create-tool.sh b/modules/blast/blast_proteins/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/blast/blast_proteins/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# build-containers +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguement" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/blast/blast_proteins/bin/gh-actions/free-disk-space.sh b/modules/blast/blast_proteins/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/blast/blast_proteins/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. 
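+# Note: the package names removed below are tied to the stock CI runner image
+# and may need adjusting as that image changes; the df -h calls report how much
+# space was reclaimed at each step.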
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/blast/blast_proteins/bin/gh-actions/setup-bactopia-env.sh b/modules/blast/blast_proteins/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/blast/blast_proteins/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/blast/blast_proteins/bin/gh-actions/setup-docker-builds.py b/modules/blast/blast_proteins/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/blast/blast_proteins/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. 
+ --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker 
build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + 
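+ # Retag path (see docker_retag above): pull the previous release's image, tag
+ # it with the current version, and push it to Docker Hub and, when --github or
+ # --quay is set, to ghcr.io and quay.io as well, skipping a full rebuild
+ # because the conda.md5 label matched.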
logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/blast/blast_proteins/bin/helpers/bactopia-build.py b/modules/blast/blast_proteins/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/blast/blast_proteins/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + 
stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = 
f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/blast/blast_proteins/bin/helpers/bactopia-citations.py b/modules/blast/blast_proteins/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/blast/blast_proteins/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/blast/blast_proteins/bin/helpers/bactopia-datasets.py b/modules/blast/blast_proteins/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/blast/blast_proteins/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
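+ # The dry-run below is only used to enumerate candidate assemblies; the
+ # parsing that follows assumes ncbi-genome-download prints a header line
+ # starting with "Considering" followed by one tab-separated
+ # "<accession><TAB><organism name>" pair per line, for example (the
+ # accession/name shown here are illustrative only):
+ #     GCF_000013425.1    Staphylococcus aureus subsp. aureus NCTC 8325
+ # Accessions whose organism name starts with the requested species are also
+ # tracked in species_accession, so a later random --limit subsample can be
+ # topped up with at least one genome from the species itself.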
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
+ ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' + ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + 
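+ # --force removes the whole minmer directory, so every precomputed sketch
+ # (Sourmash GenBank signatures and the Mash RefSeq sketch) is re-downloaded;
+ # the per-file checks in the loop below otherwise only skip files that
+ # already exist.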
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/blast/blast_proteins/bin/helpers/bactopia-prepare.py b/modules/blast/blast_proteins/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/blast/blast_proteins/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. 
+ Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + 
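+ # With --long_reads, single-end FASTQs are treated as long reads only when
+ # paired-end short reads are also present (the sample is later emitted with
+ # a 'hybrid' runtype); a sample with only single-end FASTQs falls back to
+ # short-read single-end handling via is_single_end.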
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/blast/blast_proteins/bin/helpers/bactopia-pull.py b/modules/blast/blast_proteins/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/blast/blast_proteins/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/blast/blast_proteins/bin/helpers/bactopia-search.py b/modules/blast/blast_proteins/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/blast/blast_proteins/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + filtered = 
{'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/blast/blast_proteins/bin/helpers/bactopia-summary.py b/modules/blast/blast_proteins/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/blast/blast_proteins/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/blast/blast_proteins/bin/helpers/bactopia-tools.py b/modules/blast/blast_proteins/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/blast/blast_proteins/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/blast/blast_proteins/bin/helpers/bactopia-versions.py b/modules/blast/blast_proteins/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/blast/blast_proteins/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
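validate_args() in bactopia-tools.py above decides whether to rebuild a Conda environment by comparing the md5 shipped with the tool (environment-linux.md5 or environment-osx.md5) against the one recorded in env-built.txt when the environment was last created. A minimal standalone sketch of that decision; the paths in the trailing comment are placeholders, not the patch's real layout:

import os

def needs_rebuild(expected_md5_file, envbuilt_file, force_rebuild=False):
    # Rebuild when forced, never built, or the recorded md5 no longer matches
    if force_rebuild or not os.path.exists(envbuilt_file):
        return True
    with open(expected_md5_file) as fh:
        expected = fh.readline().rstrip()
    with open(envbuilt_file) as fh:
        current = fh.readline().rstrip()
    return expected != current

# e.g. needs_rebuild('environment-linux.md5', 'conda/envs/tools-roary-1.6.x/env-built.txt')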
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/blast/blast_proteins/bin/mask-consensus.py b/modules/blast/blast_proteins/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/blast/blast_proteins/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
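read_yaml() in bactopia-versions.py above does not need a YAML parser: it only splits the dependency lines that conda env export writes, taking the first two '='-separated fields as program and version. A short illustration with made-up dependency lines (the tool names and versions are invented):

def parse_conda_versions(lines):
    # Mirrors read_yaml(): collect 'program=version' pairs from exported YAML lines
    versions = {}
    for line in lines:
        line = line.strip()
        if '=' in line:
            program, version = line.replace('- ', '').split('=')[0:2]
            versions[program] = version
    return versions

example = [
    'dependencies:',
    '  - blast=2.10.1',
    '  - prokka=1.14.6',
]
print(parse_conda_versions(example))  # {'blast': '2.10.1', 'prokka': '1.14.6'}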
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/blast/blast_proteins/bin/merge-blast-json.py b/modules/blast/blast_proteins/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/blast/blast_proteins/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/blast/blast_proteins/bin/mlst-blast.py b/modules/blast/blast_proteins/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/blast/blast_proteins/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
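mask_sequence() in mask-consensus.py above applies one rule per position: at or above --mincov the reference base is kept (lower-cased when the VCF recorded a substitution there), positions with some but insufficient coverage become 'N', and positions with zero coverage become 'n'. A toy, self-contained illustration of that rule; the sequence, depths, and substitution positions are invented:

def mask_base(base, depth, is_substitution, mincov=10):
    if depth >= mincov:
        return base.lower() if is_substitution else base
    return 'N' if depth else 'n'

reference = 'ACGT'
depths = [25, 12, 3, 0]
subs = {1: True}  # 0-based positions with a called substitution

masked = ''.join(mask_base(b, d, subs.get(i, False))
                 for i, (b, d) in enumerate(zip(reference, depths)))
print(masked)  # AcNn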
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/blast/blast_proteins/bin/select-references.py b/modules/blast/blast_proteins/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/blast/blast_proteins/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
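The ST assignment above reduces to a dictionary lookup: profile.txt maps each sorted, semicolon-joined set of locus.allele names to an ST, and a sample is only assigned an ST when every locus produced a perfect BLAST match. A minimal sketch of that lookup; the three-locus scheme and allele numbers are invented:

def call_st(profile, perfect_matches, total_loci):
    # All loci perfect -> look the pattern up; anything less stays 'ND'
    if len(perfect_matches) != total_loci:
        return 'ND'
    pattern = ';'.join(sorted(perfect_matches))
    return profile.get(pattern, 'Novel')

profile = {'abcZ.1;adk.3;recA.2': '42'}
print(call_st(profile, ['adk.3', 'recA.2', 'abcZ.1'], 3))  # 42
print(call_st(profile, ['adk.3', 'recA.9', 'abcZ.1'], 3))  # Novel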
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/blast/blast_proteins/bin/split-coverages.py b/modules/blast/blast_proteins/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/blast/blast_proteins/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/blast/blast_proteins/bin/update-conda.sh b/modules/blast/blast_proteins/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/blast/blast_proteins/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
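split-coverages.py above simply re-groups genomeCoverageBed's entry/position/depth rows by the original FASTA each entry belongs to (via the two-column mapping file) before writing one coverage file per FASTA with ##total and per-entry ##contig header lines. A compact sketch of the grouping step with made-up rows; the contig and file names are placeholders:

from collections import defaultdict

def group_coverages(mapping, rows):
    # mapping: {entry: fasta}; rows: (entry, position, depth) tuples
    grouped = defaultdict(lambda: defaultdict(list))
    for entry, _position, depth in rows:
        grouped[mapping[entry]][entry].append(depth)
    return grouped

mapping = {'contig1': 'ref_a.fasta', 'contig2': 'ref_a.fasta'}
rows = [('contig1', '1', '12'), ('contig1', '2', '15'), ('contig2', '1', '7')]
for fasta, entries in group_coverages(mapping, rows).items():
    print(fasta, dict(entries))
# ref_a.fasta {'contig1': ['12', '15'], 'contig2': ['7']}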
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/blast/blast_proteins/bin/update-docker.sh b/modules/blast/blast_proteins/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/blast/blast_proteins/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
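update_environment() above creates a throw-away environment, exports it with conda env export --no-builds, and stores an md5 of the resulting YAML so later runs (and the Dockerfile labels) can detect when it changed. A hedged Python sketch of that export-and-fingerprint step; it assumes conda is on PATH, and the environment and file names in the trailing comment are placeholders:

import hashlib
import subprocess

def export_and_fingerprint(env_name, yaml_path):
    # Export the resolved environment without build strings, as the script above does
    with open(yaml_path, 'w') as fh:
        subprocess.run(['conda', 'env', 'export', '--no-builds', '-n', env_name],
                       stdout=fh, check=True)
    # Equivalent of `md5sum file | cut -d " " -f 1`
    with open(yaml_path, 'rb') as fh:
        digest = hashlib.md5(fh.read()).hexdigest()
    with open(yaml_path.replace('.yml', '.md5'), 'w') as fh:
        fh.write(f'{digest}\n')
    return digest

# e.g. export_and_fingerprint('bactopia-minmers', 'minmers.yml')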
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/blast/blast_proteins/bin/update-tools.sh b/modules/blast/blast_proteins/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/blast/blast_proteins/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/blast/blast_proteins/bin/update-version.sh b/modules/blast/blast_proteins/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/blast/blast_proteins/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/blast/blast_proteins/blast_proteins.nf b/modules/blast/blast_proteins/blast_proteins.nf new file mode 100644 index 000000000..d4d709d44 --- /dev/null +++ b/modules/blast/blast_proteins/blast_proteins.nf @@ -0,0 +1,51 @@ +nextflow.enable.dsl = 2 + +process BLAST_PROTEINS { + /* + Query protein FASTA files against annotated assembly using BLAST + */ + tag "${sample}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}/blast", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "proteins/*.{json,json.gz}" + + input: + tuple val(sample), path(blastdb) + path(query) + + output: + path("proteins/*.{json,json.gz}") + file "${task.process}/*" optional true + + when: + BLAST_PROTEIN_FASTAS.isEmpty() == false + + shell: + + template "blast_proteins.sh" + + stub: + """ + mkdir ${task.process} + mkdir proteins + touch ${task.process}/${sample} + touch proteins/${sample}.json + touch proteins/${sample}.json.gz + """ +} + +//############### +//Module testing +//############### + +workflow test { + TEST_PARAMS_CH = Channel.of([ + params.sample, + path(params.blastdb), + ]) + TEST_PARAMS_CH2 = Channel.of( + path(params.query) + ) + + blast_proteins(TEST_PARAMS_CH,TEST_PARAMS_CH2) +} diff --git a/modules/blast/blast_proteins/nextflow.config b/modules/blast/blast_proteins/nextflow.config new file mode 100644 index 000000000..7b705cfed --- /dev/null +++ b/modules/blast/blast_proteins/nextflow.config @@ -0,0 +1,46 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' 
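update-version.sh above routes each file to a different substitution depending on its type: Python files carry VERSION = "x.y.z", shell scripts VERSION=x.y.z, and conda version files version: x.y.z. A small Python illustration of those three patterns on invented one-line inputs, with re.sub standing in for the sed calls (none of this is part of the patch):

import re

OLD, NEW = '1.6.0', '1.6.1'

examples = {
    'python': ('VERSION = "1.6.0"', rf'VERSION = "{re.escape(OLD)}"', f'VERSION = "{NEW}"'),
    'shell':  ('VERSION=1.6.0',     rf'VERSION={re.escape(OLD)}',     f'VERSION={NEW}'),
    'conda':  ('version: 1.6.0',    rf'version: {re.escape(OLD)}$',   f'version: {NEW}'),
}

for kind, (line, pattern, replacement) in examples.items():
    print(kind, re.sub(pattern, replacement, line))
# python VERSION = "1.6.1"
# shell VERSION=1.6.1
# conda version: 1.6.1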
+ mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + conda { + process { + withName: blast_proteins { + conda = "${baseDir}/../../../conda/envs/annotate_genome-1.7.x"} + } + } + + docker { + process { + withName: blast_proteins { + container = "ghcr.io/bactopia/annotate_genome:1.6.0"} + + } + } + + test { + process { + withName: blast_proteins { + cpus = 2 + queue = 'long' + } + } + env { + BLAST_PROTEIN_FASTAS = ["protein"] + VERSION = "1.6.0" + outdir = "test_output" + sample = "2838702" + final_sample_type = "paired-end" + single_end = false + run_type = "fastqs" + } + + } +} diff --git a/modules/blast/blast_proteins/templates/blast_proteins.sh b/modules/blast/blast_proteins/templates/blast_proteins.sh new file mode 100644 index 000000000..d1f7cd533 --- /dev/null +++ b/modules/blast/blast_proteins/templates/blast_proteins.sh @@ -0,0 +1,44 @@ +#!/bin/bash +set -e +set -u + +LOG_DIR="!{task.process}" +OUTDIR=proteins +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions +echo "# tblastn Version" >> ${LOG_DIR}/!{task.process}.versions +tblastn -version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + +echo "# Parallel Version" >> ${LOG_DIR}/!{task.process}.versions +parallel --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + +for fasta in *.fasta; do + type=`readlink -f ${fasta}` + name="${fasta%.*}" + mkdir -p ${OUTDIR} temp_json + cat ${fasta} | sed -e 's/<[^>]*>//g' | + parallel --gnu --plain -j !{task.cpus} --recstart '>' -N 1 --pipe \ + tblastn -db !{sample} \ + -outfmt 15 \ + -evalue 0.0001 \ + -qcov_hsp_perc !{params.qcov_hsp_perc} \ + -query - \ + -out temp_json/${name}_{#}.json + + merge-blast-json.py temp_json > ${OUTDIR}/${name}.json + rm -rf temp_json + + if [[ !{params.compress} == "true" ]]; then + pigz -n --best -p !{task.cpus} ${OUTDIR}/${name}.json + fi +done + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/blast/blast_proteins/test_params.yaml b/modules/blast/blast_proteins/test_params.yaml new file mode 100644 index 000000000..b014f7500 --- /dev/null +++ b/modules/blast/blast_proteins/test_params.yaml @@ -0,0 +1,41 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +single_end: + false + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +blastdb: + "test_data/SRR2838702*" + +query: + "test_data/dumb-protein.fasta" + +overwrite: + false + +compress: + false + +skip_logs: + false + +perc_identity: + 50 + +qcov_hsp_perc: + 50 + +max_target_seqs: + 2000 diff --git a/modules/blast/make_blastdb/README.md b/modules/blast/make_blastdb/README.md new file mode 100644 index 000000000..325bb359b --- /dev/null +++ b/modules/blast/make_blastdb/README.md @@ -0,0 +1,17 @@ +# make_blastdb process testing: + +This process create a BLAST database of the assembly using BLAST + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run make_blastdb.nf -params-file test_params.yaml -profile test,docker -entry test + + +if you've used 
`bactopia conda activate` you can also trade `docker` by conda to test with conda. diff --git a/modules/blast/make_blastdb/bin/build-containers.sh b/modules/blast/make_blastdb/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/blast/make_blastdb/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/blast/make_blastdb/bin/check-assembly-accession.py b/modules/blast/make_blastdb/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/blast/make_blastdb/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/blast/make_blastdb/bin/check-fastqs.py b/modules/blast/make_blastdb/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/blast/make_blastdb/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. 
Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/blast/make_blastdb/bin/check-staging.py b/modules/blast/make_blastdb/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/blast/make_blastdb/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/blast/make_blastdb/bin/cleanup-coverage.py b/modules/blast/make_blastdb/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/blast/make_blastdb/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/modules/blast/make_blastdb/bin/create-tool.sh b/modules/blast/make_blastdb/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/blast/make_blastdb/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# build-containers +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguement" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/blast/make_blastdb/bin/gh-actions/free-disk-space.sh b/modules/blast/make_blastdb/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/blast/make_blastdb/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. 
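+# Note: the removals below assume the stock GitHub-hosted Ubuntu runner image,
+# where large unused toolchains (ghc, dotnet, llvm, php, azure-cli, etc.) are
+# preinstalled; on a different CI image some of these packages may be absent
+# and the corresponding `apt-get remove` calls may need adjusting. The `df -h`
+# calls before and after the removals make it easy to confirm how much space
+# was actually reclaimed.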
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/blast/make_blastdb/bin/gh-actions/setup-bactopia-env.sh b/modules/blast/make_blastdb/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/blast/make_blastdb/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/blast/make_blastdb/bin/gh-actions/setup-docker-builds.py b/modules/blast/make_blastdb/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/blast/make_blastdb/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. 
+ --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker 
build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + 
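+            # check_md5sum() compared the local conda/linux/<process>.md5 file against the
+            # `conda.md5` label that skopeo read from the previously published image, so an
+            # unchanged Conda environment means the previous image can simply be retagged
+            # rather than rebuilt from its Dockerfile.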
logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/blast/make_blastdb/bin/helpers/bactopia-build.py b/modules/blast/make_blastdb/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/blast/make_blastdb/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + 
stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = 
f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/blast/make_blastdb/bin/helpers/bactopia-citations.py b/modules/blast/make_blastdb/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/blast/make_blastdb/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/blast/make_blastdb/bin/helpers/bactopia-datasets.py b/modules/blast/make_blastdb/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/blast/make_blastdb/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
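+                # The --dry-run call below only lists candidate assemblies (one
+                # "accession<TAB>organism" line each); those accessions are filtered and
+                # optionally subsampled before the actual GenBank downloads happen in the
+                # later ncbi-genome-download call that reads accessions.txt.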
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
+ ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' + ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + 
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/blast/make_blastdb/bin/helpers/bactopia-prepare.py b/modules/blast/make_blastdb/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/blast/make_blastdb/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. 
+ Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Use the assembly file name (minus its extension) as the sample name + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more than one assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}), please check.') + elif pe_count > 2: + # Multiple paired-end read sets for the same sample + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + 
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/blast/make_blastdb/bin/helpers/bactopia-pull.py b/modules/blast/make_blastdb/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/blast/make_blastdb/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/blast/make_blastdb/bin/helpers/bactopia-search.py b/modules/blast/make_blastdb/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/blast/make_blastdb/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + filtered = 
{'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/blast/make_blastdb/bin/helpers/bactopia-summary.py b/modules/blast/make_blastdb/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/blast/make_blastdb/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/blast/make_blastdb/bin/helpers/bactopia-tools.py b/modules/blast/make_blastdb/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/blast/make_blastdb/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/blast/make_blastdb/bin/helpers/bactopia-versions.py b/modules/blast/make_blastdb/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/blast/make_blastdb/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/blast/make_blastdb/bin/mask-consensus.py b/modules/blast/make_blastdb/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/blast/make_blastdb/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
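`bactopia-versions.py` builds its version table by scanning exported Conda environment files with `read_yaml()`, keeping the package name and version from each dependency line. A self-contained illustration of that parsing follows; the yaml fragment is invented for the example.

```python
#!/usr/bin/env python3
"""Illustration of the line-based parsing done by read_yaml() in
bactopia-versions.py; the environment fragment below is made up."""
import io

YAML_FRAGMENT = """\
name: bactopia-minmers
dependencies:
  - mash=2.2.2=ha61e061_1
  - sourmash=3.5.0=py_0
"""


def read_yaml_lines(handle):
    """Same logic as read_yaml(): keep lines containing '=', drop the '- '
    bullet, and take the package name and version (the build string is ignored)."""
    versions = {}
    for line in handle:
        line = line.strip()
        if '=' in line:
            program, version = line.replace('- ', '').split('=')[0:2]
            versions[program] = version
    return versions


print(read_yaml_lines(io.StringIO(YAML_FRAGMENT)))
# {'mash': '2.2.2', 'sourmash': '3.5.0'}
```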
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/blast/make_blastdb/bin/merge-blast-json.py b/modules/blast/make_blastdb/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/blast/make_blastdb/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/blast/make_blastdb/bin/mlst-blast.py b/modules/blast/make_blastdb/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/blast/make_blastdb/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
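`merge-blast-json.py` above relies on each parallel BLAST run querying a single FASTA entry, so merging is just appending every report's first `BlastOutput2` element onto the first document. A network-free sketch with two made-up, trimmed-down reports:

```python
#!/usr/bin/env python3
"""Sketch of the merge done by merge-blast-json.py. The dictionaries stand in
for the per-entry JSON reports BLAST writes; only the structure matters here."""
import json

# Each parallel BLAST run queries one FASTA entry, so every report carries
# exactly one element in its BlastOutput2 list.
run_a = {"BlastOutput2": [{"report": {"results": {"search": {"query_title": "contig_1"}}}}]}
run_b = {"BlastOutput2": [{"report": {"results": {"search": {"query_title": "contig_2"}}}}]}

merged = None
for json_data in (run_a, run_b):
    if merged:
        merged['BlastOutput2'].append(json_data['BlastOutput2'][0])
    else:
        merged = json_data

# The merged document now lists both queries under a single BlastOutput2 array.
print(json.dumps(merged, indent=4))
```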
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/blast/make_blastdb/bin/select-references.py b/modules/blast/make_blastdb/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/blast/make_blastdb/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
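The fallback in `use_http()` only needs string handling to find the right directory: the accession's numeric part is split into three-digit chunks that mirror the layout under ftp.ncbi.nlm.nih.gov/genomes/all/. A network-free sketch of that derivation:

```python
#!/usr/bin/env python3
"""Sketch of the accession-to-URL mapping used by use_http() in
select-references.py; no request is made here, only the path is built."""
import re


def accession_to_url(accession):
    # GCF_001548295.1 -> db="GCF", digits="001548295" -> "001/548/295"
    accession, _version = accession.split('.')
    db, digits = accession.split('_')
    digits_split = '/'.join(re.findall('.{1,3}', digits))
    return f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}'


print(accession_to_url('GCF_001548295.1'))
# https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295
```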
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/blast/make_blastdb/bin/split-coverages.py b/modules/blast/make_blastdb/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/blast/make_blastdb/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/blast/make_blastdb/bin/update-conda.sh b/modules/blast/make_blastdb/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/blast/make_blastdb/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/blast/make_blastdb/bin/update-docker.sh b/modules/blast/make_blastdb/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/blast/make_blastdb/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/blast/make_blastdb/bin/update-tools.sh b/modules/blast/make_blastdb/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/blast/make_blastdb/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/blast/make_blastdb/bin/update-version.sh b/modules/blast/make_blastdb/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/blast/make_blastdb/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/blast/make_blastdb/make_blastdb.nf b/modules/blast/make_blastdb/make_blastdb.nf new file mode 100644 index 000000000..012537f07 --- /dev/null +++ b/modules/blast/make_blastdb/make_blastdb.nf @@ -0,0 +1,43 @@ +nextflow.enable.dsl = 2 + +process MAKE_BLASTDB { + /* Create a BLAST database of the assembly using BLAST */ + tag "${sample}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}/blast", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "blastdb/*" + + input: + tuple val(sample), val(single_end), path(fasta) + + output: + path("blastdb/*") + tuple val(sample), path("blastdb/*"), emit: BLAST_DB, optional:true + file "${task.process}/*" optional true + + shell: + template "make_blastdb.sh" + + stub: + """ + mkdir blastdb + mkdir ${task.process} + touch blastdb/${sample} + touch ${task.process}/${sample} + """ +} + +//############### +//Module testing +//############### + +workflow test{ + + TEST_PARAMS_CH = Channel.of([ + params.sample, + params.single_end, + path(params.fasta) + ]) + + make_blastdb(TEST_PARAMS_CH) +} diff --git a/modules/blast/make_blastdb/nextflow.config b/modules/blast/make_blastdb/nextflow.config new file mode 100644 index 000000000..c6a5ed1bb --- /dev/null +++ b/modules/blast/make_blastdb/nextflow.config @@ -0,0 +1,46 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' 
+ mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + conda { + process { + withName: make_blastdb { + conda = "${baseDir}/../../../conda/envs/annotate_genome-1.7.x"} + } + } + + docker { + process { + withName: make_blastdb { + container = "ghcr.io/bactopia/annotate_genome:1.6.0"} + + } + } + + test { + process { + withName: make_blastdb { + cpus = 2 + queue = 'long' + } + + } + env { + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "test" + run_type = "fastqs" + } + + } +} diff --git a/modules/blast/make_blastdb/templates/make_blastdb.sh b/modules/blast/make_blastdb/templates/make_blastdb.sh new file mode 100644 index 000000000..98ff58e7d --- /dev/null +++ b/modules/blast/make_blastdb/templates/make_blastdb.sh @@ -0,0 +1,32 @@ +#!/bin/bash +set -e +set -u +LOG_DIR="!{task.process}" +mkdir blastdb +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions +echo "# makeblastdb Version" >> ${LOG_DIR}/!{task.process}.versions +makeblastdb -version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + +# Verify AWS files were staged +if [[ ! -L "!{fasta}" ]]; then + check-staging.py --assembly !{fasta} +fi + +if [[ !{params.compress} == "true" ]]; then + gzip -cd !{fasta} | \ + makeblastdb -dbtype "nucl" -title "Assembled contigs for !{sample}" -out blastdb/!{sample} +else + cat !{fasta} | \ + makeblastdb -dbtype "nucl" -title "Assembled contigs for !{sample}" -out blastdb/!{sample} +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/blast/make_blastdb/test_params.yaml b/modules/blast/make_blastdb/test_params.yaml new file mode 100644 index 000000000..969ce69ae --- /dev/null +++ b/modules/blast/make_blastdb/test_params.yaml @@ -0,0 +1,30 @@ + +outdir: + "test_output" + +sample: + "TEST_SAMPLE" + +single_end: + "test" + +fasta: + "test_data/assembly.fna" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + some_value + +compress: + false + +skip_logs: + false diff --git a/modules/blast/plasmid_blast/README.md b/modules/blast/plasmid_blast/README.md new file mode 100644 index 000000000..eba9d9a90 --- /dev/null +++ b/modules/blast/plasmid_blast/README.md @@ -0,0 +1,17 @@ +# plasmid_blast process testing: + +This process BLAST a set of predicted genes against the PLSDB BLAST database + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run plasmid_blast.nf -params-file test_params.yaml -profile test,docker -entry test + + +if you've used `bactopia conda activate` you can also trade `docker` by conda to test with conda. 
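As the README notes, each DSL2 module test needs local `test_data`, a `test_params.yaml`, and a `test` profile. Below is a hypothetical pre-flight check, not part of the modules, that uses keys from the make_blastdb `test_params.yaml` shown above to confirm the inputs are staged before calling `nextflow run`; it assumes PyYAML is available.

```python
#!/usr/bin/env python3
"""Hypothetical helper (not shipped with the modules): sanity-check a module's
test_params.yaml before running its test entry. Key names follow the
make_blastdb example (sample, fasta, outdir)."""
import os
import sys

import yaml  # PyYAML


def check_test_params(params_file='test_params.yaml'):
    with open(params_file) as fh:
        params = yaml.safe_load(fh)

    missing = [key for key in ('sample', 'fasta', 'outdir') if key not in params]
    if missing:
        sys.exit(f"{params_file} is missing keys: {', '.join(missing)}")

    if not os.path.exists(params['fasta']):
        sys.exit(f"missing test input: {params['fasta']} (expected under test_data/)")

    print('Inputs staged; run: nextflow run <module>.nf '
          '-params-file test_params.yaml -profile test,docker -entry test')


if __name__ == '__main__':
    check_test_params()
```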
diff --git a/modules/blast/plasmid_blast/bin/build-containers.sh b/modules/blast/plasmid_blast/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/blast/plasmid_blast/bin/check-assembly-accession.py b/modules/blast/plasmid_blast/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/blast/plasmid_blast/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/blast/plasmid_blast/bin/check-fastqs.py b/modules/blast/plasmid_blast/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/blast/plasmid_blast/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. 
Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/blast/plasmid_blast/bin/check-staging.py b/modules/blast/plasmid_blast/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/blast/plasmid_blast/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/blast/plasmid_blast/bin/cleanup-coverage.py b/modules/blast/plasmid_blast/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/modules/blast/plasmid_blast/bin/create-tool.sh b/modules/blast/plasmid_blast/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/blast/plasmid_blast/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# build-containers +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguement" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/blast/plasmid_blast/bin/gh-actions/free-disk-space.sh b/modules/blast/plasmid_blast/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. 
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/blast/plasmid_blast/bin/gh-actions/setup-bactopia-env.sh b/modules/blast/plasmid_blast/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/blast/plasmid_blast/bin/gh-actions/setup-docker-builds.py b/modules/blast/plasmid_blast/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. 
+ --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker 
build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + 
logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/blast/plasmid_blast/bin/helpers/bactopia-build.py b/modules/blast/plasmid_blast/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/blast/plasmid_blast/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + 
stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = 
f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/blast/plasmid_blast/bin/helpers/bactopia-citations.py b/modules/blast/plasmid_blast/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/blast/plasmid_blast/bin/helpers/bactopia-datasets.py b/modules/blast/plasmid_blast/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
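+                # Dry-run ncbi-genome-download first: each matching assembly is
+                # printed as a tab-separated "accession<TAB>organism name" line
+                # (after a leading "Considering ..." header), and those lines are
+                # parsed below to build the list of accessions to download.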
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
+ ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' + ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + 
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/blast/plasmid_blast/bin/helpers/bactopia-prepare.py b/modules/blast/plasmid_blast/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/blast/plasmid_blast/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. 
+ Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + 
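To make the FASTQ pairing logic above concrete, here is a small self-contained example of how a file name is reduced to a sample name and an R1/R2 designation using the default separator and patterns; the file names are made up for illustration:

import re

fastq_ext = ".fastq.gz"
fastq_seperator = "_"                    # defaults mirrored from the options above
pe1 = re.compile("[Aa]|[Rr]1")
pe2 = re.compile("[Bb]|[Rr]2")

for name in ["SRX12345_R1.fastq.gz", "SRX12345_R2.fastq.gz", "nanopore-run.fastq.gz"]:
    base = name.replace(fastq_ext, "")
    parts = base.rsplit(fastq_seperator, 1)
    if len(parts) == 1:
        print(f"{name} -> sample={parts[0]} (single-end)")
    elif pe1.match(parts[1]):
        print(f"{name} -> sample={parts[0]} (R1)")
    elif pe2.match(parts[1]):
        print(f"{name} -> sample={parts[0]} (R2)")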
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/blast/plasmid_blast/bin/helpers/bactopia-pull.py b/modules/blast/plasmid_blast/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/blast/plasmid_blast/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
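The image and pull names built above follow a fixed pattern. Here is a small sketch of that composition; the environment name, version and cache path are example values, not pinned by this patch:

def image_names(envname, version, registry="quay", cache="~/.bactopia/singularity"):
    """Compose the Singularity image path and docker:// pull name (sketch only)."""
    prefix = {"quay": "quay.io", "github": "ghcr.io"}.get(registry, "")
    docker_prefix = f"docker://{prefix}/bactopia" if prefix else "docker://bactopia"
    img = (f"{cache}/{prefix}-bactopia-{envname}-{version}.img" if prefix
           else f"{cache}/bactopia-{envname}-{version}.img")
    return img, f"{docker_prefix}/{envname}:{version}"

print(image_names("minmers", "1.6.0"))
# ('~/.bactopia/singularity/quay.io-bactopia-minmers-1.6.0.img',
#  'docker://quay.io/bactopia/minmers:1.6.0')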
a/modules/blast/plasmid_blast/bin/helpers/bactopia-search.py b/modules/blast/plasmid_blast/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + filtered = 
{'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
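A worked example of the read-length filter in parse_accessions() above; the counts are invented, but the arithmetic is the same as the script's:

fastq_bytes = "152839202;161284033"   # two FASTQ files reported by ENA -> paired-end
base_count = 600_000_000              # example totals, not real ENA values
read_count = 2_000_000
total_fastqs = len(fastq_bytes.rstrip(";").split(";"))              # 2
mean_read_length = int(base_count / (read_count * total_fastqs))    # 150 bp
print(mean_read_length >= 75)         # True, so a --min_read_length 75 filter passes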
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
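The accession handling in parse_query() above boils down to a few regular expressions. A short illustration of how different query strings are classified; the accessions reuse the examples from the epilog, and the helper itself is only a sketch:

import re

def classify(query):
    """Sketch of parse_query()'s classification of a single query string."""
    try:
        int(query)
        return "taxon id"
    except ValueError:
        if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query):
            return "study accession"
        if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query):
            return "biosample"
        if re.match(r'(E|D|S)RR[0-9]{6,}', query):
            return "run accession"
        return "scientific name"

for q in ["1280", "PRJNA480016", "SAMN01737350", "SRR578340", "staphylococcus aureus"]:
    print(q, "->", classify(q))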
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/blast/plasmid_blast/bin/helpers/bactopia-summary.py b/modules/blast/plasmid_blast/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/blast/plasmid_blast/bin/helpers/bactopia-tools.py b/modules/blast/plasmid_blast/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/blast/plasmid_blast/bin/helpers/bactopia-versions.py b/modules/blast/plasmid_blast/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
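The Conda environment directory above is pinned to the major.minor series rather than the full version, so patch releases reuse the same environment. A small sketch with an illustrative repository path:

VERSION = "1.6.0"
major, minor, patch = VERSION.split('.')
CONTAINER_VERSION = f"{major}.{minor}.x"
tool = "roary"                                   # one of the AVAILABLE_TOOLS keys
bactopia_repo = "/path/to/bactopia"              # placeholder path
condadir = f"{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}"
print(condadir)                                  # /path/to/bactopia/conda/envs/tools-roary-1.6.x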
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/blast/plasmid_blast/bin/mask-consensus.py b/modules/blast/plasmid_blast/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/blast/plasmid_blast/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
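read_yaml() above is not a YAML parser; it simply scans for "- name=version" lines in a Conda environment file. A minimal illustration (the package versions are made up):

lines = [
    "dependencies:",
    "  - blast=2.10.1",
    "  - prokka=1.14.6=pl526_0",
]
versions = {}
for line in lines:
    line = line.strip()
    if '=' in line:
        program, version = line.replace('- ', '').split('=')[0:2]
        versions[program] = version
print(versions)   # {'blast': '2.10.1', 'prokka': '1.14.6'}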
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
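A tiny worked example of the masking rules in mask_sequence() above: coverage at or above --mincov keeps the reference base (lower-cased when the VCF records a substitution), low but non-zero coverage writes "N", and zero coverage writes "n". The sequence and depths are invented:

reference = "ACGTAC"
coverage = [25, 25, 3, 0, 25, 25]     # per-base depth
subs = {"5"}                          # 1-based positions with a substitution call
mincov = 10
masked = []
for i, cov in enumerate(coverage):
    if cov >= mincov:
        masked.append(reference[i].lower() if str(i + 1) in subs else reference[i])
    elif cov:
        masked.append("N")
    else:
        masked.append("n")
print("".join(masked))                # ACNnaC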
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/blast/plasmid_blast/bin/merge-blast-json.py b/modules/blast/plasmid_blast/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/blast/plasmid_blast/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/blast/plasmid_blast/bin/mlst-blast.py b/modules/blast/plasmid_blast/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/blast/plasmid_blast/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/blast/plasmid_blast/bin/select-references.py b/modules/blast/plasmid_blast/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/blast/plasmid_blast/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
+ + Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse an accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select a random genome on matching Mash distances. ' + '(Default: Earliest accession)' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it is no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/blast/plasmid_blast/bin/split-coverages.py b/modules/blast/plasmid_blast/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/split-coverages.py @@ -0,0 +1,69 @@ +#!
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/blast/plasmid_blast/bin/update-conda.sh b/modules/blast/plasmid_blast/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
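+ # Note: besides adding GNU coreutils/sed, the macOS branch below uses 'md5 -r' instead of md5sum and skips the Dockerfile conda.md5 label update done on Linux.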
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/blast/plasmid_blast/bin/update-docker.sh b/modules/blast/plasmid_blast/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
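+ # After the build, the image is pushed to Docker Hub, retagged for any extra registries listed in REPOSITORY, and the local image cache is pruned when PRUNE=1.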
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/blast/plasmid_blast/bin/update-tools.sh b/modules/blast/plasmid_blast/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/blast/plasmid_blast/bin/update-version.sh b/modules/blast/plasmid_blast/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/blast/plasmid_blast/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/blast/plasmid_blast/nextflow.config b/modules/blast/plasmid_blast/nextflow.config new file mode 100644 index 000000000..98e1824ec --- /dev/null +++ b/modules/blast/plasmid_blast/nextflow.config @@ -0,0 +1,47 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' + mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + conda { + process { + withName: plasmid_blast { + conda = "${baseDir}/../../../conda/envs/annotate_genome-1.6.x"} + } + } + + docker { + process { + withName: plasmid_blast { + container = "ghcr.io/bactopia/annotate_genome:1.6.0"} + + } + } + + test { + process { + withName: plasmid_blast { + cpus = 2 + queue = 'long' + } + + } + env { + PLASMID_BLASTDB = ["plasmid"] + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "test" + run_type = "fastqs" + } + + } +} diff --git a/modules/blast/plasmid_blast/plasmid_blast.nf b/modules/blast/plasmid_blast/plasmid_blast.nf new file mode 100644 index 000000000..364c18f4c --- /dev/null +++ b/modules/blast/plasmid_blast/plasmid_blast.nf @@ -0,0 +1,51 @@ +nextflow.enable.dsl = 2 + +process PLASMID_BLAST { + /* + BLAST a set of predicted genes against the PLSDB BLAST database. 
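+ + Input is the sample name paired with its predicted-genes FASTA plus the staged PLSDB BLAST database files; blastn (outfmt 15) runs on FASTA chunks via GNU parallel and the per-chunk JSON is merged into ${sample}-plsdb.json (gzipped when params.compress is set).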
+ */ + tag "${sample}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}/blast", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "*.{json,json.gz}" + + input: + tuple val(sample), path(genes) + path(blastdb_files) + + output: + path("${sample}-plsdb.{json,json.gz}") + path("${task.process}/*") optional true + + when: + PLASMID_BLASTDB.isEmpty() == false + + shell: + gunzip_genes = genes.getName().replace('.gz', '') + blastdb = blastdb_files[0].getBaseName() + template "plasmid_blast.sh" + + stub: + """ + mkdir ${task.process} + touch ${task.process}/${sample} + touch ${sample}-plsdb.json + touch ${sample}-plsdb.json.gz + """ +} + +//############### +//Module testing +//############### + +workflow test { + TEST_PARAMS_CH = Channel.of([ + params.sample, + path(params.genes), + ]) + TEST_PARAMS_CH2 = Channel.of( + path(params.blastdb_files) + ) + + PLASMID_BLAST(TEST_PARAMS_CH, TEST_PARAMS_CH2) +} diff --git a/modules/blast/plasmid_blast/templates/plasmid_blast.sh b/modules/blast/plasmid_blast/templates/plasmid_blast.sh new file mode 100644 index 000000000..3a02e6af3 --- /dev/null +++ b/modules/blast/plasmid_blast/templates/plasmid_blast.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -e +set -u + +LOG_DIR="!{task.process}" +mkdir -p ${LOG_DIR} + +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions + +echo "# blastn Version" >> ${LOG_DIR}/!{task.process}.versions +blastn -version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + +echo "# Parallel Version" >> ${LOG_DIR}/!{task.process}.versions +parallel --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + +if [[ !{params.compress} == "true" ]]; then + gunzip -f !{genes} +fi + +file_size=`cat !{gunzip_genes} | wc -c` +block_size=$(( file_size / !{task.cpus} / 2 )) +mkdir -p temp_json +cat !{gunzip_genes} | sed -e 's/<[^>]*>//g' | \ +parallel --gnu --plain -j !{task.cpus} --block ${block_size} --recstart '>' --pipe \ +blastn -db !{blastdb} \ + -outfmt 15 \ + -task blastn \ + -evalue 1 \ + -max_target_seqs !{params.max_target_seqs} \ + -perc_identity !{params.perc_identity} \ + -qcov_hsp_perc !{params.qcov_hsp_perc} \ + -query - \ + -out temp_json/!{sample}_{#}.json + +merge-blast-json.py temp_json > !{sample}-plsdb.json +rm -rf temp_json + + +if [[ !{params.compress} == "true" ]]; then + pigz --best -n -p !{task.cpus} !{sample}-plsdb.json +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/blast/plasmid_blast/test_params.yaml b/modules/blast/plasmid_blast/test_params.yaml new file mode 100644 index 000000000..5d8fca572 --- /dev/null +++ b/modules/blast/plasmid_blast/test_params.yaml @@ -0,0 +1,47 @@ +genome_size: + "test_data/genome-size.txt" + +outdir: + "test_output" + +sample: + "TEST_SAMPLE" + +single_end: + "test" + +fasta: + "test_data/test_database.fasta" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +genes: + "test_data/genes.fasta" + +blastdb_files: + "test_data/blastdb_files.{nhr,nin,nog,nsd,nsi,nsq}" + +overwrite: + false + +compress: + false + +skip_logs: + false + +perc_identity: + 50 + +qcov_hsp_perc: + 50 + +max_target_seqs: + 2000
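+ +# A sketch of how to exercise this module's test workflow, following the same convention +# documented in the module READMEs elsewhere in this PR (assumes the test_data files exist locally): +# nextflow run plasmid_blast.nf -profile test,docker -params-file test_params.yaml -entry test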
diff --git a/modules/bwa/mapping_query/README.md b/modules/bwa/mapping_query/README.md new file mode 100644 index 000000000..0e47ea212 --- /dev/null +++ b/modules/bwa/mapping_query/README.md @@ -0,0 +1,17 @@ +# mapping_query process testing: + +This process maps FASTQ reads against a given set of FASTA files using BWA. + +## About testing this process: + +Using DSL2, each module can be tested separately with a test workflow inside the module's `.nf` file. Testing requires 3 items: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run mapping_query.nf -profile test,docker -params-file test_params.yaml -entry test + + +If you've used `bactopia conda activate`, you can also swap `docker` for `conda` to test with Conda. diff --git a/modules/bwa/mapping_query/bin/build-containers.sh b/modules/bwa/mapping_query/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/bwa/mapping_query/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguments" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" +
docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/bwa/mapping_query/bin/check-assembly-accession.py b/modules/bwa/mapping_query/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/bwa/mapping_query/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/bwa/mapping_query/bin/check-fastqs.py b/modules/bwa/mapping_query/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/bwa/mapping_query/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. 
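+ +For check-fastqs, the per-FASTQ stats are read from JSON (e.g. fastq-scan output); only the +qc_stats.read_total and qc_stats.total_bp fields are used, roughly (values purely illustrative): +  {"qc_stats": {"read_total": 1000000, "total_bp": 150000000}}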
+""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/bwa/mapping_query/bin/check-staging.py b/modules/bwa/mapping_query/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/bwa/mapping_query/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
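+ # The exit codes used below act as a simple signal of which staged input is missing: 80-82 for FASTQs, 90 for --extra, 91 for --genome_size, 92 for --assembly.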
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='R2 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/bwa/mapping_query/bin/cleanup-coverage.py b/modules/bwa/mapping_query/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/bwa/mapping_query/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeCoverageBed + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig=<ID=name,length=value> + contig = re.search(r'contig=<ID=(.*?),length=([0-9]+)>', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Reduce redundancy in per-base coverage.'
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Output from genomeCoverageBed') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=<ID={accession},length={vals["length"]}>') + for cov in vals['positions']: + print(cov) diff --git a/modules/bwa/mapping_query/bin/create-tool.sh b/modules/bwa/mapping_query/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/bwa/mapping_query/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# create-tool +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguments" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/bwa/mapping_query/bin/gh-actions/free-disk-space.sh b/modules/bwa/mapping_query/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/bwa/mapping_query/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures.
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/bwa/mapping_query/bin/gh-actions/setup-bactopia-env.sh b/modules/bwa/mapping_query/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/bwa/mapping_query/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/bwa/mapping_query/bin/gh-actions/setup-docker-builds.py b/modules/bwa/mapping_query/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/bwa/mapping_query/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. 
+ --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker 
build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + 
logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/bwa/mapping_query/bin/helpers/bactopia-build.py b/modules/bwa/mapping_query/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/bwa/mapping_query/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + 
stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = 
f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/bwa/mapping_query/bin/helpers/bactopia-citations.py b/modules/bwa/mapping_query/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/bwa/mapping_query/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line = line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/bwa/mapping_query/bin/helpers/bactopia-datasets.py b/modules/bwa/mapping_query/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/bwa/mapping_query/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--assembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list.
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
+ ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' + ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + 
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/bwa/mapping_query/bin/helpers/bactopia-prepare.py b/modules/bwa/mapping_query/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/bwa/mapping_query/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. 
+ Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + 
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/bwa/mapping_query/bin/helpers/bactopia-pull.py b/modules/bwa/mapping_query/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/bwa/mapping_query/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/bwa/mapping_query/bin/helpers/bactopia-search.py b/modules/bwa/mapping_query/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/bwa/mapping_query/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + filtered = 
{'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
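+        # Any WARNING_MESSAGE set above is carried into the per-query summary written below.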
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/bwa/mapping_query/bin/helpers/bactopia-summary.py b/modules/bwa/mapping_query/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/bwa/mapping_query/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/bwa/mapping_query/bin/helpers/bactopia-tools.py b/modules/bwa/mapping_query/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/bwa/mapping_query/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
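+  --force_rebuild  Force overwrite of existing Conda environments.
+  --skip_conda     Skip all things conda related.
+  --verbose        Print debug related text.
+  --silent         Only critical errors will be printed.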
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/bwa/mapping_query/bin/helpers/bactopia-versions.py b/modules/bwa/mapping_query/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/bwa/mapping_query/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/bwa/mapping_query/bin/mask-consensus.py b/modules/bwa/mapping_query/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/bwa/mapping_query/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/bwa/mapping_query/bin/merge-blast-json.py b/modules/bwa/mapping_query/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/bwa/mapping_query/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/bwa/mapping_query/bin/mlst-blast.py b/modules/bwa/mapping_query/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/bwa/mapping_query/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
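+  --version     show program's version number and exit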
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/bwa/mapping_query/bin/select-references.py b/modules/bwa/mapping_query/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/bwa/mapping_query/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
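+    Used as a fallback when the NCBI E-utilities lookup fails (e.g. a Bad Gateway response).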
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/bwa/mapping_query/bin/split-coverages.py b/modules/bwa/mapping_query/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/bwa/mapping_query/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/bwa/mapping_query/bin/update-conda.sh b/modules/bwa/mapping_query/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/bwa/mapping_query/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
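+        # conda-forge coreutils and sed provide GNU equivalents of the BSD tools bundled with macOS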
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/bwa/mapping_query/bin/update-docker.sh b/modules/bwa/mapping_query/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/bwa/mapping_query/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
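+    # ${latest} is optional (0 disables it); when given, the image is also tagged and pushed under it below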
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/bwa/mapping_query/bin/update-tools.sh b/modules/bwa/mapping_query/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/bwa/mapping_query/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
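+        # The export below drops the machine-specific 'prefix:' line and injects a 'version: ${4}' line ahead of 'channels:'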
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/bwa/mapping_query/bin/update-version.sh b/modules/bwa/mapping_query/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/bwa/mapping_query/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/bwa/mapping_query/mapping_query.nf b/modules/bwa/mapping_query/mapping_query.nf new file mode 100644 index 000000000..ac5ae50b7 --- /dev/null +++ b/modules/bwa/mapping_query/mapping_query.nf @@ -0,0 +1,53 @@ +nextflow.enable.dsl = 2 + +process MAPPING_QUERY { + /* + Map FASTQ reads against a given set of FASTA files using BWA. + */ + tag "${sample}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "mapping/*" + + input: + tuple val(sample), val(single_end), path(fq) + each path(query) + + output: + file "mapping/*" + file "${task.process}/*" optional true + + when: + MAPPING_FASTAS.isEmpty() == false + + shell: + bwa_mem_opts = params.bwa_mem_opts ? params.bwa_mem_opts : "" + bwa_aln_opts = params.bwa_aln_opts ? params.bwa_aln_opts : "" + bwa_samse_opts = params.bwa_samse_opts ? params.bwa_samse_opts : "" + bwa_sampe_opts = params.bwa_sampe_opts ? params.bwa_sampe_opts : "" + template "mapping_query.sh" + + stub: + """ + mkdir ${task.process} + mkdir mapping + touch ${task.process}/${sample} + touch mapping/${sample} + """ +} + +//############### +//Module testing +//############### + +workflow test{ + TEST_PARAMS_CH = Channel.of([ + params.sample, + params.single_end, + path(params.fq) + ]) + TEST_PARAMS_CH2 = Channel.of( + path(params.query) + ) + mapping_query(TEST_PARAMS_CH,TEST_PARAMS_CH2.collect()) +} diff --git a/modules/bwa/mapping_query/nextflow.config b/modules/bwa/mapping_query/nextflow.config new file mode 100644 index 000000000..a1c00c66e --- /dev/null +++ b/modules/bwa/mapping_query/nextflow.config @@ -0,0 +1,48 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' 
+ mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + + conda { + process { + withName: mapping_query { + conda = "${baseDir}/../../../conda/envs/call_variants-1.7.x"} + } + } + + docker { + process { + withName: mapping_query { + container = "ghcr.io/bactopia/call_variants:1.6.0"} + + } + } + + test { + process { + withName: mapping_query { + cpus = 2 + queue = 'long' + } + + } + env { + MAPPING_FASTAS = ["query1"] + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = false + run_type = "fastqs" + } + + } +} diff --git a/modules/bwa/mapping_query/templates/mapping_query.sh b/modules/bwa/mapping_query/templates/mapping_query.sh new file mode 100644 index 000000000..0eab4d746 --- /dev/null +++ b/modules/bwa/mapping_query/templates/mapping_query.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -e +set -u +LOG_DIR="!{task.process}" +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions + +# Print captured STDERR incase of exit +function print_stderr { + cat .command.err 1>&2 + ls ${LOG_DIR}/ | grep ".err" | xargs -I {} cat ${LOG_DIR}/{} 1>&2 +} +trap print_stderr EXIT + +# Verify AWS files were staged +if [[ ! -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --extra !{query} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} --extra !{query} + fi +fi + +avg_len=`seqtk fqchk !{fq[0]} | head -n 1 | sed -r 's/.*avg_len: ([0-9]+).*;.*/\1/'` +ls *.fasta | xargs -I {} grep -H "^>" {} | awk '{print $1}' | sed 's/.fasta:>/\t/' > mapping.txt +cat *.fasta > multifasta.fa + +echo "# bwa Version" >> ${LOG_DIR}/!{task.process}.versions +bwa 2>&1 | grep "Version" >> ${LOG_DIR}/!{task.process}.versions 2>&1 +bwa index multifasta.fa > ${LOG_DIR}/bwa-index.out 2> ${LOG_DIR}/bwa-index.err +if [ "${avg_len}" -gt "70" ]; then + bwa mem -M -t !{task.cpus} !{bwa_mem_opts} multifasta.fa !{fq} > bwa.sam +else + if [ "!{single_end}" == "true" ]; then + bwa aln -f bwa.sai -t !{task.cpus} !{bwa_aln_opts} multifasta.fa !{fq[0]} > ${LOG_DIR}/bwa-aln.out 2> ${LOG_DIR}/bwa-aln.err + bwa samse -n !{params.bwa_n} !{bwa_samse_opts} multifasta.fa bwa.sai !{fq[0]} > bwa.sam 2> ${LOG_DIR}/bwa-samse.err + else + bwa aln -f r1.sai -t !{task.cpus} !{bwa_aln_opts} multifasta.fa !{fq[0]} > ${LOG_DIR}/bwa-aln.out 2> ${LOG_DIR}/bwa-aln.err + bwa aln -f r2.sai -t !{task.cpus} !{bwa_aln_opts} multifasta.fa !{fq[1]} >> ${LOG_DIR}/bwa-aln.out 2>> ${LOG_DIR}/bwa-aln.err + bwa sampe -n !{params.bwa_n} !{bwa_sampe_opts} multifasta.fa r1.sai r2.sai !{fq[0]} !{fq[1]} > bwa.sam 2> ${LOG_DIR}/bwa-sampe.err + fi +fi +# Write per-base coverage +echo "# samtools Version" >> ${LOG_DIR}/!{task.process}.versions +samtools 2>&1 | grep "Version" >> ${LOG_DIR}/!{task.process}.versions 2>&1 +samtools view -bS bwa.sam | samtools sort -o cov.bam - > ${LOG_DIR}/samtools.out 2> ${LOG_DIR}/samtools.err + +echo "# bedtools Version" >> ${LOG_DIR}/!{task.process}.versions +bedtools --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 +genomeCoverageBed -ibam cov.bam -d > cov.txt 2> ${LOG_DIR}/genomeCoverageBed.err +split-coverages.py mapping.txt cov.txt --outdir mapping + +if [[ !{params.compress} == "true" ]]; then + pigz --best -n -p !{task.cpus} mapping/*.txt +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp 
.command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/bwa/mapping_query/test_params.yaml b/modules/bwa/mapping_query/test_params.yaml new file mode 100644 index 000000000..6d186028a --- /dev/null +++ b/modules/bwa/mapping_query/test_params.yaml @@ -0,0 +1,53 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +single_end: + "false" + +sample_type: + "single_end" + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" + +query: + "test_data/dumb-gene.fasta" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + false + +skip_logs: + false + +bwa_mem_opts: + null + +bwa_aln_opts: + null + +bwa_samse_opts: + null + +bwa_sampe_opts: + null + +bwa_n: + 9999 + +keep_unmapped_reads: + false + +compress: + false diff --git a/modules/mash/antimicrobial_resistance/README.md b/modules/mash/antimicrobial_resistance/README.md new file mode 100644 index 000000000..c75f5b244 --- /dev/null +++ b/modules/mash/antimicrobial_resistance/README.md @@ -0,0 +1,17 @@ +# antimicrobial_resistance process testing: + +This process queries nucleotides and proteins (SNPs/InDels) against one or more reference genomes selected based on their Mash distance from the input. + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run antimicrobial_resistance.nf -params-file test_params.yaml -profile test,docker -entry test + + +if you've used `bactopia conda activate` you can also trade `docker` by conda to test with conda. diff --git a/modules/mash/antimicrobial_resistance/antimicrobial_resistance.nf b/modules/mash/antimicrobial_resistance/antimicrobial_resistance.nf new file mode 100644 index 000000000..7b8ad1388 --- /dev/null +++ b/modules/mash/antimicrobial_resistance/antimicrobial_resistance.nf @@ -0,0 +1,57 @@ +nextflow.enable.dsl = 2 + +process ANTIMICROBIAL_RESISTANCE { + /* + Query nucleotides and proteins (SNPs/InDels) against one or more reference genomes selected based + on their Mash distance from the input. + */ + tag "${sample}" + + publishDir "${outdir}/${sample}", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "logs/*" + publishDir "${outdir}/${sample}", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${amrdir}/*" + + input: + tuple val(sample), path(genes), path(proteins) + each path(amrdb) + + output: + path "${amrdir}/*" + path "logs/*" optional true + + shell: + amrdir = "antimicrobial-resistance" + plus = params.amr_plus ? "--plus" : "" + report_common = params.amr_report_common ? 
"--report_common" : "" + organism_gene = "" + organism_protein = "" + if (params.amr_organism) { + organism_gene = "-O ${params.amr_organism} --point_mut_all ${amrdir}/${sample}-gene-point-mutations.txt" + organism_protein = "-O ${params.amr_organism} --point_mut_all ${amrdir}/${sample}-protein-point-mutations.txt" + } + template "antimicrobial_resistance.sh" + + stub: + amrdir = "antimicrobial-resistance" + """ + mkdir ${amrdir} + mkdir logs + touch ${amrdir}/${sample} + touch logs/${sample} + """ +} + +//############### +//Module testing +//############### + +workflow test { + TEST_PARAMS_CH = Channel.of([ + params.sample, + path(params.genes), + path(params.proteins) + ]) + TEST_PARAMS_CH2 = Channel.of( + path(params.amrdb) + ) + antimicrobial_resistance(TEST_PARAMS_CH,TEST_PARAMS_CH2.collect()) +} diff --git a/modules/mash/antimicrobial_resistance/bin/check-staging.py b/modules/mash/antimicrobial_resistance/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/mash/antimicrobial_resistance/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/mash/antimicrobial_resistance/nextflow.config b/modules/mash/antimicrobial_resistance/nextflow.config new file mode 100644 index 000000000..1c27976c4 --- /dev/null +++ b/modules/mash/antimicrobial_resistance/nextflow.config @@ -0,0 +1,47 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' 
+ mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + conda { + process { + withName: antimicrobial_resistance { + conda = "${baseDir}/../../../conda/envs/antimicrobial_resistance-1.7.x"} + } + } + + docker { + process { + withName: antimicrobial_resistance { + container = "ghcr.io/bactopia/antimicrobial_resistance:1.6.0"} + + } + } + + test { + + process { + withName: antimicrobial_resistance{ + cpus = 2 + queue = 'long' + } + + } + env { + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "test" + run_type = "fastqs" + } + + } +} diff --git a/modules/mash/antimicrobial_resistance/templates/antimicrobial_resistance.sh b/modules/mash/antimicrobial_resistance/templates/antimicrobial_resistance.sh new file mode 100644 index 000000000..6e15fbc60 --- /dev/null +++ b/modules/mash/antimicrobial_resistance/templates/antimicrobial_resistance.sh @@ -0,0 +1,61 @@ +#!/bin/bash +set -e +set -u + +LOG_DIR="logs/!{task.process}" +mkdir -p ${LOG_DIR} + +# Print captured STDERR incase of exit +function print_stderr { + cat .command.err 1>&2 + ls ${LOG_DIR}/ | grep ".err" | xargs -I {} cat ${LOG_DIR}/{} 1>&2 +} +trap print_stderr EXIT + +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions + +# Verify AWS files were staged +if [[ ! -L "!{genes} " ]]; then + check-staging.py --fq1 !{genes} --fq2 !{proteins} --extra !{amrdb} +fi + +if [[ !{params.compress} == "true" ]]; then + gzip -cd !{genes} > !{sample}.ffn + gzip -cd !{proteins} > !{sample}.faa +fi + +tar -xzvf !{amrdb} +mkdir !{amrdir} + +# amrfinder Version +echo "# amrfinder Version" >> ${LOG_DIR}/!{task.process}.versions +amrfinder --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 +amrfinder -n !{sample}.ffn \ + -d amrfinderdb/ \ + -o !{amrdir}/!{sample}-gene-report.txt \ + --ident_min !{params.amr_ident_min} \ + --coverage_min !{params.amr_coverage_min} \ + --translation_table !{params.amr_translation_table} \ + --threads !{task.cpus} !{organism_gene} !{plus} !{report_common} > ${LOG_DIR}/amrfinder-gene.out 2> ${LOG_DIR}/amrfinder-gene.err + +amrfinder -p !{sample}.faa \ + -d amrfinderdb/ \ + -o !{amrdir}/!{sample}-protein-report.txt \ + --ident_min !{params.amr_ident_min} \ + --coverage_min !{params.amr_coverage_min} \ + --translation_table !{params.amr_translation_table} \ + --threads !{task.cpus} !{organism_protein} !{plus} !{report_common} > ${LOG_DIR}/amrfinder-protein.out 2> ${LOG_DIR}/amrfinder-protein.err + +if [[ !{params.compress} == "true" ]]; then + rm !{sample}.faa !{sample}.ffn +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/mash/antimicrobial_resistance/test_params.yaml b/modules/mash/antimicrobial_resistance/test_params.yaml new file mode 100644 index 000000000..d53ad15b2 --- /dev/null +++ b/modules/mash/antimicrobial_resistance/test_params.yaml @@ -0,0 +1,56 @@ +outdir: + "test_output" + +sample: + "TEST_SAMPLE" + +single_end: + "test" + +genes: + "test_data/SRR2838702.ffn" + +proteins: + "test_data/SRR2838702.faa" + +amrdb: + "test_data/amrfinderdb.tar.gz" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + "some_value" + +update_amr: + false + +amr_ident_min: 
+ -1 + +amr_coverage_min: + 0.5 + +amr_organism: + null + +amr_translation_table: + 11 + +amr_plus: + false + +amr_report_common: + false + +skip_logs: + false + +compress: + false diff --git a/modules/mash/estimate_genome_size/README.md b/modules/mash/estimate_genome_size/README.md new file mode 100644 index 000000000..3516467cb --- /dev/null +++ b/modules/mash/estimate_genome_size/README.md @@ -0,0 +1,17 @@ +# estimate_genome_size process testing: + +This process estimate a genome size using Mash. + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run estimate_genome_size.nf -profile test,docker -params-file test_params.yaml -entry test + + +if you've used `bactopia conda activate` you can also trade `docker` by `conda` to test with conda. diff --git a/modules/mash/estimate_genome_size/bin/build-containers.sh b/modules/mash/estimate_genome_size/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . 
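+    # Note: unlike update-docker.sh, this script does not appear to enable
+    # 'set -e', so a failed build here would still fall through to the push below.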
+ docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/mash/estimate_genome_size/bin/check-assembly-accession.py b/modules/mash/estimate_genome_size/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/mash/estimate_genome_size/bin/check-fastqs.py b/modules/mash/estimate_genome_size/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. 
Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/mash/estimate_genome_size/bin/check-staging.py b/modules/mash/estimate_genome_size/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/mash/estimate_genome_size/bin/cleanup-coverage.py b/modules/mash/estimate_genome_size/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/modules/mash/estimate_genome_size/bin/create-tool.sh b/modules/mash/estimate_genome_size/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# build-containers +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguement" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/mash/estimate_genome_size/bin/gh-actions/free-disk-space.sh b/modules/mash/estimate_genome_size/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. 
+# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. +# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/mash/estimate_genome_size/bin/gh-actions/setup-bactopia-env.sh b/modules/mash/estimate_genome_size/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/mash/estimate_genome_size/bin/gh-actions/setup-docker-builds.py b/modules/mash/estimate_genome_size/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. 
+ --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def 
docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on 
{dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/mash/estimate_genome_size/bin/helpers/bactopia-build.py b/modules/mash/estimate_genome_size/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + 
"""Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') 
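Note on the retry logic above: build_conda_env() keeps re-running `conda env create`, sleeping 30 seconds longer after each failure, and only lets the command fail hard (allow_fail=True) once the retry budget is exceeded; docker_push() in setup-docker-builds.py is built around the same loop. Below is a minimal, self-contained sketch of that pattern for reference only; `run_with_retry`, `run_command`, and `delay` are illustrative names (the helper scripts use their own execute()/ExternalCommand wrapper with a fixed 30-second step), and this sketch is not part of the patch.

    import time

    def run_with_retry(cmd, run_command, max_retry=5, delay=30):
        """Re-run `cmd` until it succeeds, backing off after each failure.

        `run_command` is a hypothetical stand-in for the execute() wrapper used
        by the helper scripts: it returns a truthy value on success, a falsy
        value on a tolerated failure, and is expected to raise or exit when
        called with allow_fail=True.
        """
        retry = 0
        allow_fail = False
        success = False
        while not success:
            result = run_command(cmd, allow_fail=allow_fail)
            if not result:
                if retry > max_retry:
                    # Retry budget spent: let the next (final) attempt fail hard.
                    allow_fail = True
                retry += 1
                time.sleep(delay * retry)  # back off: 30s, 60s, 90s, ... by default
            else:
                success = True
        return success

The linearly increasing sleep keeps transient registry or Anaconda outages from failing an entire automated build while still bounding the total wait.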
diff --git a/modules/mash/estimate_genome_size/bin/helpers/bactopia-citations.py b/modules/mash/estimate_genome_size/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. + --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/mash/estimate_genome_size/bin/helpers/bactopia-datasets.py b/modules/mash/estimate_genome_size/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! 
/usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. + +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). 
+ --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. + +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command 
found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + 
logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up {request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + 
logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! 
+ Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes 
found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' + ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' 
+ ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + 
logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + 
new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 
'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. (Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. 
AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. (Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' 
+ logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + 
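The create_summary() call that closes out this script indexes everything set up above into {outdir}/summary.json. A hand-written illustration of the shape that file takes, with placeholder file names, sizes and dates (the keys mirror create_summary() above; none of the values are real output):

    import json

    # Illustrative only: keys mirror create_summary(); values are placeholders.
    summary = {
        "antimicrobial-resistance": [
            {"name": "amrfinderdb.tar.gz", "last_update": "2021-02-18T00:00:00Z"}
        ],
        "ariba": [
            {"name": "card.tar.gz", "last_update": "2021-02-18T00:00:00Z"},
            {"name": "vfdb_core.tar.gz", "last_update": "2021-02-18T00:00:00Z"}
        ],
        "minmer": {
            "sketches": ["genbank-k21.json.gz", "genbank-k31.json.gz",
                         "genbank-k51.json.gz", "refseq-k21-s1000.msh"],
            "last_update": "2021-02-18T00:00:00Z"
        },
        "plasmid": {
            "sketches": "plsdb.msh",
            "blastdb": "plsdb.fna",
            "last_update": "2021-02-18T00:00:00Z"
        },
        "species-specific": {
            "staphylococcus-aureus": {
                "minmer": {
                    "mash": "species-specific/staphylococcus-aureus/minmer/refseq-genomes.msh",
                    "last_updated": "2021-02-18T00:00:00Z"
                },
                "annotation": {
                    "proteins": "species-specific/staphylococcus-aureus/annotation/proteins.faa",
                    "last_updated": "2021-02-18T00:00:00Z"
                },
                "genome_size": {"min": 2697521, "median": 2831301, "mean": 2838402,
                                "max": 3085555, "total": 500,
                                "description": "Genome size values are based on 500 completed genomes (RefSeq)."},
                "mlst": {
                    "saureus": {
                        "ariba": "species-specific/staphylococcus-aureus/mlst/saureus/saureus-ariba.tar.gz",
                        "blast": "species-specific/staphylococcus-aureus/mlst/saureus/saureus-blastdb.tar.gz",
                        "last_updated": "2021-02-18T00:00:00Z"
                    }
                },
                "optional": {
                    "blast": [
                        "species-specific/staphylococcus-aureus/optional/blast/genes",
                        "species-specific/staphylococcus-aureus/optional/blast/primers",
                        "species-specific/staphylococcus-aureus/optional/blast/proteins"
                    ],
                    "mapping-sequences": "species-specific/staphylococcus-aureus/optional/mapping-sequences",
                    "reference-genomes": "species-specific/staphylococcus-aureus/optional/reference-genomes"
                }
            }
        }
    }
    print(json.dumps(summary, indent=4))

The optional/ paths are created empty by create_summary() so users can drop their own FASTAs into them; the setup_* helpers above fill in everything else.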
create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/mash/estimate_genome_size/bin/helpers/bactopia-prepare.py b/modules/mash/estimate_genome_size/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. + Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. 
Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQs + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Assembly file name (minus extension) is used as the sample name + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more
than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/mash/estimate_genome_size/bin/helpers/bactopia-pull.py b/modules/mash/estimate_genome_size/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
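
The image-building loop above retries `singularity build` with an increasing sleep before giving up. A simplified equivalent using subprocess instead of the script's executor wrapper; the image name and Docker URI shown are placeholders:

    import subprocess
    import time

    def build_with_retry(image, docker_uri, max_retry=5):
        """Retry 'singularity build' with a growing pause between attempts (sketch)."""
        for attempt in range(1, max_retry + 1):
            result = subprocess.run(["singularity", "build", image, docker_uri])
            if result.returncode == 0:
                return True
            time.sleep(30 * attempt)   # back off a little longer each retry
        return False

    # Placeholder image name and URI:
    # build_with_retry("bactopia-minmers-1.6.0.img", "docker://bactopia/minmers:1.6.0")
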
a/modules/mash/estimate_genome_size/bin/helpers/bactopia-search.py b/modules/mash/estimate_genome_size/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = 
[] + filtered = {'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/mash/estimate_genome_size/bin/helpers/bactopia-summary.py b/modules/mash/estimate_genome_size/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/mash/estimate_genome_size/bin/helpers/bactopia-tools.py b/modules/mash/estimate_genome_size/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/mash/estimate_genome_size/bin/helpers/bactopia-versions.py b/modules/mash/estimate_genome_size/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
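
The Conda environment check above boils down to comparing the first line of two checksum files. A minimal sketch of that comparison; the file paths are hypothetical examples:

    def needs_rebuild(expected_md5_file, built_md5_file):
        """Return True when the stored checksums differ (sketch of the md5 check above)."""
        with open(expected_md5_file) as fh:
            expected = fh.readline().rstrip()
        with open(built_md5_file) as fh:
            current = fh.readline().rstrip()
        return expected != current

    # Hypothetical paths:
    # needs_rebuild("tools/roary/environment-linux.md5", "conda/envs/tools-roary-1.6.x/env-built.txt")
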
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/mash/estimate_genome_size/bin/mask-consensus.py b/modules/mash/estimate_genome_size/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
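
The version report is produced by scraping `package=version` pins out of the exported Conda YAMLs. A standalone sketch of that parsing step, with a hypothetical YAML path:

    def read_versions(yaml_path):
        """Collect 'package=version' pins from a 'conda env export' YAML (sketch)."""
        versions = {}
        with open(yaml_path) as fh:
            for line in fh:
                line = line.strip()
                if line.startswith("-") and "=" in line:
                    package, version = line.lstrip("- ").split("=")[:2]
                    versions[package] = version
        return versions

    # Hypothetical usage: read_versions("conda/linux/minmers.yml")
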
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/mash/estimate_genome_size/bin/merge-blast-json.py b/modules/mash/estimate_genome_size/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/mash/estimate_genome_size/bin/mlst-blast.py b/modules/mash/estimate_genome_size/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
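
The per-base masking rules applied by mask-consensus can be summarised as: keep the reference base when depth is at or above --mincov, lowercase it when a substitution was called at that position, mask with 'N' when depth is non-zero but below the cutoff, and with 'n' when there is no coverage at all. A minimal sketch:

    def mask_base(ref_base, depth, is_substitution, mincov=10):
        """Apply the masking rules: lowercase = called substitution,
        'N' = covered below mincov, 'n' = no coverage at all (sketch)."""
        if depth >= mincov:
            return ref_base.lower() if is_substitution else ref_base
        return "N" if depth else "n"

    print(mask_base("A", 3, False))   # N (covered, but below the cutoff)
    print(mask_base("G", 25, True))   # g (well covered, substitution called)
    print(mask_base("T", 0, False))   # n (no coverage)
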
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/mash/estimate_genome_size/bin/select-references.py b/modules/mash/estimate_genome_size/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
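
The sequence type is assigned by joining the sorted perfect-match alleles into a key and looking it up in the profile table. A toy example with a hypothetical three-locus scheme and made-up ST numbers; real schemes typically have seven or more loci:

    # Hypothetical profile table: sorted allele pattern -> ST.
    profile = {"arcC.1;aroE.4;glpF.1": "5", "arcC.3;aroE.3;glpF.1": "30"}

    perfect_matches = ["glpF.1", "arcC.3", "aroE.3"]   # best hit per locus, all exact
    total_loci = 3

    st = "ND"
    if len(perfect_matches) == total_loci:
        pattern = ";".join(sorted(perfect_matches))
        st = profile.get(pattern, "Novel")

    print(st)   # 30
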
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/mash/estimate_genome_size/bin/split-coverages.py b/modules/mash/estimate_genome_size/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
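
The reference selection walks Mash distances from smallest to largest, breaking ties by accession order (or randomly with --random_tie_break) until the requested number of references is collected. A standalone sketch with hypothetical distances:

    import random
    from collections import defaultdict

    # Hypothetical Mash output lines: reference<TAB>distance.
    lines = ["GCF_000013425.1\t0.001", "GCF_001548295.1\t0.001", "GCF_900475245.1\t0.004"]
    total, random_tie_break = 2, False

    by_distance = defaultdict(list)
    for line in lines:
        reference, distance = line.rstrip().split("\t")
        by_distance[distance].append(reference)

    selected = []
    for distance, references in sorted(by_distance.items(), key=lambda kv: float(kv[0])):
        references = random.sample(references, len(references)) if random_tie_break else sorted(references)
        for reference in references:
            selected.append((reference, distance))
            if len(selected) == total:
                break
        if len(selected) == total:
            break

    print(selected)   # [('GCF_000013425.1', '0.001'), ('GCF_001548295.1', '0.001')]
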
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/mash/estimate_genome_size/bin/update-conda.sh b/modules/mash/estimate_genome_size/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
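
split-coverages groups the genomeCoverageBed rows by their source FASTA (via the name-mapping file) and writes one coverage block per FASTA. A minimal sketch with hypothetical rows; the `##contig=<ID=...,length=...>` header shown is an assumed layout:

    from collections import defaultdict

    # Hypothetical mapping (contig -> source FASTA) and genomeCoverageBed rows.
    mappings = {"contig_1": "GCF_001548295", "contig_2": "GCF_001548295"}
    rows = [("contig_1", "1", "12"), ("contig_1", "2", "15"), ("contig_2", "1", "9")]

    coverages = defaultdict(lambda: defaultdict(list))
    for entry, position, depth in rows:
        coverages[mappings[entry]][entry].append(depth)

    for fasta, contigs in coverages.items():
        print(f"##total={len(contigs)}")                          # one block per source FASTA
        for entry, depths in contigs.items():
            print(f"##contig=<ID={entry},length={len(depths)}>")  # assumed header layout
            print("\n".join(depths))
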
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/mash/estimate_genome_size/bin/update-docker.sh b/modules/mash/estimate_genome_size/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/mash/estimate_genome_size/bin/update-tools.sh b/modules/mash/estimate_genome_size/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/mash/estimate_genome_size/bin/update-version.sh b/modules/mash/estimate_genome_size/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/mash/estimate_genome_size/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then
+    IGNORE=${DIRECTORY}/data/version-ignore.txt
+    EXCLUDE=${DIRECTORY}/data/version-excludes.txt
+    for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do
+        if [[ "${file}" == *"bactopia" ]]; then
+            # bactopia
+            shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file}
+        elif [[ "${file}" == *".version" ]]; then
+            # Conda
+            conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file}
+        elif [[ "${file}" == *"Dockerfile" ]]; then
+            # Docker
+            generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file}
+        elif [[ "${file}" == *"nextflow.config" ]]; then
+            # Nextflow Config
+            generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file}
+            generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file}
+        elif [[ "${file}" == *"Singularity" ]]; then
+            # Singularity
+            generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file}
+        elif [[ "${file}" == *".py" ]]; then
+            # Python
+            python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file}
+        elif [[ "${file}" == *".sh" ]]; then
+            # Shell
+            shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file}
+        else
+            echo "Unknown: ${file}"
+        fi
+    done
+else
+    echo "Unable to execute '${DIRECTORY}/bactopia'"
+    echo "Please verify '${DIRECTORY}' points to the bactopia repo."
+    exit 1
+fi
diff --git a/modules/mash/estimate_genome_size/estimate_genome_size.nf b/modules/mash/estimate_genome_size/estimate_genome_size.nf
new file mode 100644
index 000000000..b6d198ca6
--- /dev/null
+++ b/modules/mash/estimate_genome_size/estimate_genome_size.nf
@@ -0,0 +1,50 @@
+nextflow.enable.dsl = 2
+
+process ESTIMATE_GENOME_SIZE {
+    /* Estimate the input genome size if not given. */
+    tag "${sample}"
+
+    publishDir "${params.outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*"
+    publishDir "${params.outdir}/${sample}", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: '*.txt'
+
+    input:
+    tuple val(sample), val(sample_type), val(single_end), path(fq), path(extra)
+
+    output:
+    path "${sample}-genome-size-error.txt" optional true
+    path("${sample}-genome-size.txt") optional true
+    tuple val(sample), val(sample_type), val(single_end),
+        path("fastqs/${sample}*.fastq.gz"), path(extra), path("${sample}-genome-size.txt"), emit: QUALITY_CONTROL, optional: true
+    path "${task.process}/*" optional true
+
+    shell:
+    genome_size = SPECIES_GENOME_SIZE
+
+    template "estimate_genome_size.sh"
+
+    stub:
+    """
+    mkdir fastqs
+    mkdir ${task.process}
+    touch ${sample}-genome-size-error.txt
+    touch ${sample}-genome-size.txt
+    touch fastqs/${sample}.fastq.gz
+    touch ${task.process}/*
+    """
+}
+
+//###############
+//Module testing
+//###############
+
+workflow test {
+    TEST_PARAMS_CH = Channel.of([
+        params.sample,
+        params.sample_type,
+        params.single_end,
+        file(params.fq),
+        file(params.extra)
+    ])
+
+    ESTIMATE_GENOME_SIZE(TEST_PARAMS_CH)
+}
diff --git a/modules/mash/estimate_genome_size/nextflow.config b/modules/mash/estimate_genome_size/nextflow.config
new file mode 100644
index 000000000..5aa9cf87d
--- /dev/null
+++ b/modules/mash/estimate_genome_size/nextflow.config
@@ -0,0 +1,49 @@
+manifest {
+    author = 'Robert A. Petit III'
+    name = 'bactopia'
+    homePage = 'https://github.com/bactopia/bactopia'
+    description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.'
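+    // This manifest mirrors the top-level Bactopia pipeline metadata; the
+    // profiles further down select the Conda environment, Docker image, or
+    // test settings used when this module is run on its own.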
+ mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + + conda { + process { + withName: estimate_genome_size { + conda = "${baseDir}/../../../conda/envs/minmers-1.7.x"} + } + } + + docker { + process { + withName: estimate_genome_size { + container = "ghcr.io/bactopia/minmers:1.6.0"} + + } + } + + test { + process { + echo = true + withName: estimate_genome_size { + cpus = 2 + queue = 'long' + } + + } + env { + SPECIES_GENOME_SIZE = "null" + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "test" + run_type = "fastqs" + } + + } +} \ No newline at end of file diff --git a/modules/mash/estimate_genome_size/templates/estimate_genome_size.sh b/modules/mash/estimate_genome_size/templates/estimate_genome_size.sh new file mode 100644 index 000000000..63b6917c1 --- /dev/null +++ b/modules/mash/estimate_genome_size/templates/estimate_genome_size.sh @@ -0,0 +1,115 @@ +#!/bin/bash +set -e +set -u +OUTPUT="!{sample}-genome-size.txt" +LOG_DIR="!{task.process}" +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions + +# Verify AWS files were staged +if [[ ! -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --extra !{extra} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} --extra !{extra} + fi +fi + +if [ "!{genome_size}" == "null" ]; then + # Use mash to estimate the genome size, if a genome size cannot be + # estimated set the genome size to 0 + echo "# Mash Version" >> ${LOG_DIR}/!{task.process}.versions + mash --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + if [ "!{single_end}" == "false" ]; then + mash sketch -o test -k 31 -m 3 -r !{fq[0]} !{fq[1]} 2>&1 | \ + grep "Estimated genome size:" | \ + awk '{if($4){printf("%d\n", $4)}} END {if (!NR) print "0"}' > ${OUTPUT} + else + mash sketch -o test -k 31 -m 3 !{fq[0]} 2>&1 | \ + grep "Estimated genome size:" | \ + awk '{if($4){printf("%d\n", $4)}} END {if (!NR) print "0"}' > ${OUTPUT} + fi + rm -rf test.msh + ESTIMATED_GENOME_SIZE=`head -n1 ${OUTPUT}` + + if [ ${ESTIMATED_GENOME_SIZE} -gt "!{params.max_genome_size}" ]; then + # Probably high coverage, try increasing number of kmer copies to 10 + if [ "!{single_end}" == "false" ]; then + mash sketch -o test -k 31 -m 10 -r !{fq[0]} !{fq[1]} 2>&1 | \ + grep "Estimated genome size:" | \ + awk '{if($4){printf("%d\n", $4)}} END {if (!NR) print "0"}' > ${OUTPUT} + else + mash sketch -o test -k 31 -m 10 !{fq[0]} 2>&1 | \ + grep "Estimated genome size:" | \ + awk '{if($4){printf("%d\n", $4)}} END {if (!NR) print "0"}' > ${OUTPUT} + fi + rm -rf test.msh + elif [ ${ESTIMATED_GENOME_SIZE} -lt "!{params.min_genome_size}" ]; then + # Probably low coverage, try decreasing the number of kmer copies to 1 + if [ "!{single_end}" == "false" ]; then + mash sketch -o test -k 31 -m 1 -r !{fq[0]} !{fq[1]} 2>&1 | \ + grep "Estimated genome size:" | \ + awk '{if($4){printf("%d\n", $4)}} END {if (!NR) print "0"}' > ${OUTPUT} + else + mash sketch -o test -k 31 -m 1 !{fq[0]} 2>&1 | \ + grep "Estimated genome size:" | \ + awk '{if($4){printf("%d\n", $4)}} END {if (!NR) print "0"}' > ${OUTPUT} + fi + rm -rf test.msh + fi + + ESTIMATED_GENOME_SIZE=`head -n1 ${OUTPUT}` + if [ ${ESTIMATED_GENOME_SIZE} -gt "!{params.max_genome_size}" ]; then + rm ${OUTPUT} + echo "!{sample} estimated genome size (${ESTIMATED_GENOME_SIZE} bp) exceeds the maximum + allowed genome size 
(!{params.max_genome_size} bp). If this is unexpected, please + investigate !{sample} to determine a cause (e.g. metagenomic, contaminants, etc...). + Otherwise, adjust the --max_genome_size parameter to fit your need. Further analysis + of !{sample} will be discontinued." | \ + sed 's/^\s*//' > !{sample}-genome-size-error.txt + elif [ ${ESTIMATED_GENOME_SIZE} -lt "!{params.min_genome_size}" ]; then + rm ${OUTPUT} + echo "!{sample} estimated genome size (${ESTIMATED_GENOME_SIZE} bp) is less than the minimum + allowed genome size (!{params.min_genome_size} bp). If this is unexpected, please + investigate !{sample} to determine a cause (e.g. metagenomic, contaminants, etc...). + Otherwise, adjust the --min_genome_size parameter to fit your need. Further analysis + of !{sample} will be discontinued." | \ + sed 's/^\s*//' > !{sample}-genome-size-error.txt + fi +else + # Use the genome size given by the user. (Should be >= 0) + echo "!{genome_size}" > ${OUTPUT} +fi + +# pass along FASTQs +mkdir -p fastqs +if [[ -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "false" ]; then + # Paired-End Reads + ln -s `readlink !{fq[0]}` fastqs/!{sample}_R1.fastq.gz + ln -s `readlink !{fq[1]}` fastqs/!{sample}_R2.fastq.gz + else + # Single-End Reads + ln -s `readlink !{fq[0]}` fastqs/!{sample}.fastq.gz + fi +else + if [ "!{single_end}" == "false" ]; then + # Paired-End Reads + cp !{fq[0]} fastqs/!{sample}_R1.fastq.gz + cp !{fq[1]} fastqs/!{sample}_R2.fastq.gz + else + # Single-End Reads + cp !{fq[0]} fastqs/!{sample}.fastq.gz + fi +fi + + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/mash/estimate_genome_size/test_params.yaml b/modules/mash/estimate_genome_size/test_params.yaml new file mode 100644 index 000000000..1a97edbfb --- /dev/null +++ b/modules/mash/estimate_genome_size/test_params.yaml @@ -0,0 +1,38 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +single_end: + "false" + +sample_type: + "paired-end" + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" + +extra: + "test_data/empty.fna.gz" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + "some_value" + +max_genome_size: + '18040666' + +min_genome_size: + '100000' + +skip_logs: + false diff --git a/modules/mash/estimate_genome_size/work/d8/5c04f254356b7f34402bdeb7477f57/test:estimate_genome_size/test:estimate_genome_size.sh b/modules/mash/estimate_genome_size/work/d8/5c04f254356b7f34402bdeb7477f57/test:estimate_genome_size/test:estimate_genome_size.sh new file mode 100644 index 000000000..3c7a45cc1 --- /dev/null +++ b/modules/mash/estimate_genome_size/work/d8/5c04f254356b7f34402bdeb7477f57/test:estimate_genome_size/test:estimate_genome_size.sh @@ -0,0 +1,115 @@ +#!/bin/bash +set -e +set -u +OUTPUT="SRR2838702-genome-size.txt" +LOG_DIR="test:estimate_genome_size" +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/test:estimate_genome_size.versions +date --iso-8601=seconds >> ${LOG_DIR}/test:estimate_genome_size.versions + +# Verify AWS files were staged +if [[ ! 
-L "input.1" ]]; then + if [ "false" == "true" ]; then + check-staging.py --fq1 input.1 --extra input.2 --is_single + else + check-staging.py --fq1 input.1 --fq2 null --extra input.2 + fi +fi + +if [ "1" == "null" ]; then + # Use mash to estimate the genome size, if a genome size cannot be + # estimated set the genome size to 0 + echo "# Mash Version" >> ${LOG_DIR}/test:estimate_genome_size.versions + mash --version >> ${LOG_DIR}/test:estimate_genome_size.versions 2>&1 + if [ "false" == "false" ]; then + mash sketch -o test -k 31 -m 3 -r input.1 null 2>&1 | \ + grep "Estimated genome size:" | \ + awk '{if($4){printf("%d\n", $4)}} END {if (!NR) print "0"}' > ${OUTPUT} + else + mash sketch -o test -k 31 -m 3 input.1 2>&1 | \ + grep "Estimated genome size:" | \ + awk '{if($4){printf("%d\n", $4)}} END {if (!NR) print "0"}' > ${OUTPUT} + fi + rm -rf test.msh + ESTIMATED_GENOME_SIZE=`head -n1 ${OUTPUT}` + + if [ ${ESTIMATED_GENOME_SIZE} -gt "18040666" ]; then + # Probably high coverage, try increasing number of kmer copies to 10 + if [ "false" == "false" ]; then + mash sketch -o test -k 31 -m 10 -r input.1 null 2>&1 | \ + grep "Estimated genome size:" | \ + awk '{if($4){printf("%d\n", $4)}} END {if (!NR) print "0"}' > ${OUTPUT} + else + mash sketch -o test -k 31 -m 10 input.1 2>&1 | \ + grep "Estimated genome size:" | \ + awk '{if($4){printf("%d\n", $4)}} END {if (!NR) print "0"}' > ${OUTPUT} + fi + rm -rf test.msh + elif [ ${ESTIMATED_GENOME_SIZE} -lt "100000" ]; then + # Probably low coverage, try decreasing the number of kmer copies to 1 + if [ "false" == "false" ]; then + mash sketch -o test -k 31 -m 1 -r input.1 null 2>&1 | \ + grep "Estimated genome size:" | \ + awk '{if($4){printf("%d\n", $4)}} END {if (!NR) print "0"}' > ${OUTPUT} + else + mash sketch -o test -k 31 -m 1 input.1 2>&1 | \ + grep "Estimated genome size:" | \ + awk '{if($4){printf("%d\n", $4)}} END {if (!NR) print "0"}' > ${OUTPUT} + fi + rm -rf test.msh + fi + + ESTIMATED_GENOME_SIZE=`head -n1 ${OUTPUT}` + if [ ${ESTIMATED_GENOME_SIZE} -gt "18040666" ]; then + rm ${OUTPUT} + echo "SRR2838702 estimated genome size (${ESTIMATED_GENOME_SIZE} bp) exceeds the maximum + allowed genome size (18040666 bp). If this is unexpected, please + investigate SRR2838702 to determine a cause (e.g. metagenomic, contaminants, etc...). + Otherwise, adjust the --max_genome_size parameter to fit your need. Further analysis + of SRR2838702 will be discontinued." | \ + sed 's/^\s*//' > SRR2838702-genome-size-error.txt + elif [ ${ESTIMATED_GENOME_SIZE} -lt "100000" ]; then + rm ${OUTPUT} + echo "SRR2838702 estimated genome size (${ESTIMATED_GENOME_SIZE} bp) is less than the minimum + allowed genome size (100000 bp). If this is unexpected, please + investigate SRR2838702 to determine a cause (e.g. metagenomic, contaminants, etc...). + Otherwise, adjust the --min_genome_size parameter to fit your need. Further analysis + of SRR2838702 will be discontinued." | \ + sed 's/^\s*//' > SRR2838702-genome-size-error.txt + fi +else + # Use the genome size given by the user. 
(Should be >= 0)
+    echo "1" > ${OUTPUT}
+fi
+
+# pass along FASTQs
+mkdir -p fastqs
+if [[ -L "input.1" ]]; then
+    if [ "false" == "false" ]; then
+        # Paired-End Reads
+        ln -s `readlink input.1` fastqs/SRR2838702_R1.fastq.gz
+        ln -s `readlink null` fastqs/SRR2838702_R2.fastq.gz
+    else
+        # Single-End Reads
+        ln -s `readlink input.1` fastqs/SRR2838702.fastq.gz
+    fi
+else
+    if [ "false" == "false" ]; then
+        # Paired-End Reads
+        cp input.1 fastqs/SRR2838702_R1.fastq.gz
+        cp null fastqs/SRR2838702_R2.fastq.gz
+    else
+        # Single-End Reads
+        cp input.1 fastqs/SRR2838702.fastq.gz
+    fi
+fi
+
+
+if [ "false" == "false" ]; then
+    cp .command.err ${LOG_DIR}/test:estimate_genome_size.err
+    cp .command.out ${LOG_DIR}/test:estimate_genome_size.out
+    cp .command.sh ${LOG_DIR}/test:estimate_genome_size.sh || :
+    cp .command.trace ${LOG_DIR}/test:estimate_genome_size.trace || :
+else
+    rm -rf ${LOG_DIR}/
+fi
diff --git a/modules/mccortex/count_31mers/README.md b/modules/mccortex/count_31mers/README.md
new file mode 100644
index 000000000..626041ae6
--- /dev/null
+++ b/modules/mccortex/count_31mers/README.md
@@ -0,0 +1,17 @@
+# count_31mers process testing:
+
+This process counts 31mers in the reads using McCortex.
+
+## About testing this process:
+
+With DSL2 each module can be tested separately, using a test workflow inside the process' `.nf` file; testing requires three items:
+- the local files in `test_data`
+- params in `test_params.yaml`
+- the `test` profile in `nextflow.config`
+
+## How to test it:
+
+$ nextflow run count_31mers.nf -params-file test_params.yaml -profile test,docker -entry test
+
+
+If you've used `bactopia conda activate`, you can also swap `docker` for `conda` to test with Conda.
diff --git a/modules/mccortex/count_31mers/bin/build-containers.sh b/modules/mccortex/count_31mers/bin/build-containers.sh
new file mode 100755
index 000000000..b5a900295
--- /dev/null
+++ b/modules/mccortex/count_31mers/bin/build-containers.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+# build-containers
+#
+# Automate the building of Bactopia related containers
+VERSION=1.6.0
+CONTAINER_VERSION="${VERSION%.*}.x"
+
+function singularity_build {
+    recipe=$1
+    name=$2
+    image=$3
+    version=$4
+    latest=${5:-0}
+
+    echo "Working on ${recipe}"
+    singularity build -F ${image} ${recipe}
+    singularity sign ${image}
+    singularity push ${image} library://rpetit3/bactopia/${name}:${version}
+
+    if [[ "${latest}" == "1" ]]; then
+        singularity push ${image} library://rpetit3/bactopia/${name}:latest
+    fi
+}
+
+function docker_build {
+    recipe=$1
+    image=$2
+    latest=${3:-0}
+
+    echo "Working on ${recipe}"
+    docker build --rm -t ${image} -f ${recipe} .
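+    # Tag layout: with VERSION=1.6.0, "${VERSION%.*}.x" gives CONTAINER_VERSION=1.6.x,
+    # so the per-process and per-tool images built later in this script are tagged
+    # by minor-release series, while only a build given an explicit "latest"
+    # argument is also pushed under a latest tag.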
+ docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/mccortex/count_31mers/bin/check-assembly-accession.py b/modules/mccortex/count_31mers/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/mccortex/count_31mers/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/mccortex/count_31mers/bin/check-fastqs.py b/modules/mccortex/count_31mers/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/mccortex/count_31mers/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. 
Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/mccortex/count_31mers/bin/check-staging.py b/modules/mccortex/count_31mers/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/mccortex/count_31mers/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/mccortex/count_31mers/bin/cleanup-coverage.py b/modules/mccortex/count_31mers/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/modules/mccortex/count_31mers/bin/create-tool.sh b/modules/mccortex/count_31mers/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/mccortex/count_31mers/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# build-containers +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguement" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/mccortex/count_31mers/bin/gh-actions/free-disk-space.sh b/modules/mccortex/count_31mers/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. 
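+#
+# Within Bactopia this copy lives under bin/gh-actions/ and is meant for the
+# disposable CI runners only; it uninstalls the packages listed below, so it
+# should not be run on a developer's own machine.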
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/mccortex/count_31mers/bin/gh-actions/setup-bactopia-env.sh b/modules/mccortex/count_31mers/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/mccortex/count_31mers/bin/gh-actions/setup-docker-builds.py b/modules/mccortex/count_31mers/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. 
+ --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + 
logging.info(f'Building on {image}') + execute(f'docker build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not 
args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/mccortex/count_31mers/bin/helpers/bactopia-build.py b/modules/mccortex/count_31mers/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/mccortex/count_31mers/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, 
directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + 
envbuilt_file = f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/mccortex/count_31mers/bin/helpers/bactopia-citations.py b/modules/mccortex/count_31mers/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/mccortex/count_31mers/bin/helpers/bactopia-datasets.py b/modules/mccortex/count_31mers/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
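Several of the selection options above (--species, --ariba, and the requests later handled by setup_requests) accept a single value, a comma-separated list, or a path to a text file with one entry per line. A minimal sketch of that normalization pattern, shown without any of the validation the real functions add:

    import os

    def parse_requests(request):
        """Accept a single value, a comma-separated list, or a file of values."""
        if os.path.exists(request):
            with open(request) as fh:
                return [line.strip() for line in fh if line.strip()]
        if "," in request:
            return [value.strip() for value in request.split(",")]
        return [request.strip()]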
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
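# Illustration with hypothetical values, not part of the workflow: process_cds()
# above consumes a Biopython CDS qualifiers dict and, for non-pseudogene entries,
# returns a Prokka-style "EC~~~gene~~~product" header plus the translated sequence.
#
#   example_cds = {
#       'protein_id': ['WP_000000001.1'],
#       'ec_number': ['1.1.1.1'],
#       'gene': ['abcA'],
#       'product': ['example ABC transporter'],
#       'translation': ['MKTAYIAKQR'],
#   }
#   process_cds(example_cds)
#   # -> ['>WP_000000001.1 1.1.1.1~~~abcA~~~example ABC transporter', 'MKTAYIAKQR']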
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
+ ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' + ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + 
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/mccortex/count_31mers/bin/helpers/bactopia-prepare.py b/modules/mccortex/count_31mers/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/mccortex/count_31mers/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. 
+ Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + 
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/mccortex/count_31mers/bin/helpers/bactopia-pull.py b/modules/mccortex/count_31mers/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/mccortex/count_31mers/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
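The prepare helper above finishes by assigning each sample one of a fixed set of run types before printing the FOFN. A condensed sketch of that decision table, assuming the sample already passed the validation step (so multiple read sets imply --merge was requested):

    def assign_runtype(r1, r2, se, assembly, long_reads=False):
        """Map one sample's file lists to a Bactopia run type (condensed sketch)."""
        if assembly:
            return "assembly"
        if r1 and r2 and se and long_reads:
            return "hybrid"          # short paired-end reads plus long reads
        if r1 and r2:
            if len(r1) > 1:
                return "hybrid-merge-pe" if long_reads else "merge-pe"
            return "paired-end"
        if se:
            return "merge-se" if len(se) > 1 else "single-end"
        return ""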
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/mccortex/count_31mers/bin/helpers/bactopia-search.py b/modules/mccortex/count_31mers/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + filtered 
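# For reference, a taxon query such as "1280" (Staphylococcus aureus, as in the
# usage examples above) is first turned into tax_tree(1280) by parse_query() below,
# so the rendered ENA query string becomes approximately:
#   "tax_tree(1280) AND library_source=GENOMIC AND (library_strategy=OTHER OR
#    library_strategy=WGS OR library_strategy=WGA) AND (library_selection=MNase OR
#    library_selection=RANDOM OR library_selection=unspecified OR
#    library_selection=\"size fractionation\")"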
= {'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
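# Illustrative numbers for the filters above (values not taken from this patch):
# with --min_coverage 20 and --genome_size 2800000 (roughly a S. aureus genome),
# min_base_count becomes 20 * 2800000 = 56000000 bp, so runs reporting fewer total
# bases are filtered out. Similarly, the mean read length filter computes
# base_count / (read_count * number_of_fastq_files), e.g. 1000000000 bases over
# 2500000 read pairs (2 FASTQ files) gives 1000000000 / (2500000 * 2) = 200 bp.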
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/mccortex/count_31mers/bin/helpers/bactopia-summary.py b/modules/mccortex/count_31mers/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/mccortex/count_31mers/bin/helpers/bactopia-tools.py b/modules/mccortex/count_31mers/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/mccortex/count_31mers/bin/helpers/bactopia-versions.py b/modules/mccortex/count_31mers/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
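# For context on the rebuild check above: with VERSION = "1.6.0" the env is pinned
# to CONTAINER_VERSION "1.6.x", so a tool such as roary builds into
# <bactopia_repo>/conda/envs/tools-roary-1.6.x, and the md5 of
# tools/roary/environment-linux.yml (recorded in environment-linux.md5 and copied
# to env-built.txt after a build) decides whether that environment is reused or rebuilt.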
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/mccortex/count_31mers/bin/mask-consensus.py b/modules/mccortex/count_31mers/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/mccortex/count_31mers/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
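# Summary of the masking convention implemented in mask_sequence() above:
#   coverage >= --mincov : reference base kept (lower-cased when that position
#                          carries a substitution in the .subs VCF)
#   0 < coverage < mincov: masked with uppercase "N"
#   coverage == 0        : masked with lowercase "n"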
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/mccortex/count_31mers/bin/merge-blast-json.py b/modules/mccortex/count_31mers/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/mccortex/count_31mers/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/mccortex/count_31mers/bin/mlst-blast.py b/modules/mccortex/count_31mers/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/mccortex/count_31mers/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/mccortex/count_31mers/bin/select-references.py b/modules/mccortex/count_31mers/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/mccortex/count_31mers/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
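# Worked example of the path construction described here, matching the
# GCF_001548295 example above:
#   accession "GCF_001548295.1" -> db = "GCF", digits = "001548295"
#   re.findall('.{1,3}', '001548295') -> ['001', '548', '295'] -> "001/548/295"
#   url -> https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295
# The directory listing is then scanned for links starting with "GCF_001548295"
# (e.g. GCF_001548295.1_ASM154829v1/) to recover the current accession.version.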
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/mccortex/count_31mers/bin/split-coverages.py b/modules/mccortex/count_31mers/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/mccortex/count_31mers/bin/update-conda.sh b/modules/mccortex/count_31mers/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/mccortex/count_31mers/bin/update-docker.sh b/modules/mccortex/count_31mers/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/mccortex/count_31mers/bin/update-tools.sh b/modules/mccortex/count_31mers/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/mccortex/count_31mers/bin/update-version.sh b/modules/mccortex/count_31mers/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/mccortex/count_31mers/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
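# Note on the in-place flag handled above: SED_CMD defaults to "echo sed -i", so
# running e.g. "update-version.sh /home/bactopia/bactopia 1.6.0 1.6.1" only prints
# the sed commands that would run; passing a 4th argument of 1 switches SED_CMD to
# "sed -i" and actually edits the matched files.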
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/mccortex/count_31mers/count_31mers.nf b/modules/mccortex/count_31mers/count_31mers.nf new file mode 100644 index 000000000..0e28e108e --- /dev/null +++ b/modules/mccortex/count_31mers/count_31mers.nf @@ -0,0 +1,41 @@ +nextflow.enable.dsl = 2 + +process COUNT_31MERS { + /* Count 31mers in the reads using McCortex */ + tag "${sample}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}/kmers", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "*.ctx" + + input: + tuple val(sample), val(single_end), path(fq) + output: + path "${sample}.ctx" + path "${task.process}/*" optional true + + shell: + m = task.memory.toString().split(' ')[0].toInteger() * 1000 - 500 + template "count_31mers.sh" + + stub: + """ + mkdir ${task.process} + touch ${sample}.ctx + touch ${task.process}/${sample} + """ +} + +//############### +//Module testing +//############### + +workflow test{ + + TEST_PARAMS_CH = Channel.of([ + params.sample, + params.single_end, + path(params.fq) + ]) + + count_31mers(TEST_PARAMS_CH) +} diff --git a/modules/mccortex/count_31mers/nextflow.config b/modules/mccortex/count_31mers/nextflow.config new file mode 100644 index 000000000..cb131ce98 --- /dev/null +++ b/modules/mccortex/count_31mers/nextflow.config @@ -0,0 +1,48 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' 
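// Worked example of the memory handling in COUNT_31MERS above, using the 2 GB
// allocation defined in the test profile below: task.memory renders as "2 GB",
// so m = 2 * 1000 - 500 = 1500 and the template runs mccortex31 with "-m 1500mb",
// leaving ~500 MB of headroom for the rest of the task.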
+ mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + conda { + process { + withName: count_31mers { + conda = "${baseDir}/../../../conda/envs/count_31mers-1.6.x"} + } + } + + docker { + process { + withName: count_31mers { + container = "ghcr.io/bactopia/count_31mers:1.6.0"} + + } + } + test { + process.ext.template = {"${task.process}.sh"} + process { + echo = true + withName: count_31mers{ + cpus = 2 + memory = "2 GB" + queue = 'long' + } + + } + env { + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "test" + run_type = "fastqs" + } + + } +} diff --git a/modules/mccortex/count_31mers/templates/count_31mers.sh b/modules/mccortex/count_31mers/templates/count_31mers.sh new file mode 100644 index 000000000..6436d373e --- /dev/null +++ b/modules/mccortex/count_31mers/templates/count_31mers.sh @@ -0,0 +1,43 @@ +#!/bin/bash +set -e +set -u +LOG_DIR="!{task.process}" +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions +echo "# mccortex31 Version" >> ${LOG_DIR}/!{task.process}.versions +mccortex31 2>&1 | grep "version" >> ${LOG_DIR}/!{task.process}.versions 2>&1 + +# Verify AWS files were staged +if [[ ! -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} + fi +fi + +if [ "!{single_end}" == "false" ]; then + # Paired-End Reads + mccortex31 build -f -k 31 -s !{sample} -2 !{fq[0]}:!{fq[1]} -t !{task.cpus} -m !{m}mb -q temp_counts +else + # Single-End Reads + mccortex31 build -f -k 31 -s !{sample} -1 !{fq[0]} -t !{task.cpus} -m !{m}mb -q temp_counts +fi + +if [ "!{params.keep_singletons}" == "false" ]; then + # Clean up Cortex file (mostly remove singletons) + mccortex31 clean -q -B 2 -U2 -T2 -m !{m}mb -o !{sample}.ctx temp_counts + rm temp_counts +else + mv temp_counts !{sample}.ctx +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/mccortex/count_31mers/test_params.yaml b/modules/mccortex/count_31mers/test_params.yaml new file mode 100644 index 000000000..df097f03a --- /dev/null +++ b/modules/mccortex/count_31mers/test_params.yaml @@ -0,0 +1,35 @@ +genome_size: + "test_data/genome-size.txt" + +outdir: + "test_output" + +sample: + "SRR2838702" + +single_end: + "false" + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + false + +cortex_ram: + 8 + +keep_singletons: + false + +skip_logs: + false diff --git a/modules/minmer/minmer_query/README.md b/modules/minmer/minmer_query/README.md new file mode 100644 index 000000000..e7ffbe67b --- /dev/null +++ b/modules/minmer/minmer_query/README.md @@ -0,0 +1,17 @@ +# minmer_query process testing: + +This process queries minmer sketches against pre-computed RefSeq (Mash, k=21) and GenBank (Sourmash, k=21,31,51) + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run 
minmer_query.nf -params-file test_params.yaml -profile test,docker -entry test + + +If you've used `bactopia conda activate`, you can also swap `docker` for `conda` to test with Conda. diff --git a/modules/minmer/minmer_query/bin/build-containers.sh b/modules/minmer/minmer_query/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/minmer/minmer_query/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} argument" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/minmer/minmer_query/bin/check-assembly-accession.py b/modules/minmer/minmer_query/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/minmer/minmer_query/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#!
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/minmer/minmer_query/bin/check-fastqs.py b/modules/minmer/minmer_query/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/minmer/minmer_query/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. 
Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/minmer/minmer_query/bin/check-staging.py b/modules/minmer/minmer_query/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/minmer/minmer_query/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='R2 Fastq (paired-end only).') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/minmer/minmer_query/bin/cleanup-coverage.py b/modules/minmer/minmer_query/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/minmer/minmer_query/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig=<ID=accession,length=length> + contig = re.search(r'contig=<ID=(.*),length=(.*)>', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Reduce redundancy in per-base coverage.'
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Output from genomeBedCoverage') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=<ID={accession},length={vals["length"]}>') + for cov in vals['positions']: + print(cov) diff --git a/modules/minmer/minmer_query/bin/create-tool.sh b/modules/minmer/minmer_query/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/minmer/minmer_query/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# create-tool +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} argument" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/minmer/minmer_query/bin/gh-actions/free-disk-space.sh b/modules/minmer/minmer_query/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/minmer/minmer_query/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures.
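+# NOTE: intended for disposable CI runners (e.g. the GitHub Actions Ubuntu images) with passwordless sudo; it removes preinstalled packages, so do not run it on a workstation.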
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/minmer/minmer_query/bin/gh-actions/setup-bactopia-env.sh b/modules/minmer/minmer_query/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/minmer/minmer_query/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/minmer/minmer_query/bin/gh-actions/setup-docker-builds.py b/modules/minmer/minmer_query/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/minmer/minmer_query/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. 
+ --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker 
build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + 
logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/minmer/minmer_query/bin/helpers/bactopia-build.py b/modules/minmer/minmer_query/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/minmer/minmer_query/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + 
stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = 
f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/minmer/minmer_query/bin/helpers/bactopia-citations.py b/modules/minmer/minmer_query/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/minmer/minmer_query/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/minmer/minmer_query/bin/helpers/bactopia-datasets.py b/modules/minmer/minmer_query/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/minmer/minmer_query/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
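# An illustrative sketch (accessions and organism names below are hypothetical)
# of the "--dry-run" listing that the ncbi-genome-download call below is parsed
# against: a leading "Considering ..." header line, then one tab-separated
# "<assembly accession>\t<organism name>" row per genome.
example_dry_run = (
    "Considering the following 2 assemblies for download:\n"
    "GCF_000000000.1\tExamplella exampla strain X\n"
    "GCF_000000001.1\tExamplella exampla strain Y"
)
for line in example_dry_run.split('\n'):
    if line and not line.startswith('Considering'):
        accession, name = line.split('\t', 1)  # e.g. ('GCF_000000000.1', 'Examplella exampla strain X')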
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
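# A minimal sketch (hypothetical genome lengths) of the genome-size summary the
# code below derives and writes to genome_size.json; the patch fills the 'mean'
# field with median(), so this sketch assumes statistics.mean() is what is
# intended for that key.
from statistics import mean, median

example_sizes = [2_800_000, 2_850_000, 2_900_000]  # hypothetical genome sizes (bp)
genome_size_summary = {
    'min': min(example_sizes),
    'median': int(median(example_sizes)),
    'mean': int(mean(example_sizes)),
    'max': max(example_sizes),
    'total': len(example_sizes),
}
# -> {'min': 2800000, 'median': 2850000, 'mean': 2850000, 'max': 2900000, 'total': 3}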
+ ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' + ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + 
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
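# An abridged, illustrative example of the summary.json layout that
# create_summary() above assembles; the species, schema, file names, and dates
# shown here are hypothetical, and several keys (e.g. 'last_updated',
# 'optional', 'description') are omitted for brevity.
example_summary = {
    'antimicrobial-resistance': [{'name': 'amrfinderdb.tar.gz', 'last_update': '2021-02-18T00:00:00Z'}],
    'ariba': [{'name': 'card.tar.gz', 'last_update': '2021-02-18T00:00:00Z'}],
    'minmer': {'sketches': ['refseq-k21-s1000.msh'], 'last_update': '2021-02-18T00:00:00Z'},
    'plasmid': {'sketches': 'plsdb.msh', 'blastdb': 'plsdb.fna', 'last_update': '2021-02-18T00:00:00Z'},
    'species-specific': {
        'staphylococcus-aureus': {
            'minmer': {'mash': 'species-specific/staphylococcus-aureus/minmer/refseq-genomes.msh'},
            'annotation': {'proteins': 'species-specific/staphylococcus-aureus/annotation/proteins.faa'},
            'genome_size': {'min': 2700000, 'median': 2800000, 'mean': 2800000, 'max': 2900000},
            'mlst': {
                'saureus': {
                    'ariba': 'species-specific/staphylococcus-aureus/mlst/saureus/saureus-ariba.tar.gz',
                    'blast': 'species-specific/staphylococcus-aureus/mlst/saureus/saureus-blastdb.tar.gz',
                },
            },
        },
    },
}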
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/minmer/minmer_query/bin/helpers/bactopia-prepare.py b/modules/minmer/minmer_query/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/minmer/minmer_query/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. 
+ Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + 
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/minmer/minmer_query/bin/helpers/bactopia-pull.py b/modules/minmer/minmer_query/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/minmer/minmer_query/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
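# An illustrative example (hypothetical sample names and paths) of the
# tab-delimited FOFN that bactopia prepare prints above: one row per sample
# with the columns sample, runtype, r1, r2, and extra.
example_fofn = "\n".join([
    "sample\truntype\tr1\tr2\textra",
    "sampleA\tpaired-end\t/data/sampleA_R1.fastq.gz\t/data/sampleA_R2.fastq.gz\t",
    "sampleB\tsingle-end\t/data/sampleB.fastq.gz\t\t",
    "sampleC\tassembly\t\t\t/data/sampleC.fna.gz",
])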
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
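# A minimal sketch (hypothetical cache directory and environment file) of how
# the loop below maps each Conda environment YAML to a Singularity image name
# and a Docker URI, mirroring the f-strings used there.
import os

VERSION = "1.6.0"
registry = "quay.io"
install_path = "/home/user/.bactopia/singularity"  # hypothetical cache dir
env_file = "linux/minmer_query.yml"                # hypothetical environment file
envname = os.path.basename(env_file).replace(".yml", "")
img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img"
pull_name = f"docker://{registry}/bactopia/{envname}:{VERSION}"
# -> .../quay.io-bactopia-minmer_query-1.6.0.img pulled from
#    docker://quay.io/bactopia/minmer_query:1.6.0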
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/minmer/minmer_query/bin/helpers/bactopia-search.py b/modules/minmer/minmer_query/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/minmer/minmer_query/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + filtered = 
{'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
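# A small worked example (hypothetical numbers) of the filtering arithmetic
# used by parse_accessions() above, mirroring its formulas: mean read length is
# the base count divided by the read count times the number of FASTQ files, and
# --min_coverage with --genome_size translate into a minimum base count.
base_count = 500_000_000    # bases reported by ENA
read_count = 2_000_000      # reads reported by ENA
total_fastqs = 2            # e.g. an R1/R2 pair
min_read_length = 100
read_length = int(base_count / (read_count * total_fastqs))  # -> 125 bp
min_coverage = 50
genome_size = 2_800_000
min_base_count = min_coverage * genome_size                  # -> 140,000,000 bp
passes = read_length >= min_read_length and base_count >= min_base_count  # -> True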
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/minmer/minmer_query/bin/helpers/bactopia-summary.py b/modules/minmer/minmer_query/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/minmer/minmer_query/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/minmer/minmer_query/bin/helpers/bactopia-tools.py b/modules/minmer/minmer_query/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/minmer/minmer_query/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/minmer/minmer_query/bin/helpers/bactopia-versions.py b/modules/minmer/minmer_query/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/minmer/minmer_query/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/minmer/minmer_query/bin/mask-consensus.py b/modules/minmer/minmer_query/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/minmer/minmer_query/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/minmer/minmer_query/bin/merge-blast-json.py b/modules/minmer/minmer_query/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/minmer/minmer_query/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/minmer/minmer_query/bin/mlst-blast.py b/modules/minmer/minmer_query/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/minmer/minmer_query/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/minmer/minmer_query/bin/select-references.py b/modules/minmer/minmer_query/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/minmer/minmer_query/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/minmer/minmer_query/bin/split-coverages.py b/modules/minmer/minmer_query/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/minmer/minmer_query/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
/usr/bin/env python3
+"""
+"""
+PROGRAM = "split-coverages"
+VERSION = "1.6.0"
+
+if __name__ == '__main__':
+    import argparse as ap
+    import os
+    import sys
+    parser = ap.ArgumentParser(
+        prog=PROGRAM,
+        conflict_handler='resolve',
+        description=(
+            f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry'
+        )
+    )
+
+    parser.add_argument(
+        'mapping', metavar="FILE", type=str,
+        help='Tab-delimited file used to map entry names to original fasta file.'
+    )
+    parser.add_argument(
+        'coverage', metavar="FILE", type=str,
+        help='genomeCoverageBed output file'
+    )
+    parser.add_argument(
+        '--outdir', metavar="STR", type=str, default='coverages',
+        help='Directory to output split coverages into. (Default: coverages)'
+    )
+    parser.add_argument('--version', action='version',
+                        version=f'{PROGRAM} {VERSION}')
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(0)
+
+    args = parser.parse_args()
+
+    mappings = {}
+    with open(args.mapping, 'rt') as mapping_fh:
+        for line in mapping_fh:
+            fasta, entry = line.rstrip().split('\t')
+            mappings[entry] = fasta
+
+    coverages = {}
+    with open(args.coverage, 'rt') as coverage_fh:
+        for line in coverage_fh:
+            entry, position, depth = line.rstrip().split('\t')
+            if mappings[entry] not in coverages:
+                coverages[mappings[entry]] = {}
+
+            if entry not in coverages[mappings[entry]]:
+                coverages[mappings[entry]][entry] = []
+
+            coverages[mappings[entry]][entry].append(depth)
+
+    if not os.path.exists(args.outdir):
+        os.makedirs(args.outdir)
+
+    for fasta in coverages:
+        with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out:
+            total_entries = len(coverages[fasta])
+            coverage_out.write(f'##total={total_entries}\n')
+            for entry, depths in coverages[fasta].items():
+                coverage_out.write(f'##contig=<ID={entry},length={len(depths)}>\n')
+                for depth in depths:
+                    coverage_out.write(f'{depth}\n')
+
\ No newline at end of file
diff --git a/modules/minmer/minmer_query/bin/update-conda.sh b/modules/minmer/minmer_query/bin/update-conda.sh
new file mode 100755
index 000000000..5ef7f31c4
--- /dev/null
+++ b/modules/minmer/minmer_query/bin/update-conda.sh
@@ -0,0 +1,67 @@
+#! /bin/bash
+# Updates the conda environment yamls to bump to latest software versions.
+set -x
+set -e
+if [[ $# == 0 ]]; then
+    echo ""
+    echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC"
+    echo ""
+    echo "Example Command"
+    echo "update-conda.sh /home/bactopia/bactopia 1.0.0"
+    echo ""
+    exit
+fi
+
+
+CONDA_DIR=$1/conda
+DOCKER_DIR=$1/containers
+VERSION=$2
+IS_MAC=0
+if [ "$3" == "1" ]; then
+    echo "Creating Mac OS X yamls"
+    CONDA_DIR="${CONDA_DIR}/mac"
+    IS_MAC=1
+else
+    echo "Creating Linux yamls"
+    CONDA_DIR="${CONDA_DIR}/linux"
+fi
+
+function update_environment {
+    # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac
+    echo "Working on ${1}"
+
+    if [ "$6" == 1 ]; then
+        # Mac OS
+        # Have to replace Mac versions of some programs (date, sed, etc...)
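As an aside on split-coverages.py just above: the ##contig headers it writes are the ones mask-consensus.py and cleanup-coverage.py later parse back out of the per-FASTA coverage files. A minimal sketch of that round trip, assuming the reconstructed <ID=...,length=...> header format; the entry name and depth values are made up:

    import re

    entry, depths = 'contig00001', [12, 15, 0, 9]

    # Writer side (split-coverages.py): one header per FASTA entry, then one depth per line
    header = f'##contig=<ID={entry},length={len(depths)}>'

    # Reader side (cleanup-coverage.py / mask-consensus.py): recover the ID and expected length
    match = re.search(r'contig=<ID=(.*),length=([0-9]+)>', header)
    accession, length = match.group(1), int(match.group(2))
    assert accession == entry and length == len(depths)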
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/minmer/minmer_query/bin/update-docker.sh b/modules/minmer/minmer_query/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/minmer/minmer_query/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
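Tying update-conda.sh above to the rebuild logic in bactopia-tools.py shown earlier: the .md5 file written next to each exported environment yaml is what later decides whether a cached Conda environment is reused or rebuilt (see check_md5sum(), which compares it against the env-built.txt copy recorded at build time). A small sketch of that check, with illustrative (hypothetical) file paths:

    def md5s_match(expected_md5, built_md5):
        """Compare the first line of the expected .md5 with the one recorded at build time."""
        with open(expected_md5) as fh:
            expected = fh.readline().rstrip()
        with open(built_md5) as fh:
            built = fh.readline().rstrip()
        return expected == built

    # Hypothetical paths: when they differ, the environment is recreated with
    # `conda env create --force` and the expected .md5 is copied over env-built.txt.
    if not md5s_match('conda/linux/minmers.md5', 'conda/envs/minmers-1.6.x/env-built.txt'):
        print('conda environment is out of sync, rebuild needed')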
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/minmer/minmer_query/bin/update-tools.sh b/modules/minmer/minmer_query/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/minmer/minmer_query/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/minmer/minmer_query/bin/update-version.sh b/modules/minmer/minmer_query/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/minmer/minmer_query/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/minmer/minmer_query/minmer_query.nf b/modules/minmer/minmer_query/minmer_query.nf new file mode 100644 index 000000000..6c70c8f2f --- /dev/null +++ b/modules/minmer/minmer_query/minmer_query.nf @@ -0,0 +1,52 @@ +nextflow.enable.dsl = 2 + +process MINMER_QUERY { + /* + Query minmer sketches against pre-computed RefSeq (Mash, k=21) and + GenBank (Sourmash, k=21,31,51) + */ + tag "${sample} - ${dataset_name}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}/minmers", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "*.txt" + + input: + tuple val(sample), val(single_end), path(fq), path(sourmash) + each path(dataset) + + output: + path "*.txt" + path "${task.process}/*" optional true + + when: + MINMER_DATABASES.isEmpty() == false + + shell: + dataset_name = dataset.getName() + mash_w = params.screen_w ? "-w" : "" + fastq = single_end ? fq[0] : "${fq[0]} ${fq[1]}" + template "minmer_query.sh" + + stub: + dataset_name = dataset.getName() + """ + mkdir ${task.process} + touch ${sample}.txt + touch ${task.process}/${sample} + """ +} + +//############### +//Module testing +//############### + +workflow test { + TEST_PARAMS_CH = Channel.of([ + params.sample, + params.single_end, + path(params.fq), + path(params.sourmash) + ]) + TEST_PARAMS_CH2 = Channel.of(path(params.k21),path(params.k31),path(params.k51),path(params.refseqk21)) + minmer_query(TEST_PARAMS_CH,TEST_PARAMS_CH2.collect()) +} diff --git a/modules/minmer/minmer_query/nextflow.config b/modules/minmer/minmer_query/nextflow.config new file mode 100644 index 000000000..5a492fb5e --- /dev/null +++ b/modules/minmer/minmer_query/nextflow.config @@ -0,0 +1,47 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' 
+ mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + conda { + process { + withName: minmer_query { + conda = "${baseDir}/../../../conda/envs/minmers-1.7.x"} + } + } + + docker { + process { + withName: minmer_query { + container = "ghcr.io/bactopia/minmers:1.6.0"} + + } + } + test { + + process { + withName: minmer_query { + cpus = 2 + queue = 'long' + } + + } + env { + MINMER_DATABASES = ["21","31","51"] + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "false" + run_type = "fastqs" + } + + } +} diff --git a/modules/minmer/minmer_query/templates/minmer_query.sh b/modules/minmer/minmer_query/templates/minmer_query.sh new file mode 100644 index 000000000..b12ec2c33 --- /dev/null +++ b/modules/minmer/minmer_query/templates/minmer_query.sh @@ -0,0 +1,63 @@ +#!/bin/bash +set -e +set -u +LOG_DIR="!{task.process}/!{dataset_name}" +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions + +# Print captured STDERR incase of exit +function print_stderr { + cat .command.err 1>&2 + ls ${LOG_DIR}/ | grep ".err" | xargs -I {} cat ${LOG_DIR}/{} 1>&2 +} +trap print_stderr EXIT + +# Verify AWS files were staged +if [[ ! -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --extra !{sourmash} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} --extra !{sourmash} + fi +fi + +if [ "!{dataset_name}" == "refseq-k21-s1000.msh" ]; then + echo "# Mash Version" >> ${LOG_DIR}/!{task.process}.versions + mash --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + + printf "identity\tshared-hashes\tmedian-multiplicity\tp-value\tquery-ID\tquery-comment\n" > !{sample}-refseq-k21.txt + gzip -cd !{fastq} | \ + mash screen !{mash_w} -i !{params.screen_i} -p !{task.cpus} !{dataset} - | \ + sort -gr >> !{sample}-refseq-k21.txt 2> ${LOG_DIR}/mash.err +elif [ "!{dataset_name}" == "plsdb.msh" ]; then + echo "# Mash Version" >> ${LOG_DIR}/!{task.process}.versions + mash --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + + printf "identity\tshared-hashes\tmedian-multiplicity\tp-value\tquery-ID\tquery-comment\n" > !{sample}-plsdb-k21.txt + gzip -cd !{fastq} | \ + mash screen !{mash_w} -i !{params.screen_i} -p !{task.cpus} !{dataset} - | \ + sort -gr >> !{sample}-plsdb-k21.txt 2> ${LOG_DIR}/mash.err +elif [ "!{dataset_name}" == "genbank-k21.json.gz" ]; then + echo "# Sourmash Version" >> ${LOG_DIR}/!{task.process}.versions + sourmash --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + sourmash lca classify --query !{sourmash} --db !{dataset} > !{sample}-genbank-k21.txt 2> ${LOG_DIR}/sourmash.err +elif [ "!{dataset_name}" == "genbank-k31.json.gz" ]; then + echo "# Sourmash Version" >> ${LOG_DIR}/!{task.process}.versions + sourmash --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + sourmash lca classify --query !{sourmash} --db !{dataset} > !{sample}-genbank-k31.txt 2> ${LOG_DIR}/sourmash.err +else + echo "# Sourmash Version" >> ${LOG_DIR}/!{task.process}.versions + sourmash --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + sourmash lca classify --query !{sourmash} --db !{dataset} > !{sample}-genbank-k51.txt 2> ${LOG_DIR}/sourmash.err +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace 
${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi + diff --git a/modules/minmer/minmer_query/test_params.yaml b/modules/minmer/minmer_query/test_params.yaml new file mode 100644 index 000000000..de4986186 --- /dev/null +++ b/modules/minmer/minmer_query/test_params.yaml @@ -0,0 +1,50 @@ +outdir: + "test_output" + +sample: + "TEST_SAMPLE" + +single_end: + "SRR2838702" + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" + +sourmash: + "test_data/SRR2838702.sig" + +k21: + "test_data/genbank-k21.json.gz" + +k31: + "test_data/genbank-k31.json.gz" + +k51: + "test_data/genbank-k51.json.gz" + +refseqk21: + "test_data/refseq-k21-s1000.msh" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +skip_logs: + false + +overwrite: + false + +screen_w: + true + +screen_i: + 0.8 + +minmer_ram: + 2 diff --git a/modules/minmer/minmer_sketch/README.md b/modules/minmer/minmer_sketch/README.md new file mode 100644 index 000000000..5bfd961f2 --- /dev/null +++ b/modules/minmer/minmer_sketch/README.md @@ -0,0 +1,17 @@ +# minmer_sketch process testing: + +This process creates minmer sketches of the input FASTQs using Mash (k=21,31) and Sourmash (k=21,31,51) + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run minmer_sketch.nf -params-file test_params.yaml -profile test,docker -entry test + + +if you've used `bactopia conda activate` you can also trade `docker` by conda to test with conda. diff --git a/modules/minmer/minmer_sketch/bin/build-containers.sh b/modules/minmer/minmer_sketch/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . 
+ docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/minmer/minmer_sketch/bin/check-assembly-accession.py b/modules/minmer/minmer_sketch/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/minmer/minmer_sketch/bin/check-fastqs.py b/modules/minmer/minmer_sketch/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. 
Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/minmer/minmer_sketch/bin/check-staging.py b/modules/minmer/minmer_sketch/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/minmer/minmer_sketch/bin/cleanup-coverage.py b/modules/minmer/minmer_sketch/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/modules/minmer/minmer_sketch/bin/create-tool.sh b/modules/minmer/minmer_sketch/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# build-containers +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguement" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/minmer/minmer_sketch/bin/gh-actions/free-disk-space.sh b/modules/minmer/minmer_sketch/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. 
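+#
+# A minimal usage sketch, kept as comments so the script itself is unchanged.
+# Assumption: this is run as an early step of a CI job, before any Bactopia
+# container builds; the step names and the relative path below are
+# illustrative only and are not defined elsewhere in this patch.
+#
+#   steps:
+#     - uses: actions/checkout@v2
+#     - name: Free disk space before building containers
+#       run: bash bin/gh-actions/free-disk-space.sh
+#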
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/minmer/minmer_sketch/bin/gh-actions/setup-bactopia-env.sh b/modules/minmer/minmer_sketch/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/minmer/minmer_sketch/bin/gh-actions/setup-docker-builds.py b/modules/minmer/minmer_sketch/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. 
+ --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker 
build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + 
logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/minmer/minmer_sketch/bin/helpers/bactopia-build.py b/modules/minmer/minmer_sketch/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + 
stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = 
f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/minmer/minmer_sketch/bin/helpers/bactopia-citations.py b/modules/minmer/minmer_sketch/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/minmer/minmer_sketch/bin/helpers/bactopia-datasets.py b/modules/minmer/minmer_sketch/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
+ ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' + ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + 
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited)') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB databases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/minmer/minmer_sketch/bin/helpers/bactopia-prepare.py b/modules/minmer/minmer_sketch/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. 
+ Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + 
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/minmer/minmer_sketch/bin/helpers/bactopia-pull.py b/modules/minmer/minmer_sketch/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/minmer/minmer_sketch/bin/helpers/bactopia-search.py b/modules/minmer/minmer_sketch/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + filtered = 
{'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--min_coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print(f"--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
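+ # Note: in the summary built below, the per-filter counts come from the current query only (query_filtered), while FILTERED ACCESSIONS is the running total accumulated across all queries.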
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/minmer/minmer_sketch/bin/helpers/bactopia-summary.py b/modules/minmer/minmer_sketch/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/minmer/minmer_sketch/bin/helpers/bactopia-tools.py b/modules/minmer/minmer_sketch/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/minmer/minmer_sketch/bin/helpers/bactopia-versions.py b/modules/minmer/minmer_sketch/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/minmer/minmer_sketch/bin/mask-consensus.py b/modules/minmer/minmer_sketch/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/minmer/minmer_sketch/bin/merge-blast-json.py b/modules/minmer/minmer_sketch/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/minmer/minmer_sketch/bin/mlst-blast.py b/modules/minmer/minmer_sketch/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
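+
+example (file paths shown are illustrative only):
+  mlst-blast.py assembly.fna.gz /path/to/mlst/blastdb mlst-blastn.json --cpu 4 --compressed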
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/minmer/minmer_sketch/bin/select-references.py b/modules/minmer/minmer_sketch/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
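+    (Fallback path: check_assembly_version() calls this when the E-utilities lookup
+    fails because NCBI servers are down; the directory listing is fetched over HTTPS
+    and scraped for the latest assembly version.)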
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/minmer/minmer_sketch/bin/split-coverages.py b/modules/minmer/minmer_sketch/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/minmer/minmer_sketch/bin/update-conda.sh b/modules/minmer/minmer_sketch/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
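+        # GNU coreutils and sed are added below, likely so templates that rely on
+        # GNU-only flags (e.g. `date --iso-8601=seconds`) behave the same on macOS.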
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/minmer/minmer_sketch/bin/update-docker.sh b/modules/minmer/minmer_sketch/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
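+    # The pushes below assume an already authenticated session (docker login) for
+    # Docker Hub and for any additional registries listed in REPOSITORY.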
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/minmer/minmer_sketch/bin/update-tools.sh b/modules/minmer/minmer_sketch/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/minmer/minmer_sketch/bin/update-version.sh b/modules/minmer/minmer_sketch/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/minmer/minmer_sketch/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/minmer/minmer_sketch/minmer_sketch.nf b/modules/minmer/minmer_sketch/minmer_sketch.nf new file mode 100644 index 000000000..bf0fc34f4 --- /dev/null +++ b/modules/minmer/minmer_sketch/minmer_sketch.nf @@ -0,0 +1,50 @@ +nextflow.enable.dsl = 2 + +process MINMER_SKETCH { + /* + Create minmer sketches of the input FASTQs using Mash (k=21,31) and + Sourmash (k=21,31,51) + */ + tag "${sample}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}/minmers", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "*.{msh,sig}" + + input: + tuple val(sample), val(single_end), path(fq) + + output: + path("${sample}*.{msh,sig}") + tuple val(sample), val(single_end), path("fastqs/${sample}*.fastq.gz"), path("${sample}.sig"),emit: MINMER_QUERY + tuple val(sample), val(single_end), path("fastqs/${sample}*.fastq.gz"), path("${sample}-k31.msh"),emit: DOWNLOAD_REFERENCES + path "${task.process}/*" optional true + + shell: + fastq = single_end ? fq[0] : "${fq[0]} ${fq[1]}" + template "minmer_sketch.sh" + + stub: + """ + mkdir fastqs + mkdir ${task.process} + touch fastqs/${sample}.fastq.gz + touch ${task.process}/${sample} + touch ${sample}.sig + touch ${sample}-k31.msh + + """ +} + +//############### +//Module testing +//############### + +workflow test { + TEST_PARAMS_CH = Channel.of([ + params.sample, + params.single_end, + path(params.fq) + ]) + + minmer_sketch(TEST_PARAMS_CH) +} diff --git a/modules/minmer/minmer_sketch/nextflow.config b/modules/minmer/minmer_sketch/nextflow.config new file mode 100644 index 000000000..160e228d2 --- /dev/null +++ b/modules/minmer/minmer_sketch/nextflow.config @@ -0,0 +1,48 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' 
+ mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + conda { + process { + withName: minmer_sketch { + conda = "${baseDir}/../../../conda/envs/minmers-1.7.x"} + } + } + + docker { + process { + withName: minmer_sketch { + container = "ghcr.io/bactopia/minmers:1.6.0"} + + } + } + + test { + + process { + + withName: minmer_sketch { + cpus = 2 + queue = 'long' + } + + } + env { + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "test" + run_type = "fastqs" + } + + } +} diff --git a/modules/minmer/minmer_sketch/templates/minmer_sketch.sh b/modules/minmer/minmer_sketch/templates/minmer_sketch.sh new file mode 100644 index 000000000..57ff8c917 --- /dev/null +++ b/modules/minmer/minmer_sketch/templates/minmer_sketch.sh @@ -0,0 +1,57 @@ +#!/bin/bash +set -e +set -u +LOG_DIR="!{task.process}" +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions +echo "# Mash Version" >> ${LOG_DIR}/!{task.process}.versions +mash --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + +echo "# Sourmash Version" >> ${LOG_DIR}/!{task.process}.versions +sourmash --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + +# Verify AWS files were staged +if [[ ! -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} + fi +fi + +gzip -cd !{fastq} | mash sketch -o !{sample}-k21 -k 21 -s !{params.mash_sketch} -r -I !{sample} - +gzip -cd !{fastq} | mash sketch -o !{sample}-k31 -k 31 -s !{params.mash_sketch} -r -I !{sample} - +sourmash sketch dna -p k=21,k=31,k=51,abund,scaled=!{params.sourmash_scale} --merge !{sample} -o !{sample}.sig !{fastq} + +# pass the FASTQs along +mkdir -p fastqs +if [[ -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "false" ]; then + # Paired-End Reads + ln -s `readlink !{fq[0]}` fastqs/!{sample}_R1.fastq.gz + ln -s `readlink !{fq[1]}` fastqs/!{sample}_R2.fastq.gz + else + # Single-End Reads + ln -s `readlink !{fq[0]}` fastqs/!{sample}.fastq.gz + fi +else + if [ "!{single_end}" == "false" ]; then + # Paired-End Reads + cp !{fq[0]} fastqs/!{sample}_R1.fastq.gz + cp !{fq[1]} fastqs/!{sample}_R2.fastq.gz + else + # Single-End Reads + cp !{fq[0]} fastqs/!{sample}.fastq.gz + fi +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi + diff --git a/modules/minmer/minmer_sketch/test_params.yaml b/modules/minmer/minmer_sketch/test_params.yaml new file mode 100644 index 000000000..c8c6ef97f --- /dev/null +++ b/modules/minmer/minmer_sketch/test_params.yaml @@ -0,0 +1,32 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +single_end: + "false" + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + false + +mash_sketch: + 10000 + +sourmash_scale: + 10000 + +skip_logs: + false diff --git a/modules/prokka/annotate_genome/README.md b/modules/prokka/annotate_genome/README.md new file mode 100644 index 000000000..4f86eba74 --- /dev/null +++ b/modules/prokka/annotate_genome/README.md @@ -0,0 +1,17 @@ +# annotate process testing: + +This process annotate the assembly using Prokka, use a proteins FASTA 
if available + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run annotate_genome.nf -params-file test_params.yaml -profile test,docker -entry test + + +if you've used `bactopia conda activate` you can also trade `docker` by conda to test with conda. diff --git a/modules/prokka/annotate_genome/annotate_genome.nf b/modules/prokka/annotate_genome/annotate_genome.nf new file mode 100644 index 000000000..1a8999737 --- /dev/null +++ b/modules/prokka/annotate_genome/annotate_genome.nf @@ -0,0 +1,98 @@ +nextflow.enable.dsl = 2 + +process ANNOTATE_GENOME { + /* Annotate the assembly using Prokka, use a proteins FASTA if available */ + tag "${sample}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "annotation/${sample}*" + + input: + tuple val(sample), val(single_end), file(fq), file(fasta), file(total_contigs) + file prokka_proteins + file prodigal_tf + + output: + file "annotation/${sample}*" + tuple val(sample), file("annotation/${sample}.{ffn,ffn.gz}"),emit: PLASMID_BLAST,optional: true + tuple val(sample), + file("annotation/${sample}.{ffn,ffn.gz}"), + file("annotation/${sample}.{faa,faa.gz}"),emit: ANTIMICROBIAL_RESISTANCE, optional: true + file "${task.process}/*" optional true + + shell: + gunzip_fasta = fasta.getName().replace('.gz', '') + contig_count = total_contigs.getName().replace('total_contigs_', '') + genus = "Genus" + species = "species" + proteins = "" + if (prokka_proteins.getName() != 'EMPTY_PROTEINS') { + proteins = "--proteins ${prokka_proteins}" + if (SPECIES.contains("-")) { + genus = SPECIES.split('-')[0].capitalize() + species = SPECIES.split('-')[1] + } else { + genus = SPECIES.capitalize() + species = "spp." + } + } + + prodigal = "" + if (prodigal_tf.getName() != 'EMPTY_TF' && !params.skip_prodigal_tf) { + prodigal = "--prodigaltf ${prodigal_tf}" + } + + compliant = params.compliant ? "--compliant" : "" + locustag = "--locustag ${sample}" + renamed = false + // Contig ID must <= 37 characters + if ("gnl|${params.centre}|${sample}_${contig_count}".length() > 37) { + locustag = "" + compliant = "--compliant" + renamed = true + } + addgenes = params.nogenes ? "" : "--addgenes" + addmrna = params.addmrna ? "--addmrna" : "" + rawproduct = params.rawproduct ? "--rawproduct" : "" + cdsrnaolap = params.cdsrnaolap ? "--cdsrnaolap" : "" + norrna = params.norrna ? "--norrna" : "" + notrna = params.notrna ? "--notrna" : "" + rnammer = params.rnammer ? "--rnammer" : "" + rfam = params.rnammer ? 
"--rfam" : "" + template "annotate_genome.sh" + + stub: + """ + mkdir annotation + mkdir ${task.process} + touch annotation/${sample} + touch annotation/${sample}.ffn + touch annotation/${sample}.ffn.gz + touch annotation/${sample}.faa + touch annotation/${sample}.faa.gz + touch "${task.process}/${sample}" + """ +} + + +//############### +//Module testing +//############### + +workflow test{ + TEST_PARAMS_CH = Channel.of([ + params.sample, + params.single_end, + file(params.fq), + file(params.fasta), + file(params.total_contigs) + ]) + TEST_PARAMS_CH2 = Channel.of( + file(params.prokka_proteins) + ) + TEST_PARAMS_CH3 = Channel.of( + file(params.prodigal_tf) + ) + + annotate_genome(TEST_PARAMS_CH,TEST_PARAMS_CH2,TEST_PARAMS_CH3) +} diff --git a/modules/prokka/annotate_genome/bin/build-containers.sh b/modules/prokka/annotate_genome/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + 
docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/prokka/annotate_genome/bin/check-assembly-accession.py b/modules/prokka/annotate_genome/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/prokka/annotate_genome/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/prokka/annotate_genome/bin/check-fastqs.py b/modules/prokka/annotate_genome/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/prokka/annotate_genome/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. 
+""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/prokka/annotate_genome/bin/check-staging.py b/modules/prokka/annotate_genome/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/prokka/annotate_genome/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/prokka/annotate_genome/bin/cleanup-coverage.py b/modules/prokka/annotate_genome/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/modules/prokka/annotate_genome/bin/create-tool.sh b/modules/prokka/annotate_genome/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/prokka/annotate_genome/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# build-containers +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguement" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/prokka/annotate_genome/bin/gh-actions/free-disk-space.sh b/modules/prokka/annotate_genome/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. 
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/prokka/annotate_genome/bin/gh-actions/setup-bactopia-env.sh b/modules/prokka/annotate_genome/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/prokka/annotate_genome/bin/gh-actions/setup-docker-builds.py b/modules/prokka/annotate_genome/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. 
+ --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + 
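+    # Illustrative sketch (hypothetical names, not from this repository) of what this
+    # function does for recipe='containers/example.Dockerfile', image='bactopia/example:1.6.0',
+    # latest='bactopia/example:latest', github=True, quay=True:
+    #   docker build --rm -t bactopia/example:1.6.0 -f containers/example.Dockerfile .
+    #   push bactopia/example:1.6.0, then tag/push bactopia/example:latest,
+    #   then tag/push the ghcr.io/... and quay.io/... copies of both tags via docker_tag().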
logging.info(f'Building on {image}') + execute(f'docker build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not 
args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/prokka/annotate_genome/bin/helpers/bactopia-build.py b/modules/prokka/annotate_genome/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/prokka/annotate_genome/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, 
directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + 
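+            # Illustrative naming convention assumed by the next few lines (the
+            # environment name 'example_tool' is hypothetical):
+            #   env_file:      <conda_envs>/linux/example_tool.yml
+            #   md5_file:      <conda_envs>/linux/example_tool.md5
+            #   prefix:        <install_path>/example_tool-1.6.x
+            #   envbuilt_file: <prefix>/env-built.txt (a copy of md5_file once the build succeeds)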
envbuilt_file = f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/prokka/annotate_genome/bin/helpers/bactopia-citations.py b/modules/prokka/annotate_genome/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/prokka/annotate_genome/bin/helpers/bactopia-datasets.py b/modules/prokka/annotate_genome/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
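+        # Sketch of the resulting per-schema layout (species/schema names illustrative):
+        #   {outdir}/{species}/mlst/{schema}/{schema}-ariba.tar.gz
+        #   {outdir}/{species}/mlst/{schema}/{schema}-blastdb.tar.gz
+        #   {outdir}/{species}/mlst/{schema}/mlst-updated.txt
+        # The unpacked ariba/ and blastdb/ directories are removed once the tarballs exist.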
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
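+                # The --dry-run output parsed below is assumed to look roughly like this
+                # (tab-separated accession and organism name; values are illustrative):
+                #   Considering 42 assemblies for download
+                #   GCF_000000000.1	Staphylococcus aureus strain example
+                # Lines starting with 'Considering' are skipped; every other non-empty
+                # line is split on the first tab into accession and name.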
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
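+                                    # For reference, headers written to passing-cds.faa above by
+                                    # process_cds() take the form '>protein_id EC~~~gene~~~product',
+                                    # e.g. (values illustrative): >WP_000000000.1 2.7.7.7~~~dnaE~~~DNA polymerase III
+                                    # This is the annotation style consumed downstream as the custom Prokka proteins file.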
+ ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' + ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + 
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited)') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB databases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of CPUs to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/prokka/annotate_genome/bin/helpers/bactopia-prepare.py b/modules/prokka/annotate_genome/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/prokka/annotate_genome/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. 
+ Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQs + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + # Use the FASTA file name (minus the extension) as the sample name + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more than one assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have an assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # R1 and R2 counts must match + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}), please check.') + elif pe_count > 2: + # Multiple paired-end read sets are only allowed when merging + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQs, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + 
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/prokka/annotate_genome/bin/helpers/bactopia-pull.py b/modules/prokka/annotate_genome/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/prokka/annotate_genome/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/prokka/annotate_genome/bin/helpers/bactopia-search.py b/modules/prokka/annotate_genome/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + 
filtered = {'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/prokka/annotate_genome/bin/helpers/bactopia-summary.py b/modules/prokka/annotate_genome/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/prokka/annotate_genome/bin/helpers/bactopia-tools.py b/modules/prokka/annotate_genome/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/prokka/annotate_genome/bin/helpers/bactopia-versions.py b/modules/prokka/annotate_genome/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/prokka/annotate_genome/bin/mask-consensus.py b/modules/prokka/annotate_genome/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/prokka/annotate_genome/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # Expecting VCF-style contig lines: ##contig=<ID=...,length=...> + contig = re.search(r'contig=<ID=(.*),length=(.*)>', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length': int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh, 'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)}) for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = 'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/prokka/annotate_genome/bin/merge-blast-json.py b/modules/prokka/annotate_genome/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/prokka/annotate_genome/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/prokka/annotate_genome/bin/mlst-blast.py b/modules/prokka/annotate_genome/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/prokka/annotate_genome/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
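The ST call here hinges on the lookup built from profile.txt above: the key is the sorted, semicolon-joined set of locus.allele strings and the value is the sequence type. A small, self-contained sketch of that mapping (build_profile() is a hypothetical helper, not part of the script):

def build_profile(col_names, rows):
    profile = {}
    for cols in rows:
        st = cols[col_names.index("ST")]
        alleles = [f"{name}.{value}" for name, value in zip(col_names, cols)
                   if name not in ("ST", "clonal_complex")]
        profile[";".join(sorted(alleles))] = st
    return profile

col_names = ["ST", "arcC", "aroE", "clonal_complex"]
profile = build_profile(col_names, [["5", "1", "4", "CC5"]])
perfect_matches = ["aroE.4", "arcC.1"]  # one perfect hit per locus
print(profile.get(";".join(sorted(perfect_matches)), "Novel"))  # -> 5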
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/prokka/annotate_genome/bin/select-references.py b/modules/prokka/annotate_genome/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/prokka/annotate_genome/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
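When NCBI's E-utilities are unavailable, select-references.py falls back to browsing the FTP mirror; use_http(), continued below, derives the directory by splitting the accession's nine digits into groups of three. A standalone illustration of that path construction (accession_to_url() is a hypothetical helper):

import re

def accession_to_url(accession):
    accession = accession.split(".")[0]               # drop the version suffix
    db, digits = accession.split("_")                 # e.g. GCF and 001548295
    grouped = "/".join(re.findall(".{1,3}", digits))  # 001/548/295
    return f"https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{grouped}"

print(accession_to_url("GCF_001548295.1"))
# https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295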
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/prokka/annotate_genome/bin/split-coverages.py b/modules/prokka/annotate_genome/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/prokka/annotate_genome/bin/update-conda.sh b/modules/prokka/annotate_genome/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
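split-coverages.py above buckets per-base depths first by the FASTA a contig came from, then by the contig itself, before writing one coverage file per FASTA. The grouping can be sketched as follows (illustration only, with made-up contig names):

from collections import defaultdict

mapping = {"contig_1": "ref_A", "contig_2": "ref_B"}  # entry -> original fasta
rows = [("contig_1", "1", "37"), ("contig_1", "2", "40"), ("contig_2", "1", "12")]

coverages = defaultdict(lambda: defaultdict(list))
for entry, _position, depth in rows:
    coverages[mapping[entry]][entry].append(depth)

for fasta, entries in coverages.items():
    # one output file per fasta, one ##contig block per entry
    print(fasta, dict(entries))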
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/prokka/annotate_genome/bin/update-docker.sh b/modules/prokka/annotate_genome/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
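update-conda.sh above records an md5 digest next to every exported environment YAML (md5sum on Linux, md5 -r on macOS); that digest is what the container-build tooling later compares to decide whether an image needs rebuilding. A rough Python equivalent of that bookkeeping, purely for illustration:

import hashlib

def md5_of_file(path):
    digest = hashlib.md5()
    with open(path, "rb") as handle:
        for block in iter(lambda: handle.read(65536), b""):
            digest.update(block)
    return digest.hexdigest()

# e.g. write annotate_genome.md5 alongside annotate_genome.yml:
# open("annotate_genome.md5", "w").write(md5_of_file("annotate_genome.yml") + "\n")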
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/prokka/annotate_genome/bin/update-tools.sh b/modules/prokka/annotate_genome/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
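Both update-docker.sh above and the other build scripts tag process containers with a minor-series tag derived from the release version ("${VERSION%.*}.x"), so patch releases within 1.6 share the 1.6.x image tags. The same derivation in Python, as a quick illustration:

version = "1.6.0"
container_version = version.rsplit(".", 1)[0] + ".x"
print(container_version)  # 1.6.x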
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/prokka/annotate_genome/bin/update-version.sh b/modules/prokka/annotate_genome/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/prokka/annotate_genome/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
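update-version.sh above dispatches each matching file to a sed rule keyed on its type; for Python scripts the rule rewrites the VERSION assignment. A Python analogue of that single substitution, shown only to make the pattern explicit:

import re

def bump_python_version(text, old, new):
    # mirrors: sed -r 's/VERSION = "OLD"/VERSION = "NEW"/'
    return re.sub(rf'VERSION = "{re.escape(old)}"', f'VERSION = "{new}"', text)

print(bump_python_version('VERSION = "1.6.0"', "1.6.0", "1.6.1"))  # VERSION = "1.6.1"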
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/prokka/annotate_genome/nextflow.config b/modules/prokka/annotate_genome/nextflow.config new file mode 100644 index 000000000..93e272240 --- /dev/null +++ b/modules/prokka/annotate_genome/nextflow.config @@ -0,0 +1,48 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' + mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + conda { + process { + withName: annotate_genome { + conda = "${baseDir}/../../../conda/envs/annotate_genome-1.7.x"} + } + } + + docker { + process { + withName: annotate_genome { + container = "ghcr.io/bactopia/annotate_genome:1.6.0"} + + } + } + + test { + process { + echo = true + withName: annotate_genome { + cpus = 2 + queue = 'long' + } + + } + env { + SPECIES = "Escherichia-coli" + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "test" + run_type = "fastqs" + } + + } +} diff --git a/modules/prokka/annotate_genome/templates/annotate_genome.sh b/modules/prokka/annotate_genome/templates/annotate_genome.sh new file mode 100644 index 000000000..f46279d5f --- /dev/null +++ b/modules/prokka/annotate_genome/templates/annotate_genome.sh @@ -0,0 +1,72 @@ +#!/bin/bash +set -e +set -u +LOG_DIR="!{task.process}" +mkdir -p ${LOG_DIR}/ + +# Print captured STDERR incase of exit +function print_stderr { + cat .command.err 1>&2 + ls ${LOG_DIR}/ | grep ".err" | xargs -I {} cat ${LOG_DIR}/{} 1>&2 +} +trap print_stderr EXIT + +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions +if [[ !{params.compress} == "true" ]]; then + gunzip -f !{fasta} +fi + +if [ "!{renamed}" == "true" ]; then + echo "Original sample name (!{sample}) not used due to creating a contig ID >37 characters" +fi + +# Verify AWS files were staged +if [[ ! 
-L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --assembly !{gunzip_fasta} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} --assembly !{gunzip_fasta} + fi +fi + +# Prokka Version +echo "# Prokka Version" >> ${LOG_DIR}/!{task.process}.versions +prokka --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 +prokka --outdir annotation \ + --force \ + --prefix '!{sample}' \ + --genus '!{genus}' \ + --species '!{species}' \ + --evalue '!{params.prokka_evalue}' \ + --coverage !{params.prokka_coverage} \ + --cpus !{task.cpus} \ + --centre '!{params.centre}' \ + --mincontiglen !{params.min_contig_len} \ + !{locustag} \ + !{prodigal} \ + !{addgenes} \ + !{compliant} \ + !{proteins} \ + !{rawproduct} \ + !{cdsrnaolap} \ + !{addmrna} \ + !{norrna} \ + !{notrna} \ + !{rnammer} \ + !{rfam} \ + !{gunzip_fasta} > ${LOG_DIR}/prokka.out 2> ${LOG_DIR}/prokka.err + +if [[ !{params.compress} == "true" ]]; then + find annotation/ -type f -not -name "*.txt" -and -not -name "*.log*" | \ + xargs -I {} pigz -n --best -p !{task.cpus} {} +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/prokka/annotate_genome/test_params.yaml b/modules/prokka/annotate_genome/test_params.yaml new file mode 100644 index 000000000..a723ccbed --- /dev/null +++ b/modules/prokka/annotate_genome/test_params.yaml @@ -0,0 +1,87 @@ +species: + "Escherichia coli" + + +outdir: + "test_output" + +sample: + "TEST_SAMPLE" + +sample_type: + "paired-end" + +single_end: + "test" + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" + +fasta: + "test_data/SRR2838702.fna" + +total_contigs: + "test_data/total_contigs" + +prokka_proteins: + "test_data/EMPTY_PROTEINS" + +prodigal_tf: + "test_data/EMPTY_TF" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + "some_value" + +compress: + false + +skip_logs: + false + +skip_prodigal_tf: + false + +compliant: + false + +centre: + 'Bactopia' + +nogenes: + false + +addmrna: + false + +rawproduct: + null + +cdsrnaolap: + null + +norrna: + null + +notrna: + null + +rnammer: + null + +prokka_evalue: + '1e-09' + +prokka_coverage: + '80' + +min_contig_len: + 500 diff --git a/modules/shovill/assemble_genome/README.md b/modules/shovill/assemble_genome/README.md new file mode 100644 index 000000000..721da776a --- /dev/null +++ b/modules/shovill/assemble_genome/README.md @@ -0,0 +1,18 @@ +# assemble_genome process testing: + +This process assemble the genome using Shovill, SKESA is used by default + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run asssemble_genome.nf -params-file test_params.yaml -profile test,docker -entry test + + +if you've used `bactopia conda activate` you can also trade `docker` by conda to test with conda. 
+ diff --git a/modules/shovill/assemble_genome/assemble_genome.nf b/modules/shovill/assemble_genome/assemble_genome.nf new file mode 100644 index 000000000..06ffbf464 --- /dev/null +++ b/modules/shovill/assemble_genome/assemble_genome.nf @@ -0,0 +1,70 @@ +nextflow.enable.dsl = 2 + +process ASSEMBLE_GENOME { + /* Assemble the genome using Shovill, SKESA is used by default */ + tag "${sample}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "assembly/*" + publishDir "${outdir}/${sample}", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${sample}-assembly-error.txt" + + input: + tuple val(sample), val(sample_type), val(single_end), path(fq), path(extra), path(genome_size) + + output: + path "assembly/*" + path "${sample}-assembly-error.txt" optional true + tuple val(sample), val(single_end), path("fastqs/${sample}*.fastq.gz"), path("assembly/${sample}.{fna,fna.gz}"),emit: SEQUENCE_TYPE, optional:true + tuple val(sample), val(single_end), path("assembly/${sample}.{fna,fna.gz}"), emit: MAKE_BLASTDB, optional: true + tuple val(sample), val(single_end), path("fastqs/${sample}*.fastq.gz"), path("assembly/${sample}.{fna,fna.gz}"), path("total_contigs_*"),emit: ANNOTATION, optional:true + tuple val(sample), path("assembly/${sample}.{fna,fna.gz}"), path(genome_size),emit: ASSEMBLY_QC, optional: true + path "${task.process}/*" optional true + + shell: + shovill_ram = task.memory.toString().split(' ')[0] + opts = params.shovill_opts ? "--opts '${params.shovill_opts}'" : "" + kmers = params.shovill_kmers ? "--kmers '${params.shovill_kmers}'" : "" + nostitch = params.nostitch ? "--nostitch" : "" + nocorr = params.nocorr ? "--nocorr" : "" + no_miniasm = params.no_miniasm ? "--no_miniasm" : "" + no_rotate = params.no_rotate ? "--no_rotate" : "" + no_pilon = params.no_pilon ? "--no_pilon" : "" + keep = params.keep_all_files ? "--keep 3" : "--keep 1" + use_original_assembly = null + if (sample_type.startsWith('assembly')) { + use_original_assembly = params.reassemble ? 
false : true + } + template "assemble_genome.sh" + + stub: + """ + mkdir assembly + mkdir fastqs + mkdir ${task.process} + touch total_contigs_${sample} + touch ${sample}-assembly-error.txt + touch fastqs/${sample}.fastq.gz + touch assembly/${sample} + touch assembly/${sample}.fna + touch assembly/${sample}.fna.gz + touch ${task.process}/${sample} + """ +} + +//############### +//Module testing +//############### + +workflow test{ + + TEST_PARAMS_CH = Channel.of([ + params.sample, + params.sample_type, + params.single_end, + path(params.fq), + path(params.extra), + path(params.genome_size) + ]) + + assemble_genome(TEST_PARAMS_CH) +} diff --git a/modules/shovill/assemble_genome/bin/build-containers.sh b/modules/shovill/assemble_genome/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + 
singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/shovill/assemble_genome/bin/check-assembly-accession.py b/modules/shovill/assemble_genome/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/shovill/assemble_genome/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/shovill/assemble_genome/bin/check-fastqs.py b/modules/shovill/assemble_genome/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/shovill/assemble_genome/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. 
+""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
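As a worked example of the proportion guard in check_basepairs() above: with R1 at 180 kbp and R2 at 150 kbp, the shared proportion is 150/180 ≈ 0.83, comfortably above a --min_proportion of 0.5.

fq1_bp, fq2_bp, min_proportion = 180_000, 150_000, 0.5
proportion = min(fq1_bp, fq2_bp) / max(fq1_bp, fq2_bp)
print(round(proportion, 4), proportion >= min_proportion)  # 0.8333 True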
+ ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/shovill/assemble_genome/bin/check-staging.py b/modules/shovill/assemble_genome/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/shovill/assemble_genome/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/shovill/assemble_genome/bin/cleanup-coverage.py b/modules/shovill/assemble_genome/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/modules/shovill/assemble_genome/bin/create-tool.sh b/modules/shovill/assemble_genome/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/shovill/assemble_genome/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# build-containers +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguement" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/shovill/assemble_genome/bin/gh-actions/free-disk-space.sh b/modules/shovill/assemble_genome/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. 
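cleanup-coverage.py above reshapes genomeCoverageBed rows (contig, position, depth) into a compact form: a per-contig header followed by one depth per line, which read_coverage() in the companion scripts parses back out. The sketch below assumes the header carries the contig ID and length as ##contig=<ID=...,length=...>:

rows = [("contig_1", 1, 35), ("contig_1", 2, 36), ("contig_1", 3, 0)]

depths = [depth for _contig, _position, depth in rows]
print(f"##contig=<ID=contig_1,length={len(depths)}>")  # assumed header layout
for depth in depths:
    print(depth)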
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/shovill/assemble_genome/bin/gh-actions/setup-bactopia-env.sh b/modules/shovill/assemble_genome/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/shovill/assemble_genome/bin/gh-actions/setup-docker-builds.py b/modules/shovill/assemble_genome/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. 
+ --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + 
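check_md5sum() above reads the conda.md5 label off the previously published image (via skopeo inspect) and compares it with the freshly computed digest of the Conda YAML; only a mismatch, or --force, triggers a rebuild, otherwise the old image is simply retagged. The decision reduces to:

def needs_rebuild(local_md5, remote_labels, force=False):
    previous = remote_labels.get("conda.md5")  # label set on the published image
    return force or previous != local_md5

print(needs_rebuild("abc123", {"conda.md5": "abc123"}))  # False -> retag previous image
print(needs_rebuild("abc123", {"conda.md5": "def456"}))  # True  -> rebuild container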
logging.info(f'Building on {image}') + execute(f'docker build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not 
args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/shovill/assemble_genome/bin/helpers/bactopia-build.py b/modules/shovill/assemble_genome/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/shovill/assemble_genome/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, 
directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + 
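
The per-environment decision made in this loop boils down to comparing two one-line MD5 files: the MD5 shipped alongside the *.yml file and the copy written to env-built.txt after the last successful build. A minimal sketch of that comparison, using hypothetical paths:

    import os

    def env_is_current(env_yml, install_path, container_version="1.6.x"):
        """True when the env was built from the same MD5 that ships with the YAML."""
        name = os.path.splitext(os.path.basename(env_yml))[0]
        prefix = f"{install_path}/{name}-{container_version}"
        built_md5 = f"{prefix}/env-built.txt"      # copied there after a successful build
        expected_md5 = env_yml.replace(".yml", ".md5")
        if not os.path.exists(built_md5):
            return False
        with open(expected_md5) as expected, open(built_md5) as built:
            return expected.readline().rstrip() == built.readline().rstrip()

    # Hypothetical call; a False result triggers 'conda env create --prefix ...'
    print(env_is_current("conda/linux/annotate_genome.yml", "/opt/bactopia/envs"))
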
envbuilt_file = f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/shovill/assemble_genome/bin/helpers/bactopia-citations.py b/modules/shovill/assemble_genome/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/shovill/assemble_genome/bin/helpers/bactopia-datasets.py b/modules/shovill/assemble_genome/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
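
Several of the helpers defined below (validate_species, setup_requests, setup_mlst_request) accept their input either as a file of values, a comma-separated list, or a single value. A minimal sketch of that shared convention:

    import os

    def parse_request(request):
        """Return values from a file, a comma-separated string, or a single value."""
        if os.path.exists(request):
            with open(request) as fh:
                return [line.strip() for line in fh if line.strip()]
        if "," in request:
            return [value.strip() for value in request.split(",")]
        return [request.strip()]

    print(parse_request("vfdb_core,card"))          # ['vfdb_core', 'card']
    print(parse_request("Staphylococcus aureus"))   # ['Staphylococcus aureus']
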
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
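
For reference, process_cds() above skips pseudogenes and writes the remaining CDS with Prokka-style headers of the form ">ID EC_number~~~gene~~~product". A small illustration of that header layout, using made-up qualifier values rather than anything from the patch:

    # Made-up qualifier values purely to show the header layout
    cds = {
        'protein_id': ['WP_000000000.1'],
        'ec_number': ['2.7.7.7'],
        'gene': ['dnaE'],
        'product': ['DNA polymerase III subunit alpha'],
        'translation': ['MSEPRFVHLRVHSDYSMIDGLAK'],
    }
    header = (f">{cds['protein_id'][0]} "
              f"{cds['ec_number'][0]}~~~{cds['gene'][0]}~~~{cds['product'][0]}")
    print(header)   # >WP_000000000.1 2.7.7.7~~~dnaE~~~DNA polymerase III subunit alpha
    print(cds['translation'][0])
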
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
+ ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' + ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + 
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/shovill/assemble_genome/bin/helpers/bactopia-prepare.py b/modules/shovill/assemble_genome/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/shovill/assemble_genome/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. 
Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. + Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + 
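+ # With --long_reads but no paired-end reads present, the single-end FASTQs are
+ # demoted to short single-end reads (is_single_end), so the sample is assigned
+ # the 'single-end' runtype below rather than 'hybrid'.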
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/shovill/assemble_genome/bin/helpers/bactopia-pull.py b/modules/shovill/assemble_genome/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/shovill/assemble_genome/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/shovill/assemble_genome/bin/helpers/bactopia-search.py b/modules/shovill/assemble_genome/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + 
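+ # Bookkeeping for filtered records: per-filter counts (records without FASTQs are
+ # tallied under 'technical') plus a per-accession list of reasons. Mean read
+ # length is estimated from the ENA metadata as base_count / (read_count * number
+ # of FASTQ files), e.g. 450,000,000 bases over 1,500,000 reads in 2 FASTQs ~= 150 bp.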
filtered = {'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
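+ # The summary below is assembled per query: accession chunks holding more than
+ # five accessions are reported by count rather than echoed in full, and any
+ # warning plus the read-length/base-count filter tallies are appended.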
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/shovill/assemble_genome/bin/helpers/bactopia-summary.py b/modules/shovill/assemble_genome/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/shovill/assemble_genome/bin/helpers/bactopia-tools.py b/modules/shovill/assemble_genome/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/shovill/assemble_genome/bin/helpers/bactopia-versions.py b/modules/shovill/assemble_genome/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/shovill/assemble_genome/bin/mask-consensus.py b/modules/shovill/assemble_genome/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/shovill/assemble_genome/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/shovill/assemble_genome/bin/merge-blast-json.py b/modules/shovill/assemble_genome/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/shovill/assemble_genome/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/shovill/assemble_genome/bin/mlst-blast.py b/modules/shovill/assemble_genome/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/shovill/assemble_genome/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/shovill/assemble_genome/bin/select-references.py b/modules/shovill/assemble_genome/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/shovill/assemble_genome/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/shovill/assemble_genome/bin/split-coverages.py b/modules/shovill/assemble_genome/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/shovill/assemble_genome/bin/update-conda.sh b/modules/shovill/assemble_genome/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/shovill/assemble_genome/bin/update-docker.sh b/modules/shovill/assemble_genome/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/shovill/assemble_genome/bin/update-tools.sh b/modules/shovill/assemble_genome/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/shovill/assemble_genome/bin/update-version.sh b/modules/shovill/assemble_genome/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/shovill/assemble_genome/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/shovill/assemble_genome/nextflow.config b/modules/shovill/assemble_genome/nextflow.config new file mode 100644 index 000000000..84e18edd8 --- /dev/null +++ b/modules/shovill/assemble_genome/nextflow.config @@ -0,0 +1,49 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' + mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + conda { + process { + withName: assemble_genome { + conda = "${baseDir}/../../../conda/envs/assemble_genome-1.7.x"} + } + } + + docker { + process { + withName: assemble_genome { + container = "ghcr.io/bactopia/assemble_genome:1.6.0"} + + } + } + + test { + process.ext.template = {"${task.process}.sh"} + process { + echo = true + withName: assemble_genome { + cpus = 2 + memory = "8 GB" + queue = 'long' + } + + } + env { + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "false" + run_type = "fastqs" + } + + } +} diff --git a/modules/shovill/assemble_genome/templates/assemble_genome.sh b/modules/shovill/assemble_genome/templates/assemble_genome.sh new file mode 100755 index 000000000..08f3b21ef --- /dev/null +++ b/modules/shovill/assemble_genome/templates/assemble_genome.sh @@ -0,0 +1,159 @@ +#!/bin/bash +set -e +set -u +OUTDIR=assembly +LOG_DIR="!{task.process}" +mkdir -p ${LOG_DIR} + +# Print captured STDERR incase of exit +function print_stderr { + cat .command.err 1>&2 + ls ${LOG_DIR}/ | grep ".err" | xargs -I {} cat ${LOG_DIR}/{} 1>&2 +} +trap print_stderr EXIT + +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions + +# Verify AWS files were staged +if [[ ! 
-L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --extra !{extra} --genome_size !{genome_size} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} --extra !{extra} --genome_size !{genome_size} + fi +fi + +GENOME_SIZE=`head -n 1 !{genome_size}` +if [ "!{sample_type}" == "hybrid" ]; then + echo "# unicycler Version" >> ${LOG_DIR}/!{task.process}.versions + unicycler --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + unicycler -1 !{fq[0]} -2 !{fq[1]} -l !{extra} \ + -o ${OUTDIR} \ + --no_correct \ + --min_fasta_length !{params.min_contig_len} \ + --threads !{task.cpus} \ + !{keep} --mode !{params.unicycler_mode} \ + !{no_miniasm} !{no_rotate} !{no_pilon} --min_polish_size !{params.min_polish_size} \ + --min_component_size !{params.min_component_size} \ + --min_dead_end_size !{params.min_dead_end_size} > ${LOG_DIR}/unicycler.out 2> ${LOG_DIR}/unicycler.err + sed -r 's/^>([0-9]+)(.*)/>gnl|\1|!{sample}\2/' ${OUTDIR}/assembly.fasta > ${OUTDIR}/!{sample}.fna + if [[ !{params.compress} == "true" ]]; then + pigz -n --best -p !{task.cpus} ${OUTDIR}/*.gfa + pigz -n --best -p !{task.cpus} ${OUTDIR}/*.fasta + fi +elif [ "!{use_original_assembly}" == "true" ]; then + mkdir ${OUTDIR} + gzip -cd !{extra} > ${OUTDIR}/!{sample}.fna +else + echo "# shovill Version" >> ${LOG_DIR}/!{task.process}.versions + shovill --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + shovill --check >> ${LOG_DIR}/!{task.process}.versions 2>&1 + + if [ "!{params.assembler}" == "spades" ]; then + echo "# SPAdes Version (this assembler was used)" >> ${LOG_DIR}/!{task.process}.versions + spades.py --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + elif [ "!{params.assembler}" == "skesa" ]; then + echo "# SKESA Version (this assembler was used)" >> ${LOG_DIR}/!{task.process}.versions + skesa --version 2>&1 | tail -n 1 >> ${LOG_DIR}/!{task.process}.versions 2>&1 + elif [ "!{params.assembler}" == "velvet" ]; then + echo "# Velvet Version (this assembler was used)" >> ${LOG_DIR}/!{task.process}.versions + velvetg | grep "^Version" >> ${LOG_DIR}/!{task.process}.versions 2>&1 + else + echo "# MEGAHIT Version (this assembler was used)" >> ${LOG_DIR}/!{task.process}.versions + megahit --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + fi + + if [ "!{single_end}" == "false" ]; then + # Paired-End Reads + shovill --R1 !{fq[0]} --R2 !{fq[1]} --depth 0 --gsize ${GENOME_SIZE} \ + --outdir ${OUTDIR} \ + --force \ + --minlen !{params.min_contig_len} \ + --mincov !{params.min_contig_cov} \ + --namefmt "!{params.contig_namefmt}" \ + --keepfiles \ + --cpus !{task.cpus} \ + --ram !{shovill_ram} \ + --assembler !{params.assembler} \ + --noreadcorr !{opts} !{kmers} !{nostitch} !{nocorr} > ${LOG_DIR}/shovill.out 2> ${LOG_DIR}/shovill.err + else + # Single-End Reads + shovill-se --se !{fq[0]} --depth 0 --gsize ${GENOME_SIZE} \ + --outdir ${OUTDIR} \ + --force \ + --minlen !{params.min_contig_len} \ + --mincov !{params.min_contig_cov} \ + --namefmt "!{params.contig_namefmt}" \ + --keepfiles \ + --cpus !{task.cpus} \ + --ram !{shovill_ram} \ + --assembler !{params.assembler} !{opts} !{kmers} !{nocorr} > ${LOG_DIR}/shovill.out 2> ${LOG_DIR}/shovill.err + fi + sed -r 's/^>(contig[0-9]+)(.*)/>gnl|\1|!{sample}\2/' ${OUTDIR}/contigs.fa > ${OUTDIR}/!{sample}.fna + if [[ !{params.compress} == "true" ]]; then + pigz -n --best -p !{task.cpus} ${OUTDIR}/contigs.fa + fi + + if [ "!{params.keep_all_files}" == "false" ]; then + # Remove intermediate files + rm -fv ${OUTDIR}/shovill.bam* 
${OUTDIR}/flash.extendedFrags* ${OUTDIR}/flash.notCombined* ${OUTDIR}/skesa.fasta.* ${OUTDIR}/*.fq.gz + fi +fi + +TOTAL_CONTIGS=`grep -c "^>" ${OUTDIR}/!{sample}.fna || true` +touch "total_contigs_${TOTAL_CONTIGS}" +if [ "${TOTAL_CONTIGS}" -gt "0" ]; then + assembly-scan ${OUTDIR}/!{sample}.fna > ${OUTDIR}/!{sample}.fna.json 2> ${LOG_DIR}/assembly-scan.err + TOTAL_CONTIG_SIZE=`grep "total_contig_length" ${OUTDIR}/!{sample}.fna.json | sed -r 's/.*: ([0-9]+)/\1/'` + if [ ${TOTAL_CONTIG_SIZE} -lt "!{params.min_genome_size}" ]; then + mv ${OUTDIR}/!{sample}.fna ${OUTDIR}/!{sample}-error.fna + mv ${OUTDIR}/!{sample}.fna.json ${OUTDIR}/!{sample}-error.fna.json + echo "!{sample} assembled size (${TOTAL_CONTIG_SIZE} bp) is less than the minimum allowed genome + size (!{params.min_genome_size} bp). If this is unexpected, please investigate !{sample} to + determine a cause (e.g. metagenomic, contaminants, etc...) for the poor assembly. + Otherwise, adjust the --min_genome_size parameter to fit your need. Further assembly + based analysis of !{sample} will be discontinued." | \ + sed 's/^\s*//' > !{sample}-assembly-error.txt + fi + + if [[ !{params.compress} == "true" ]]; then + pigz -n --best -p !{task.cpus} ${OUTDIR}/!{sample}.fna + fi +else + echo "!{sample} assembled successfully, but 0 contigs were formed. Please investigate + !{sample} to determine a cause (e.g. metagenomic, contaminants, etc...) for this + outcome. Further assembly-based analysis of !{sample} will be discontinued." | \ + sed 's/^\s*//' > !{sample}-assembly-error.txt +fi + +# pass the FASTQs along +mkdir -p fastqs +if [[ -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "false" ]; then + # Paired-End Reads + ln -s `readlink !{fq[0]}` fastqs/!{sample}_R1.fastq.gz + ln -s `readlink !{fq[1]}` fastqs/!{sample}_R2.fastq.gz + else + # Single-End Reads + ln -s `readlink !{fq[0]}` fastqs/!{sample}.fastq.gz + fi +else + if [ "!{single_end}" == "false" ]; then + # Paired-End Reads + cp !{fq[0]} fastqs/!{sample}_R1.fastq.gz + cp !{fq[1]} fastqs/!{sample}_R2.fastq.gz + else + # Single-End Reads + cp !{fq[0]} fastqs/!{sample}.fastq.gz + fi +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/shovill/assemble_genome/test_params.yaml b/modules/shovill/assemble_genome/test_params.yaml new file mode 100644 index 000000000..7a80318e9 --- /dev/null +++ b/modules/shovill/assemble_genome/test_params.yaml @@ -0,0 +1,95 @@ +genome_size: + "test_data/genome-size.txt" + +outdir: + "test_output" + +sample: + "SRR2838702" + +sample_type: + "paired-end" + +single_end: + "false" + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" + +extra: + "test_data/empty.fna.gz" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + "some_value" + +shovill_opts: + null + +shovill_kmers: + null + +nostitch: + null + +nocorr: + null + +no_miniasm: + false + +no_rotate: + false + +no_pilon: + false + +min_contig_len: + 500 + +unicycler_mode: + "normal" + +min_polish_size: + 10000 + +min_component_size: + 1000 + +min_dead_end_size: + 1000 + +compress: + false + +assembler: + 'skesa' + +min_contig_cov: + 2 + +contig_namefmt: + 'contig%05d' + +min_genome_size: + '100000' + +keep_all_files: + false + +reassemble: + false + +skip_logs: + false + +shovill_ram: + 4 diff --git 
a/modules/utilities/download_references/README.md b/modules/utilities/download_references/README.md new file mode 100644 index 000000000..a7121266d --- /dev/null +++ b/modules/utilities/download_references/README.md @@ -0,0 +1,18 @@ +# download_references process testing: + +This process downloads the nearest RefSeq genomes (based on Mash) to have variants called against. + +## About testing this process: + +Using DSL2, each module can be tested separately using a test workflow inside the process.nf file. Testing requires 3 items: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run download_references.nf -params-file test_params.yaml -profile test,docker -entry test + + +If you've used `bactopia conda activate`, you can also swap `docker` for `conda` to test with Conda. + diff --git a/modules/utilities/download_references/bin/build-containers.sh b/modules/utilities/download_references/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/utilities/download_references/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} .
+ docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/utilities/download_references/bin/check-assembly-accession.py b/modules/utilities/download_references/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/utilities/download_references/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/utilities/download_references/bin/check-fastqs.py b/modules/utilities/download_references/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/utilities/download_references/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. 
Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/utilities/download_references/bin/check-staging.py b/modules/utilities/download_references/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/utilities/download_references/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/utilities/download_references/bin/cleanup-coverage.py b/modules/utilities/download_references/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/utilities/download_references/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/modules/utilities/download_references/bin/create-tool.sh b/modules/utilities/download_references/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/utilities/download_references/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# build-containers +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguement" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/utilities/download_references/bin/gh-actions/free-disk-space.sh b/modules/utilities/download_references/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/utilities/download_references/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. 
+# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. +# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/utilities/download_references/bin/gh-actions/setup-bactopia-env.sh b/modules/utilities/download_references/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/utilities/download_references/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/utilities/download_references/bin/gh-actions/setup-docker-builds.py b/modules/utilities/download_references/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/utilities/download_references/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. 
+ --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} 
{tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = 
f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/utilities/download_references/bin/helpers/bactopia-build.py b/modules/utilities/download_references/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/utilities/download_references/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 
* retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') 
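Note on the helper scripts above: both build_conda_env() in bactopia-build.py and docker_push() in setup-docker-builds.py wrap a flaky external command in the same retry loop, attempting it up to MAX_RETRY (5) times, sleeping 30 * retry seconds between attempts, and only letting the final attempt fail hard. The standalone sketch below illustrates that retry-with-backoff pattern only; run_with_retry() and the command being retried are hypothetical stand-ins and are not part of this patch.

#! /usr/bin/env python3
"""Sketch of the retry-with-backoff loop used by build_conda_env() and docker_push().
The retried command is a hypothetical stand-in; only the control flow mirrors the helpers."""
import subprocess
import time

MAX_RETRY = 5  # same ceiling used by the helpers above


def run_with_retry(cmd):
    retry = 0
    while True:
        if retry > MAX_RETRY:
            # Final attempt: raise on failure instead of swallowing it,
            # mirroring the allow_fail switch in the original helpers.
            subprocess.run(cmd, check=True)
            return True
        if subprocess.run(cmd).returncode == 0:
            return True
        retry += 1
        print(f"Attempt {retry} failed, sleeping {30 * retry}s before retrying")
        time.sleep(30 * retry)  # back off a little longer on each retry


if __name__ == '__main__':
    # Hypothetical usage; any idempotent command works here.
    run_with_retry(['conda', '--version'])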
diff --git a/modules/utilities/download_references/bin/helpers/bactopia-citations.py b/modules/utilities/download_references/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/utilities/download_references/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. + --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/download_references/bin/helpers/bactopia-datasets.py b/modules/utilities/download_references/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/utilities/download_references/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! 
/usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. + +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). 
+ --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. + +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command 
found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + 
logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up {request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + 
logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! 
+ Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes 
found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' + ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' 
+ ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + 
logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + 
new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 
'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. (Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. 
AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. (Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' 
+ logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + 
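# create_summary() below writes summary.json to --outdir, indexing every dataset staged above (Ariba, AMR, minmer, PLSDB, and the species-specific MLST/annotation/optional folders) plus any --prodigal_tf training file. +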
create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/utilities/download_references/bin/helpers/bactopia-prepare.py b/modules/utilities/download_references/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/utilities/download_references/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. + Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. 
Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more 
than one assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}), please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than one single-end FASTQ, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/utilities/download_references/bin/helpers/bactopia-pull.py b/modules/utilities/download_references/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/utilities/download_references/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt building the Singularity image. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Singularity images. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
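(Editor's illustration, not part of the patch.) A short sketch of how the registry choice maps to the Docker source URI and the on-disk Singularity image name in the pull helper above; image_names is a hypothetical helper condensing get_docker_prefix() and the main block.

    def image_names(registry, envname, version, install_path):
        prefix = {'quay': 'quay.io', 'github': 'ghcr.io'}.get(registry, '')   # get_docker_prefix()
        docker_prefix = f'docker://{prefix}/bactopia' if prefix else 'docker://bactopia'
        img = (f'{install_path}/{prefix}-bactopia-{envname}-{version}.img' if prefix
               else f'{install_path}/bactopia-{envname}-{version}.img')
        return f'{docker_prefix}/{envname}:{version}', img

    print(image_names('quay', 'annotate_genome', '1.6.0', '/tmp/singularity'))
    # ('docker://quay.io/bactopia/annotate_genome:1.6.0',
    #  '/tmp/singularity/quay.io-bactopia-annotate_genome-1.6.0.img')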
a/modules/utilities/download_references/bin/helpers/bactopia-search.py b/modules/utilities/download_references/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/utilities/download_references/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + 
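(Editor's illustration, not part of the patch.) A trimmed-down sketch of the payload ena_search() posts for a taxon query; the field list is shortened, the library_selection clause is omitted for brevity, and the actual request is left commented out.

    ENA_URL = 'https://www.ebi.ac.uk/ena/portal/api/search'
    data = {
        'dataPortal': 'ena',
        'dccDataOnly': 'false',
        'download': 'false',
        'result': 'read_run',
        'format': 'tsv',
        'limit': 20,
        'fields': 'run_accession,instrument_platform,read_count,base_count,fastq_ftp',
        'query': ('"tax_tree(1280) AND library_source=GENOMIC AND '
                  '(library_strategy=OTHER OR library_strategy=WGS OR library_strategy=WGA)"'),
    }
    # import requests
    # requests.post(ENA_URL, headers={'Content-type': 'application/x-www-form-urlencoded'}, data=data)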
accessions = [] + filtered = {'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
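(Editor's illustration, not part of the patch.) A worked example of the mean read-length estimate used by the filter above; the counts are hypothetical.

    base_count, read_count, total_fastqs = 500_000_000, 2_500_000, 2    # a hypothetical paired-end run
    read_length = int(float(base_count) / (float(read_count) * total_fastqs))
    assert read_length == 100    # dropped when --min_read_length is greater than 100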
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
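(Editor's illustration, not part of the patch.) A worked example of the conversion applied above when --min_coverage and --genome_size are supplied together; the values are hypothetical.

    min_coverage, genome_size = 20, 2_800_000     # e.g. 20x over a ~2.8 Mbp genome
    min_base_count = min_coverage * genome_size
    assert min_base_count == 56_000_000           # runs with fewer bases are filtered out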
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/utilities/download_references/bin/helpers/bactopia-summary.py b/modules/utilities/download_references/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/utilities/download_references/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/download_references/bin/helpers/bactopia-tools.py b/modules/utilities/download_references/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/utilities/download_references/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/utilities/download_references/bin/helpers/bactopia-versions.py b/modules/utilities/download_references/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/utilities/download_references/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/utilities/download_references/bin/mask-consensus.py b/modules/utilities/download_references/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/utilities/download_references/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
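(Editor's illustration, not part of the patch.) A condensed restatement of the per-base rules applied by mask_sequence() above; mask_base is a hypothetical helper.

    def mask_base(consensus_base, depth, is_substitution, mincov=10):
        if depth >= mincov:
            return consensus_base.lower() if is_substitution else consensus_base   # substitutions are lower-cased
        return 'N' if depth else 'n'    # low coverage -> 'N', zero coverage -> 'n'

    print(mask_base('A', 42, False))    # 'A' (well covered, matches the reference)
    print(mask_base('G', 42, True))     # 'g' (well covered substitution)
    print(mask_base('T', 3, False))     # 'N' (covered, but below --mincov)
    print(mask_base('C', 0, False))     # 'n' (no coverage)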
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/utilities/download_references/bin/merge-blast-json.py b/modules/utilities/download_references/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/utilities/download_references/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/utilities/download_references/bin/mlst-blast.py b/modules/utilities/download_references/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/utilities/download_references/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/utilities/download_references/bin/select-references.py b/modules/utilities/download_references/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/utilities/download_references/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
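(Editor's illustration, not part of the patch.) A sketch of the ST lookup that closes blast_alleles() above: perfect allele calls are sorted and joined with ';' to form the key read from profile.txt. The allele numbers and ST shown here are invented.

    profile = {'arcC.3;aroE.3;glpF.1;gmk.1;pta.4;tpi.4;yqiL.3': '8'}    # built from profile.txt
    perfect_matches = ['glpF.1', 'arcC.3', 'tpi.4', 'aroE.3', 'pta.4', 'yqiL.3', 'gmk.1']
    pattern = ';'.join(sorted(perfect_matches))
    print(profile.get(pattern, 'Novel'))    # '8'; 'Novel' when every locus matched but the combination is new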
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/utilities/download_references/bin/split-coverages.py b/modules/utilities/download_references/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/utilities/download_references/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
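(Editor's illustration, not part of the patch.) A small sketch of the FTP path construction in use_http() above, using the accession from the function's own docstring.

    import re

    accession = 'GCF_001548295.1'
    acc, version = accession.split('.')
    db, digits = acc.split('_')
    path = '/'.join(re.findall('.{1,3}', digits))
    print(f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{path}/')
    # -> https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/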
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/utilities/download_references/bin/update-conda.sh b/modules/utilities/download_references/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/utilities/download_references/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/download_references/bin/update-docker.sh b/modules/utilities/download_references/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/utilities/download_references/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/utilities/download_references/bin/update-tools.sh b/modules/utilities/download_references/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/utilities/download_references/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/download_references/bin/update-version.sh b/modules/utilities/download_references/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/utilities/download_references/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/utilities/download_references/download_references.nf b/modules/utilities/download_references/download_references.nf new file mode 100644 index 000000000..a4c0e6d16 --- /dev/null +++ b/modules/utilities/download_references/download_references.nf @@ -0,0 +1,62 @@ +nextflow.enable.dsl = 2 + +process DOWNLOAD_REFERENCES { + /* + Download the nearest RefSeq genomes (based on Mash) to have variants called against. + + Exitcode 75 is due to being unable to download from NCBI (e.g. FTP down at the time) + Downloads will be attempted 300 times total before giving up. On failure to download + variants will not be called against the nearest completed genome. + */ + tag "${sample} - ${params.max_references} reference(s)" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}/variants/auto", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: 'mash-dist.txt' + + input: + tuple val(sample), val(single_end), path(fq), path(sample_sketch) + path(refseq_sketch) + + output: + tuple val(sample), val(single_end), path("fastqs/${sample}*.fastq.gz"), path("genbank/*.gbk"), emit:CALL_VARIANTS_AUTO, optional: true + path("mash-dist.txt") + file "${task.process}/*" optional true + + when: + REFSEQ_SKETCH_FOUND == true + + shell: + no_cache = params.no_cache ? '-N' : '' + tie_break = params.random_tie_break ? 
"--random_tie_break" : "" + total = params.max_references + template "download_references.sh" + + stub: + """ + mkdir fastqs + mkdir genbank + mkdir ${task.process} + touch fastqs/${sample}.fastq.gz + touch genbank/*.gbk + touch ${task.process}/${sample} + touch mash-dist.txt + """ +} + +//############### +//Module testing +//############### + +workflow test { + TEST_PARAMS_CH = Channel.of([ + params.sample, + params.single_end, + path(params.fq), + path(params.sample_sketch) + ]) + TEST_PARAMS_CH2 = Channel.of( + path(params.refseq_sketch) + ) + download_references(TEST_PARAMS_CH,TEST_PARAMS_CH2) +} + diff --git a/modules/utilities/download_references/nextflow.config b/modules/utilities/download_references/nextflow.config new file mode 100644 index 000000000..1d90451d4 --- /dev/null +++ b/modules/utilities/download_references/nextflow.config @@ -0,0 +1,49 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' + mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + conda { + process { + withName: download_references { + conda = "${baseDir}/../../../conda/envs/download_references-1.7.x"} + } + } + + docker { + process { + withName: download_references { + container = "ghcr.io/bactopia/download_references:1.6.0"} + + } + } + + test { + process.ext.template = {"${task.process}.sh"} + process { + echo = true + withName: download_references { + cpus = 2 + queue = 'long' + } + + } + env { + REFSEQ_SKETCH_FOUND = true + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "fakse" + run_type = "fastqs" + } + + } +} diff --git a/modules/utilities/download_references/templates/download_references.sh b/modules/utilities/download_references/templates/download_references.sh new file mode 100644 index 000000000..708164a5f --- /dev/null +++ b/modules/utilities/download_references/templates/download_references.sh @@ -0,0 +1,84 @@ +#!/bin/bash +set -e +set -u +LOG_DIR="!{task.process}" +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions + +# Print captured STDERR incase of exit +function print_stderr { + cat .command.err 1>&2 + ls ${LOG_DIR}/ | grep ".err" | xargs -I {} cat ${LOG_DIR}/{} 1>&2 +} +trap print_stderr EXIT + +# Verify AWS files were staged +if [[ ! 
-L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --extra !{sample_sketch} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} --extra !{sample_sketch} + fi +fi + +# Get Mash distance +echo "# Mash Version" >> ${LOG_DIR}/!{task.process}.versions +mash --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 +mash dist -t !{sample_sketch} !{refseq_sketch} | grep -v "query" | sort -k 2,2 > distances.txt + +# Pick genomes to download +printf "accession\tdistance\tlatest_accession\tupdated\n" > mash-dist.txt +select-references.py distances.txt !{total} !{tie_break} >> mash-dist.txt + +# Pick only latest accessions +grep -v distance mash-dist.txt | cut -f3 > download-list.txt + +# Download genomes +echo "# ncbi-genome-download Version" >> ${LOG_DIR}/!{task.process}.versions +ncbi-genome-download --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 +ncbi-genome-download bacteria -l complete -o ./ -F genbank -p !{task.cpus} -A download-list.txt -r !{params.max_retry} !{no_cache} > ${LOG_DIR}/ncbi-genome-download.out 2> ${LOG_DIR}/ncbi-genome-download.err + +# Move and uncompress genomes +mkdir genbank_temp +find refseq -name "*.gbff.gz" | xargs -I {} mv {} genbank_temp/ +rename 's/(GC[AF]_\d+).*/$1/' genbank_temp/* +mkdir genbank +ls genbank_temp/ | xargs -I {} sh -c 'gzip -cd genbank_temp/{} > genbank/!{sample}-{}.gbk' +rm -rf genbank_temp + +if [ "!{params.keep_all_files}" == "false" ]; then + # Remove intermediate GenBank files + rm -rf refseq/ +fi + +# pass the FASTQs along +mkdir -p fastqs +if [[ -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "false" ]; then + # Paired-End Reads + ln -s `readlink !{fq[0]}` fastqs/!{sample}_R1.fastq.gz + ln -s `readlink !{fq[1]}` fastqs/!{sample}_R2.fastq.gz + else + # Single-End Reads + ln -s `readlink !{fq[0]}` fastqs/!{sample}.fastq.gz + fi +else + if [ "!{single_end}" == "false" ]; then + # Paired-End Reads + cp !{fq[0]} fastqs/!{sample}_R1.fastq.gz + cp !{fq[1]} fastqs/!{sample}_R2.fastq.gz + else + # Single-End Reads + cp !{fq[0]} fastqs/!{sample}.fastq.gz + fi +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/utilities/download_references/test_params.yaml b/modules/utilities/download_references/test_params.yaml new file mode 100644 index 000000000..710903cae --- /dev/null +++ b/modules/utilities/download_references/test_params.yaml @@ -0,0 +1,47 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +single_end: + false + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" + +sample_sketch: + "test_data/SRR2838702-k31.msh" + +refseq_sketch: + "test_data/refseq-genomes.msh" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + false + +max_references: + 1 + +no_cache: + false + +random_tie_break: + false + +max_retry: + 3 + +keep_all_files: + true + +skip_logs: + false diff --git a/modules/utilities/fastq_status/README.md b/modules/utilities/fastq_status/README.md new file mode 100644 index 000000000..ce5921408 --- /dev/null +++ b/modules/utilities/fastq_status/README.md @@ -0,0 +1,17 @@ +# fastq_status process testing: + +This process Determine if FASTQs are PE or SE, and if they meet minimum basepair/read counts. 
+ +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run fastq_status.nf -profile test,docker -params-file test_params.yaml -entry test + + +if you've used `bactopia conda activate` you can also trade `docker` by `conda` to test with conda. \ No newline at end of file diff --git a/modules/utilities/fastq_status/bin/build-containers.sh b/modules/utilities/fastq_status/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/utilities/fastq_status/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} 
"tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/utilities/fastq_status/bin/check-assembly-accession.py b/modules/utilities/fastq_status/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/utilities/fastq_status/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/utilities/fastq_status/bin/check-fastqs.py b/modules/utilities/fastq_status/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/utilities/fastq_status/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. 
This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/utilities/fastq_status/bin/check-staging.py b/modules/utilities/fastq_status/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/utilities/fastq_status/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. 
+This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/utilities/fastq_status/bin/cleanup-coverage.py b/modules/utilities/fastq_status/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/utilities/fastq_status/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Output from genomeBedCoverage') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=<ID={accession},length={vals["length"]}>') + for cov in vals['positions']: + print(cov) diff --git a/modules/utilities/fastq_status/bin/create-tool.sh b/modules/utilities/fastq_status/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/utilities/fastq_status/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# create-tool +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguments" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/utilities/fastq_status/bin/gh-actions/free-disk-space.sh b/modules/utilities/fastq_status/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/utilities/fastq_status/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures.
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/utilities/fastq_status/bin/gh-actions/setup-bactopia-env.sh b/modules/utilities/fastq_status/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/utilities/fastq_status/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/utilities/fastq_status/bin/gh-actions/setup-docker-builds.py b/modules/utilities/fastq_status/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/utilities/fastq_status/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. 
+ --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + 
logging.info(f'Building on {image}') + execute(f'docker build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not 
args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/utilities/fastq_status/bin/helpers/bactopia-build.py b/modules/utilities/fastq_status/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/utilities/fastq_status/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, 
directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + 
envbuilt_file = f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/utilities/fastq_status/bin/helpers/bactopia-citations.py b/modules/utilities/fastq_status/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/utilities/fastq_status/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/fastq_status/bin/helpers/bactopia-datasets.py b/modules/utilities/fastq_status/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/utilities/fastq_status/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
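                # Use an ncbi-genome-download dry run to list candidate RefSeq accessions
+                # for the species (or whole genus, when --include_genus is used); the list
+                # is optionally subsampled with --limit and written to accessions.txt
+                # before the actual download further below.
+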
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
+                                        )
+
+            total_genome = len(genome_sizes)
+            if not skip_genome_size:
+                median_genome = int(median(genome_sizes))
+                logging.info(
+                    f'Median genome size: {median_genome} (n={total_genome})'
+                )
+            cdhit_cds = f'{prokka_dir}/proteins.faa'
+            logging.info(f'Running CD-HIT on {count} proteins')
+            g = 0 if fast_cluster else 1
+            execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} '
+                     f'-g {g} -c {identity} -T {cpus} -M {max_memory}'))
+
+            # Make sketch/signatures
+            execute(
+                f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn',
+                directory=minmer_dir
+            )
+
+            # Finish up
+            with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh:
+                gs_dict = {
+                    'min': 0, 'median': 0, 'mean': 0, 'max': 0, 'total': 0,
+                    'description': 'No available completed genomes.'
+                }
+                if not skip_genome_size:
+                    gs_dict = {
+                        'min': min(genome_sizes),
+                        'median': int(median(genome_sizes)),
+                        'mean': int(mean(genome_sizes)),
+                        'max': max(genome_sizes),
+                        'total': total_genome,
+                        'description': (
+                            f'Genome size values are based on {total_genome} '
+                            'completed genomes (RefSeq).'
+                        )
+                    }
+                json.dump(gs_dict, genome_size_fh, indent=4)
+            execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt',
+                    directory=prokka_dir)
+            execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt',
+                    directory=prokka_dir)
+            execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt',
+                    directory=prokka_dir)
+            execute(
+                f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt',
+                directory=prokka_dir
+            )
+            execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt',
+                    directory=minmer_dir)
+
+            # Clean up
+            if not keep_files:
+                execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/')
+
+    else:
+        logging.info("No valid species to setup, skipping")
+
+
+def setup_amr(outdir, force=False):
+    """Download the latest antimicrobial resistance datasets."""
+    datasets = ['amrfinder']
+    amr_dir = f'{outdir}/antimicrobial-resistance'
+    update_timestamp = False
+    execute(f'mkdir -p {amr_dir}')
+
+    for dataset in datasets:
+        dataset_file = f'{amr_dir}/{dataset}.tar.gz'
+        if os.path.exists(dataset_file):
+            if force:
+                logging.info(f'--force, removing existing {dataset_file} setup')
+                execute(f'rm -f {dataset_file}')
+                update_timestamp = True
+            else:
+                logging.info(f'{dataset_file} exists, skipping')
+                continue
+
+        if dataset == 'amrfinder':
+            logging.info(f'Setting up latest AMRFinder+ database')
+            prefix = 'amrfinderdb'
+            execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir)
+            execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir)
+            execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir)
+            latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest')
+            execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir)
+            execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir)
+            execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir)
+            execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir)
+            logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz')
+
+
+def setup_minmer(outdir, force=False):
+    """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets."""
+    datasets = {
+        # Last updated: 2019-03-04
+        'genbank-k21.json.gz': 'https://osf.io/d7rv8/download',
+        'genbank-k31.json.gz': 'https://osf.io/4f8n3/download',
+        'genbank-k51.json.gz': 'https://osf.io/nemkw/download',
+        'refseq-k21-s1000.msh': (
+            'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh'
+        )
+    }
+
+    minmer_dir = f'{outdir}/minmer'
+    update_timestamp = False
+    if force:
+
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/utilities/fastq_status/bin/helpers/bactopia-prepare.py b/modules/utilities/fastq_status/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/utilities/fastq_status/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. 
+ Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path'
+    )
+
+    parser.add_argument('--version', action='version',
+                        version=f'{PROGRAM} {VERSION}')
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(0)
+
+    args = parser.parse_args()
+
+    # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob
+    abspath = os.path.abspath(args.path)
+    SAMPLES = {}
+
+    # Match FASTQs
+    for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive):
+        fastq_name = fastq.name.replace(args.fastq_ext, "")
+        # Split the fastq file name on separator
+        # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE)
+        # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE)
+        split_vals = fastq_name.rsplit(args.fastq_seperator, 1)
+        sample_name = split_vals[0]
+        if sample_name not in SAMPLES:
+            SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []}
+
+        if len(split_vals) == 1:
+            # single-end
+            SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix))
+        else:
+            # paired-end
+            pe1 = re.compile(args.pe1_pattern)
+            pe2 = re.compile(args.pe2_pattern)
+            if pe1.match(split_vals[1]):
+                SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix))
+            elif pe2.match(split_vals[1]):
+                SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix))
+            else:
+                print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr)
+                print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr)
+                print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr)
+                sys.exit(1)
+
+    # Match assemblies
+    for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive):
+        sample_name = os.path.basename(assembly).replace(args.assembly_ext, "")
+        # Strip the assembly extension to get the sample name
+        if sample_name not in SAMPLES:
+            SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []}
+        SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix))
+
+    FOFN = []
+    for sample, vals in sorted(SAMPLES.items()):
+        r1_reads = vals['pe']['r1']
+        r2_reads = vals['pe']['r2']
+        se_reads = vals['se']
+        assembly = vals['assembly']
+        errors = []
+        is_single_end = False
+        multiple_read_sets = False
+        pe_count = len(r1_reads) + len(r2_reads)
+
+        # Validate everything
+        if len(assembly) > 1:
+            # Can't have multiple assemblies for the same sample
+            errors.append(f'ERROR: "{sample}" cannot have more than one assembly FASTA, please check.')
+        elif len(assembly) == 1 and (pe_count or len(se_reads)):
+            # Can't have an assembly and reads for a sample
+            errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.')
+
+        if len(r1_reads) != len(r2_reads):
+            # PE reads must be a pair
+            errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}), please check.')
+        elif pe_count > 2:
+            # PE reads must be a pair
+            if args.merge:
+                multiple_read_sets = True
+            else:
+                errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.')
+
+        if args.long_reads:
+            if not pe_count and len(se_reads):
+                # Long reads must also have short PE reads
+                print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr)
+                is_single_end = True
+
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/utilities/fastq_status/bin/helpers/bactopia-pull.py b/modules/utilities/fastq_status/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/utilities/fastq_status/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/utilities/fastq_status/bin/helpers/bactopia-search.py b/modules/utilities/fastq_status/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/utilities/fastq_status/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + 
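    # Only Illumina runs are considered; track why candidates are excluded:
+    # missing FASTQ files ('technical'), estimated mean read length
+    # (base_count / (read_count * number of FASTQ files)) below --min_read_length,
+    # or total base_count below --min_base_count.
+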
filtered = {'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/utilities/fastq_status/bin/helpers/bactopia-summary.py b/modules/utilities/fastq_status/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/utilities/fastq_status/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/fastq_status/bin/helpers/bactopia-tools.py b/modules/utilities/fastq_status/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/utilities/fastq_status/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/utilities/fastq_status/bin/helpers/bactopia-versions.py b/modules/utilities/fastq_status/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/utilities/fastq_status/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/utilities/fastq_status/bin/mask-consensus.py b/modules/utilities/fastq_status/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/utilities/fastq_status/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+  --version     show program's version number and exit
+"""
+PROGRAM = "mask-consensus"
+VERSION = "1.6.0"
+import sys
+
+
+def read_coverage(coverage):
+    """Read the per-base coverage input."""
+    import re
+    accession = None
+    length = None
+    first_line = True
+    coverages = {}
+    with open(coverage, 'rt') as coverage_fh:
+        for line in coverage_fh:
+            line = line.rstrip()
+            if line.startswith('##'):
+                # ##contig=<ID=...,length=...>
+                contig = re.search(r'contig=<ID=(.*),length=([0-9]+)>', line)
+                if contig:
+                    accession = contig.group(1)
+                    length = contig.group(2)
+                    coverages[accession] = {'length':int(length), 'positions': []}
+                else:
+                    print(f'{line} is an unexpected format.', file=sys.stderr)
+                    sys.exit(1)
+            else:
+                if line:
+                    coverages[accession]['positions'].append(int(line))
+
+    for accession, vals in coverages.items():
+        if len(vals['positions']) != vals['length']:
+            print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr)
+            sys.exit(1)
+
+    return coverages
+
+
+def read_vcf(vcf):
+    """Get positions with a substitution."""
+    subs = {}
+    with open(vcf, 'rt') as vcf_fh:
+        for line in vcf_fh:
+            if not line.startswith("#"):
+                line = line.split('\t')
+                # 0 = accession, 1 = position
+                if line[0] not in subs:
+                    subs[line[0]] = {}
+                subs[line[0]][line[1]] = True
+    return subs
+
+
+def read_fasta(fasta):
+    """Parse the input FASTA file."""
+    from Bio import SeqIO
+    seqs = {}
+    with open(fasta, 'r') as fasta_fh:
+        for record in SeqIO.parse(fasta_fh,'fasta'):
+            seqs[record.name] = str(record.seq)
+    return seqs
+
+
+def mask_sequence(sequence, coverages, subs, mincov):
+    """Mask positions with low or no coverage in the input FASTA."""
+    masked_seqs = {}
+
+    for accession, vals in coverages.items():
+        bases = []
+        coverage = vals['positions']
+        for i, cov in enumerate(coverage):
+            if cov >= mincov:
+                # Passes
+                if accession in subs:
+                    if str(i+1) in subs[accession]:
+                        # Substitution
+                        bases.append(sequence[accession][i].lower())
+                    else:
+                        # Same as reference
+                        bases.append(sequence[accession][i])
+                else:
+                    # No SNPs, Same as reference
+                    bases.append(sequence[accession][i])
+            elif cov:
+                # Low coverage
+                bases.append("N")
+            else:
+                # 0 coverage
+                bases.append('n')
+
+        if len(bases) != len(sequence[accession]):
+            print(f'Masked sequence ({len(bases)}) for {accession} not expected length ({len(sequence[accession])}).',
+                  file=sys.stderr)
+            sys.exit(1)
+        else:
+            masked_seqs[accession] = bases
+
+    return masked_seqs
+
+
+def format_header(sample, reference, accession, length):
+    """Return a newly formatted header."""
+    title = f'Pseudo-seq with called substitutions and low coverage masked'
+    return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]'
+
+
+def chunks(s, n):
+    """
+    Produce `n`-character chunks from `s`.
+    https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters
+    """
+    for start in range(0, len(s), n):
+        yield s[start:start+n]
+
+
+if __name__ == '__main__':
+    import argparse as ap
+    import sys
+
+    parser = ap.ArgumentParser(
+        prog=PROGRAM,
+        conflict_handler='resolve',
+        description=(
+            f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.'
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/utilities/fastq_status/bin/merge-blast-json.py b/modules/utilities/fastq_status/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/utilities/fastq_status/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/utilities/fastq_status/bin/mlst-blast.py b/modules/utilities/fastq_status/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/utilities/fastq_status/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/utilities/fastq_status/bin/select-references.py b/modules/utilities/fastq_status/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/utilities/fastq_status/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/utilities/fastq_status/bin/split-coverages.py b/modules/utilities/fastq_status/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/utilities/fastq_status/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
/usr/bin/env python3
+"""
+"""
+PROGRAM = "split-coverages"
+VERSION = "1.6.0"
+
+if __name__ == '__main__':
+    import argparse as ap
+    import os
+    import sys
+    parser = ap.ArgumentParser(
+        prog=PROGRAM,
+        conflict_handler='resolve',
+        description=(
+            f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry'
+        )
+    )
+
+    parser.add_argument(
+        'mapping', metavar="FILE", type=str,
+        help='Tab-delimited file used to map entry names to original fasta file.'
+    )
+    parser.add_argument(
+        'coverage', metavar="FILE", type=str,
+        help='genomeCoverageBed output file'
+    )
+    parser.add_argument(
+        '--outdir', metavar="STR", type=str, default='coverages',
+        help='Directory to output split coverages into. (Default: coverages)'
+    )
+    parser.add_argument('--version', action='version',
+                        version=f'{PROGRAM} {VERSION}')
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(0)
+
+    args = parser.parse_args()
+
+    mappings = {}
+    with open(args.mapping, 'rt') as mapping_fh:
+        for line in mapping_fh:
+            fasta, entry = line.rstrip().split('\t')
+            mappings[entry] = fasta
+
+    coverages = {}
+    with open(args.coverage, 'rt') as coverage_fh:
+        for line in coverage_fh:
+            entry, position, depth = line.rstrip().split('\t')
+            if mappings[entry] not in coverages:
+                coverages[mappings[entry]] = {}
+
+            if entry not in coverages[mappings[entry]]:
+                coverages[mappings[entry]][entry] = []
+
+            coverages[mappings[entry]][entry].append(depth)
+
+    if not os.path.exists(args.outdir):
+        os.makedirs(args.outdir)
+
+    for fasta in coverages:
+        with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out:
+            total_entries = len(coverages[fasta])
+            coverage_out.write(f'##total={total_entries}\n')
+            for entry, depths in coverages[fasta].items():
+                coverage_out.write(f'##contig=<ID={entry},length={len(depths)}>\n')
+                for depth in depths:
+                    coverage_out.write(f'{depth}\n')
+ 
\ No newline at end of file
diff --git a/modules/utilities/fastq_status/bin/update-conda.sh b/modules/utilities/fastq_status/bin/update-conda.sh
new file mode 100755
index 000000000..5ef7f31c4
--- /dev/null
+++ b/modules/utilities/fastq_status/bin/update-conda.sh
@@ -0,0 +1,67 @@
+#! /bin/bash
+# Updates the conda environment yamls to bump to latest software versions.
+set -x
+set -e
+if [[ $# == 0 ]]; then
+    echo ""
+    echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC"
+    echo ""
+    echo "Example Command"
+    echo "update-conda.sh /home/bactopia/bactopia 1.0.0"
+    echo ""
+    exit
+fi
+
+
+CONDA_DIR=$1/conda
+DOCKER_DIR=$1/containers
+VERSION=$2
+IS_MAC=0
+if [ "$3" == "1" ]; then
+    echo "Creating Mac OS X yamls"
+    CONDA_DIR="${CONDA_DIR}/mac"
+    IS_MAC=1
+else
+    echo "Creating Linux yamls"
+    CONDA_DIR="${CONDA_DIR}/linux"
+fi
+
+function update_environment {
+    # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac
+    echo "Working on ${1}"
+
+    if [ "$6" == 1 ]; then
+        # Mac OS
+        # Have to replace Mac versions of some programs (date, sed, etc...)
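+        # GNU coreutils and sed are appended to the package list below so the
+        # Linux-style versions of those tools are available in the Mac environment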
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/fastq_status/bin/update-docker.sh b/modules/utilities/fastq_status/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/utilities/fastq_status/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
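+    # Illustrative call for a single process container (the loops at the bottom of
+    # this script generate these automatically):
+    #   docker_build containers/docker/qc_reads.Dockerfile bactopia/qc_reads:1.6.x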
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/utilities/fastq_status/bin/update-tools.sh b/modules/utilities/fastq_status/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/utilities/fastq_status/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
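+        # The export below drops the machine-specific "prefix:" line and inserts a
+        # "version:" entry (from the VERSION argument) above "channels:" before the md5 is recorded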
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/fastq_status/bin/update-version.sh b/modules/utilities/fastq_status/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/utilities/fastq_status/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/utilities/fastq_status/fastq_status.nf b/modules/utilities/fastq_status/fastq_status.nf new file mode 100644 index 000000000..0121b447d --- /dev/null +++ b/modules/utilities/fastq_status/fastq_status.nf @@ -0,0 +1,47 @@ +nextflow.enable.dsl = 2 + +process FASTQ_STATUS { + /* Determine if FASTQs are PE or SE, and if they meet minimum basepair/read counts. */ + publishDir "${params.outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${params.outdir}/${sample}", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: '*.txt' + + input: + tuple val(sample), val(sample_type), val(single_end), path(fq), path(extra) + output: + file "*-error.txt" optional true + tuple val(sample), val(sample_type), val(single_end), + path("fastqs/${sample}*.fastq.gz"), path(extra),emit: ESTIMATE_GENOME_SIZE, optional: true + file "${task.process}/*" optional true + + shell: + single_end = fq[1] == null ? true : false + qin = sample_type.startsWith('assembly') ? 'qin=33' : 'qin=auto' + + template "fastq_status.sh" + + stub: + """ + mkdir ${task.process} + mkdir fastqs + touch ${sample}-error.txt + touch fastqs/${sample}.fastq.gz + touch ${task.process}/${sample} + """ +} + +//############### +//Module testing +//############### + +workflow test{ + + TEST_PARAMS_CH = Channel.of([ + params.sample, + params.sample_type, + params.single_end, + path(params.fq), + path(params.extra) + ]) + + fastq_status(TEST_PARAMS_CH) +} diff --git a/modules/utilities/fastq_status/nextflow.config b/modules/utilities/fastq_status/nextflow.config new file mode 100644 index 000000000..5cf4b4e48 --- /dev/null +++ b/modules/utilities/fastq_status/nextflow.config @@ -0,0 +1,49 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' 
+ mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + + conda { + process { + withName: fastq_status { + conda = "${baseDir}/../../../conda/envs/qc_reads-1.6.x"} + } + } + + docker { + process { + withName: fastq_status { + container = "ghcr.io/bactopia/qc_reads:1.6.0"} + + } + } + + test { + process { + echo = true + withName: fastq_status { + cpus = 1 + memory = "2 GB" + queue = 'long' + } + } + + env { + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "test" + run_type = "fastqs" + } + + } +} \ No newline at end of file diff --git a/modules/utilities/fastq_status/templates/fastq_status.sh b/modules/utilities/fastq_status/templates/fastq_status.sh new file mode 100644 index 000000000..708ac06c7 --- /dev/null +++ b/modules/utilities/fastq_status/templates/fastq_status.sh @@ -0,0 +1,80 @@ +#!/bin/bash +set -e +set -u +LOG_DIR="!{task.process}" +ERROR=0 +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions + +# Verify AWS files were staged +if [[ ! -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --extra !{extra} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} --extra !{extra} + fi +fi + +if [ "!{params.skip_fastq_check}" == "false" ]; then + # Not completely sure about the inputs, so make sure they meet minimum requirements + echo "# fastq-scan Version" >> ${LOG_DIR}/!{task.process}.versions + fastq-scan -v >> ${LOG_DIR}/!{task.process}.versions 2>&1 + + # Check paired-end reads have same read counts + gzip -cd !{fq[0]} | fastq-scan > r1.json + OPTS="--sample !{sample} --min_basepairs !{params.min_basepairs} --min_reads !{params.min_reads} --min_proportion !{params.min_proportion}" + if [ "!{single_end}" == "false" ]; then + if ! reformat.sh in1=!{fq[0]} in2=!{fq[1]} !{qin} out=/dev/null 2> !{sample}-paired-end-error.txt; then + ERROR=1 + echo "!{sample} FASTQs contains an error. Please check the input FASTQs. + Further analysis is discontinued." | \ + sed 's/^\s*//' >> !{sample}-paired-end-error.txt + else + rm -f !{sample}-paired-end-error.txt + fi + gzip -cd !{fq[1]} | fastq-scan > r2.json + + if ! check-fastqs.py --fq1 r1.json --fq2 r2.json ${OPTS}; then + ERROR=1 + fi + rm r1.json r2.json + else + if ! 
check-fastqs.py --fq1 r1.json ${OPTS}; then + ERROR=1 + fi + rm r1.json + fi +fi + +if [ "${ERROR}" -eq "0" ]; then + mkdir -p fastqs + if [[ -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "false" ]; then + # Paired-End Reads + ln -s `readlink !{fq[0]}` fastqs/!{sample}_R1.fastq.gz + ln -s `readlink !{fq[1]}` fastqs/!{sample}_R2.fastq.gz + else + # Single-End Reads + ln -s `readlink !{fq[0]}` fastqs/!{sample}.fastq.gz + fi + else + if [ "!{single_end}" == "false" ]; then + # Paired-End Reads + cp !{fq[0]} fastqs/!{sample}_R1.fastq.gz + cp !{fq[1]} fastqs/!{sample}_R2.fastq.gz + else + # Single-End Reads + cp !{fq[0]} fastqs/!{sample}.fastq.gz + fi + fi +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/utilities/fastq_status/test_params.yaml b/modules/utilities/fastq_status/test_params.yaml new file mode 100644 index 000000000..30c6aa94c --- /dev/null +++ b/modules/utilities/fastq_status/test_params.yaml @@ -0,0 +1,62 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +sample_type: + "paired-end" + +single_end: + "false" + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" +extra: + "test_data/empty.fna.gz" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + "some_value" + +no_cache: + " " + +use_ena: + " " + +coverage: + "1" + +max_retry: + " " + +sampleseed: + " " + +skip_logs: + " " + +skip_fastq_check: + false + +min_basepairs: + '2241820' + +min_reads: + '7472' + +min_proportion: + 0.5 + + + + diff --git a/modules/utilities/gather_fastqs/README.md b/modules/utilities/gather_fastqs/README.md new file mode 100644 index 000000000..54efae86b --- /dev/null +++ b/modules/utilities/gather_fastqs/README.md @@ -0,0 +1,17 @@ +# gather_fastqs process testing: + +This process handles the input files into channels for other process in the workflow. + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run gather_fastqs.nf -params-file test_params.yaml -profile test,docker -entry test + + +if you've used `bactopia conda activate` you can also trade `docker` by conda to test with conda. \ No newline at end of file diff --git a/modules/utilities/gather_fastqs/bin/build-containers.sh b/modules/utilities/gather_fastqs/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . 
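+    # Example of how this function is called further down (illustrative; the third
+    # argument is optional and, when given, also tags and pushes the image as latest):
+    #   docker_build Dockerfile bactopia/bactopia:1.6.x bactopia/bactopia:latest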
+ docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/utilities/gather_fastqs/bin/check-assembly-accession.py b/modules/utilities/gather_fastqs/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/utilities/gather_fastqs/bin/check-fastqs.py b/modules/utilities/gather_fastqs/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. 
Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/utilities/gather_fastqs/bin/check-staging.py b/modules/utilities/gather_fastqs/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
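Editor's note: to make the thresholds in check-fastqs.py above concrete, here is a small sketch of the paired-end proportion check it applies to fastq-scan output. The read and basepair counts are fabricated, and only the qc_stats fields the script actually reads (read_total, total_bp) are assumed.

import json

r1 = json.loads('{"qc_stats": {"read_total": 7500, "total_bp": 2250000}}')  # fabricated values
r2 = json.loads('{"qc_stats": {"read_total": 7500, "total_bp": 2100000}}')

bp1, bp2 = r1["qc_stats"]["total_bp"], r2["qc_stats"]["total_bp"]
proportion = min(bp1, bp2) / max(bp1, bp2)  # same smaller/larger ratio as check_basepairs()
if proportion < 0.5:  # 0.5 is the min_proportion used in the test params above
    print("R1/R2 share too few basepairs; the workflow would stop here.")
else:
    print(f"Proportion {proportion:.4f} passes the 0.5 threshold.")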
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/utilities/gather_fastqs/bin/cleanup-coverage.py b/modules/utilities/gather_fastqs/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
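Editor's note: check-staging.py above signals which input failed to stage through its exit code (80 when a paired-end run has no R2, 81/82 for unstaged FASTQs, 90/91/92 for the extra, genome-size, and assembly files), so a wrapper can report the failure directly. A usage sketch assuming check-staging.py is on PATH (it ships in the module's bin/); the file names are placeholders.

import subprocess

REASONS = {
    80: "paired-end run but no R2 provided",
    81: "R1/SE FASTQ not staged",
    82: "R2 FASTQ not staged",
    90: "extra file not staged",
    91: "genome-size file not staged",
    92: "assembly not staged",
}

result = subprocess.run(["check-staging.py", "--fq1", "sample_R1.fastq.gz",
                         "--fq2", "sample_R2.fastq.gz"])
if result.returncode != 0:
    print(f"Staging failed: {REASONS.get(result.returncode, 'unknown error')}")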
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/modules/utilities/gather_fastqs/bin/create-tool.sh b/modules/utilities/gather_fastqs/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# build-containers +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguement" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/utilities/gather_fastqs/bin/gh-actions/free-disk-space.sh b/modules/utilities/gather_fastqs/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. 
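Editor's note: as a reading aid for cleanup-coverage.py above, this is a rough sketch of the per-base coverage layout it walks: one ##contig header per replicon followed by one depth value per position. Treating the header as ##contig=<ID=...,length=...> is an assumption inferred from how read_coverage() extracts an accession and a length from that line; the values are invented.

from io import StringIO

# Assumed input layout (values invented): a contig header, then one depth per base.
example = StringIO(
    "##contig=<ID=contig1,length=5>\n"
    "10\n12\n11\n9\n10\n"
)

positions = []
for line in example:
    line = line.rstrip()
    if line.startswith("##"):
        expected_length = int(line.split("length=")[1].rstrip(">"))
    else:
        positions.append(int(line))

print(len(positions) == expected_length)  # the same sanity check read_coverage() performs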
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/utilities/gather_fastqs/bin/gh-actions/setup-bactopia-env.sh b/modules/utilities/gather_fastqs/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/utilities/gather_fastqs/bin/gh-actions/setup-docker-builds.py b/modules/utilities/gather_fastqs/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. 
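Editor's note: several scripts above derive a "series" tag by replacing the patch field of the release version with x (the bash ${VERSION%.*}.x idiom, or major.minor.x in the Python helpers). A one-line equivalent for reference:

version = "1.6.0"
container_version = ".".join(version.split(".")[:2]) + ".x"
print(container_version)  # -> 1.6.x, the tag used for containers and shared data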
+ --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + 
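Editor's note: the retag-or-rebuild decision in setup-docker-builds.py above reduces to comparing a local Conda md5 against the conda.md5 label on the previously published image. A condensed sketch; the image name and md5 path are placeholders, and skopeo is assumed to be installed, as in the original.

import json
import subprocess

def needs_rebuild(md5_file, previous_image):
    with open(md5_file) as fh:
        current = fh.readline().strip()
    inspect = subprocess.run(
        ["skopeo", "inspect", f"docker://docker.io/{previous_image}"],
        capture_output=True, text=True, check=True,
    )
    previous = json.loads(inspect.stdout).get("Labels", {}).get("conda.md5")
    return previous != current  # only rebuild when the Conda environment changed

# needs_rebuild("conda/linux/qc_reads.md5", "bactopia/qc_reads:1.5.6")  # hypothetical names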
logging.info(f'Building on {image}') + execute(f'docker build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not 
args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/utilities/gather_fastqs/bin/helpers/bactopia-build.py b/modules/utilities/gather_fastqs/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, 
directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + 
envbuilt_file = f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/utilities/gather_fastqs/bin/helpers/bactopia-citations.py b/modules/utilities/gather_fastqs/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
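Editor's note: the Conda environment builds in bactopia-build.py above retry a failing "conda env create" with an increasing pause before giving up. The same pattern in a standalone sketch; the command and paths are illustrative only.

import subprocess
import time

def run_with_retry(cmd, max_retry=5):
    for attempt in range(max_retry + 1):
        try:
            subprocess.run(cmd, shell=True, check=True)
            return True
        except subprocess.CalledProcessError:
            if attempt == max_retry:
                raise
            time.sleep(30 * (attempt + 1))  # back off a little longer after each failure

# run_with_retry("conda env create -f qc_reads.yml --prefix ./qc_reads-1.6.x")  # hypothetical paths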
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/gather_fastqs/bin/helpers/bactopia-datasets.py b/modules/utilities/gather_fastqs/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
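Editor's note: bactopia-citations.py above expects a simple three-column, tab-delimited layout (name, type, citation) with a header row beginning with "name". A self-contained sketch of that parsing; the rows are made up for illustration.

rows = [
    "name\ttype\tcitation",  # header row is skipped, as in validate_args()
    "ExampleTool\tsoftware\tDoe J et al. ExampleTool, a placeholder citation (2020).",
]

citations = {}
for line in rows:
    if line.startswith("name"):
        continue
    name, ref_type, citation = line.split("\t")
    citations.setdefault(ref_type, []).append({"name": name, "citation": citation})

print(citations)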
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
+ ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean': 0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(mean(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' + ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: +
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/utilities/gather_fastqs/bin/helpers/bactopia-prepare.py b/modules/utilities/gather_fastqs/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. 
Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. + Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + 
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/utilities/gather_fastqs/bin/helpers/bactopia-pull.py b/modules/utilities/gather_fastqs/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/utilities/gather_fastqs/bin/helpers/bactopia-search.py b/modules/utilities/gather_fastqs/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + 
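+    # Only ILLUMINA records that actually have FASTQ files are kept. When a
+    # read-length or base-count filter is requested, the mean read length is
+    # estimated from the ENA metadata as:
+    #     base_count / (read_count * number_of_fastq_files)
+    # e.g. base_count=1,000,000,000 with read_count=2,500,000 and two FASTQ
+    # files gives 1e9 / (2.5e6 * 2) = 200 bp (illustrative numbers only).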
filtered = {'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
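+        # Results and passing Illumina accessions are unioned with set() across
+        # queries, so overlapping queries (e.g. a BioProject and one of its own
+        # runs) are only written once to the result and accession files.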
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/utilities/gather_fastqs/bin/helpers/bactopia-summary.py b/modules/utilities/gather_fastqs/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/gather_fastqs/bin/helpers/bactopia-tools.py b/modules/utilities/gather_fastqs/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/utilities/gather_fastqs/bin/helpers/bactopia-versions.py b/modules/utilities/gather_fastqs/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/utilities/gather_fastqs/bin/mask-consensus.py b/modules/utilities/gather_fastqs/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # e.g. ##contig=<ID=accession,length=1234> (pattern reconstructed from the ID/length fields parsed below) + contig = re.search(r'contig=<ID=(.*),length=([0-9]+)>', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)}) for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.'
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/utilities/gather_fastqs/bin/merge-blast-json.py b/modules/utilities/gather_fastqs/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/utilities/gather_fastqs/bin/mlst-blast.py b/modules/utilities/gather_fastqs/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/utilities/gather_fastqs/bin/select-references.py b/modules/utilities/gather_fastqs/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse an accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select a random genome on matching Mash distances. ' + '(Default: Earliest accession)' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it is no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/utilities/gather_fastqs/bin/split-coverages.py b/modules/utilities/gather_fastqs/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/split-coverages.py @@ -0,0 +1,69 @@ +#!
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=<ID={entry},length={len(depths)}>\n') # header reconstructed to match the pattern parsed by mask-consensus.py + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/utilities/gather_fastqs/bin/update-conda.sh b/modules/utilities/gather_fastqs/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...)
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/gather_fastqs/bin/update-docker.sh b/modules/utilities/gather_fastqs/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/utilities/gather_fastqs/bin/update-tools.sh b/modules/utilities/gather_fastqs/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/gather_fastqs/bin/update-version.sh b/modules/utilities/gather_fastqs/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/utilities/gather_fastqs/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/utilities/gather_fastqs/gather_fastqs.nf b/modules/utilities/gather_fastqs/gather_fastqs.nf new file mode 100644 index 000000000..cb4ee7081 --- /dev/null +++ b/modules/utilities/gather_fastqs/gather_fastqs.nf @@ -0,0 +1,88 @@ +nextflow.enable.dsl = 2 + +process GATHER_FASTQS { + /* Gather up input FASTQs for analysis. */ + publishDir "${params.outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${params.outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "bactopia.versions" + publishDir "${params.outdir}/${sample}", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: '*.txt' + + tag "${sample}" + + input: + tuple val(sample), val(sample_type), val(single_end), path(r1: '*???-r1'), path(r2: '*???-r2'), path(extra) + + output: + path("*-error.txt") optional true + tuple val(sample), val(final_sample_type), val(single_end), + path("fastqs/${sample}*.fastq.gz"), path("extra/*.gz"), emit: FASTQ_PE_STATUS, optional: true + path("${task.process}/*") optional true + path("bactopia.versions") optional true + path("multiple-read-sets-merged.txt") optional true + + shell: + bactopia_version = VERSION + nextflow_version = nextflow.version + is_assembly = sample_type.startsWith('assembly') ? true : false + is_compressed = false + no_cache = params.no_cache ? '-N' : '' + use_ena = params.use_ena + if (task.attempt >= 4) { + if (use_ena) { + // Try SRA + use_ena = false + } else { + // Try ENA + use_ena = true + } + } + if (extra) { + is_compressed = extra.getName().endsWith('gz') ? true : false + } + section = null + if (sample_type == 'assembly_accession') { + section = sample.startsWith('GCF') ? 'refseq' : 'genbank' + } + fcov = params.coverage.toInteger() == 0 ? 
150 : Math.round(params.coverage.toInteger() * 1.5) + final_sample_type = sample_type + if (sample_type == 'hybrid-merge-pe') { + final_sample_type = 'hybrid' + } else if (sample_type == 'merge-pe') { + final_sample_type = 'paired-end' + } else if (sample_type == 'merge-se') { + final_sample_type = 'single-end' + } + + template "gather_fastqs.sh" + + stub: + final_sample_type = 'single-end' + """ + mkdir fastqs + mkdir extra + mkdir ${task.process} + touch ${sample}-error.txt + touch fastqs/${sample}.fastq.gz + touch extra/${sample}.gz + touch ${task.process}/${sample} + touch bactopia.versions + touch multiple-read-sets-merged.txt + """ +} + +//############### +//Module testing +//############### + +workflow test{ + + test_params_input = Channel.of([ + params.sample, + params.sample_type, + params.single_end, + params.r1, + params.r2, + params.extra + ]) + + gather_fastqs(test_params_input) +} diff --git a/modules/utilities/gather_fastqs/nextflow.config b/modules/utilities/gather_fastqs/nextflow.config new file mode 100644 index 000000000..d4228cf60 --- /dev/null +++ b/modules/utilities/gather_fastqs/nextflow.config @@ -0,0 +1,48 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' + mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + + conda { + process { + withName: gather_fastqs { + conda = "${baseDir}/../../../conda/envs/gather_fastqs-1.6.x"} + } + } + + docker { + process { + withName: gather_fastqs { + container = "ghcr.io/bactopia/gather_fastqs:1.6.0"} + + } + } + + test { + process { + echo = true + withName: gather_fastqs { + cpus = 2 + queue = 'long' + } + + } + env { + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "test" + run_type = "fastqs" + } + + } +} \ No newline at end of file diff --git a/modules/utilities/gather_fastqs/templates/gather_fastqs.sh b/modules/utilities/gather_fastqs/templates/gather_fastqs.sh new file mode 100644 index 000000000..bec5dd9fa --- /dev/null +++ b/modules/utilities/gather_fastqs/templates/gather_fastqs.sh @@ -0,0 +1,174 @@ +#!/bin/bash +set -e +set -u +LOG_DIR="!{task.process}" +MERGED="multiple-read-sets-merged.txt" +mkdir -p fastqs +mkdir -p extra +mkdir -p ${LOG_DIR} + +# Print captured STDERR incase of exit +function print_stderr { + cat .command.err 1>&2 + ls ${LOG_DIR}/ | grep ".err" | xargs -I {} cat ${LOG_DIR}/{} 1>&2 +} +trap print_stderr EXIT + +# Bactopia Version Info +echo "# Timestamp" > bactopia.versions +date --iso-8601=seconds >> bactopia.versions +echo "# Bactopia Version" >> bactopia.versions +echo "bactopia !{bactopia_version}" >> bactopia.versions +echo "# Nextflow Version" >> bactopia.versions +echo "nextflow !{nextflow_version}" >> bactopia.versions +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions +if [ "!{sample_type}" == "paired-end" ]; then + # Paired-End Reads + ln -s `readlink !{r1[0]}` fastqs/!{sample}_R1.fastq.gz + ln -s `readlink !{r2[0]}` fastqs/!{sample}_R2.fastq.gz + touch extra/empty.fna.gz +elif [ "!{sample_type}" == "single-end" ]; then + # Single-End Reads + ln -s `readlink !{r1[0]}` fastqs/!{sample}.fastq.gz + touch extra/empty.fna.gz +elif [ "!{sample_type}" == "hybrid" ]; then + # Paired-End Reads + ln -s `readlink !{r1[0]}` fastqs/!{sample}_R1.fastq.gz + ln -s 
`readlink !{r2[0]}` fastqs/!{sample}_R2.fastq.gz + ln -s `readlink !{extra}` extra/!{sample}.fastq.gz +elif [ "!{sample_type}" == "merge-pe" ]; then + # Merge Paired-End Reads + echo "This sample had reads merged." > ${MERGED} + echo "R1:" >> ${MERGED} + find -name "*r1" | sort | xargs -I {} readlink {} | xargs -I {} ls -l {} | awk '{print $5"\t"$9}' >> ${MERGED} + find -name "*r1" | sort | xargs -I {} readlink {} | xargs -I {} cat {} > fastqs/!{sample}_R1.fastq.gz + echo "Merged R1:" >> ${MERGED} + ls -l fastqs/!{sample}_R1.fastq.gz | awk '{print $5"\t"$9}' >> ${MERGED} + + echo "R2:" >> ${MERGED} + find -name "*r2" | sort | xargs -I {} readlink {} | xargs -I {} ls -l {} | awk '{print $5"\t"$9}' >> ${MERGED} + find -name "*r2" | sort | xargs -I {} readlink {} | xargs -I {} cat {} > fastqs/!{sample}_R2.fastq.gz + echo "Merged R2:" >> ${MERGED} + ls -l fastqs/!{sample}_R2.fastq.gz | awk '{print $5"\t"$9}' >> ${MERGED} + + touch extra/empty.fna.gz +elif [ "!{sample_type}" == "hybrid-merge-pe" ]; then + # Merge Paired-End Reads + echo "This sample had reads merged." > ${MERGED} + echo "R1:" >> ${MERGED} + find -name "*r1" | sort | xargs -I {} readlink {} | xargs -I {} ls -l {} | awk '{print $5"\t"$9}' >> ${MERGED} + find -name "*r1" | sort | xargs -I {} readlink {} | xargs -I {} cat {} > fastqs/!{sample}_R1.fastq.gz + echo "Merged R1:" >> ${MERGED} + ls -l fastqs/!{sample}_R1.fastq.gz | awk '{print $5"\t"$9}' >> ${MERGED} + + echo "R2:" >> ${MERGED} + find -name "*r2" | sort | xargs -I {} readlink {} | xargs -I {} ls -l {} >> ${MERGED} + find -name "*r2" | sort | xargs -I {} readlink {} | xargs -I {} cat {} > fastqs/!{sample}_R2.fastq.gz + echo "Merged R2:" >> ${MERGED} + ls -l fastqs/!{sample}_R2.fastq.gz | awk '{print $5"\t"$9}' >> ${MERGED} + + ln -s `readlink !{extra}` extra/!{sample}.fastq.gz +elif [ "!{sample_type}" == "merge-se" ]; then + # Merge Single-End Reads + echo "This sample had reads merged." > ${MERGED} + echo "SE:" >> ${MERGED} + find -name "*r1" | sort | xargs -I {} readlink {} | xargs -I {} ls -l {} | awk '{print $5"\t"$9}' >> ${MERGED} + find -name "*r1" | sort | xargs -I {} readlink {} | xargs -I {} cat {} > fastqs/!{sample}.fastq.gz + echo "Merged SE:" >> ${MERGED} + ls -l fastqs/!{sample}.fastq.gz | awk '{print $5"\t"$9}' >> ${MERGED} + + touch extra/empty.fna.gz +elif [ "!{sample_type}" == "sra_accession" ]; then + # Download accession from ENA/SRA + FTP_ONLY="--ftp_only" + ARCHIVE="" + + # Check if ascp is available + if [ "!{use_ena}" == "true" ]; then + ARCHIVE="ENA" + else + ARCHIVE="SRA" + fi + + # fastq-dl Version + echo "# fastq-dl Version" >> ${LOG_DIR}/!{task.process}.versions + fastq-dl --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + + if [ "!{task.attempt}" == "!{params.max_retry}" ]; then + echo "Unable to download !{sample} from both SRA and ENA !{params.max_retry} times. This may or may + not be a temporary connection issue. Rather than stop the whole Bactopia run, + further analysis of !{sample} will be discontinued." 
| \ + sed 's/^\s*//' > !{sample}-fastq-download-error.txt + exit + else + # Download accession from ENA/SRA + fastq-dl !{sample} $ARCHIVE \ + --cpus !{task.cpus} \ + --outdir fastqs/ \ + --group_by_experiment \ + --is_experiment $FTP_ONLY > ${LOG_DIR}/fastq-dl.out 2> ${LOG_DIR}/fastq-dl.err + touch extra/empty.fna.gz + fi +elif [ "!{is_assembly}" == "true" ]; then + if [ "!{sample_type}" == "assembly_accession" ]; then + # ncbi-genome-download Version + echo "# ncbi-genome-download Version" >> ${LOG_DIR}/!{task.process}.versions + ncbi-genome-download --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + + if [ "!{task.attempt}" == "!{params.max_retry}" ]; then + touch extra/empty.fna.gz + echo "Unable to download !{sample} from NCBI Assembly !{params.max_retry} times. This may or may + not be a temporary connection issue. Rather than stop the whole Bactopia run, + further analysis of !{sample} will be discontinued." | \ + sed 's/^\s*//' > !{sample}-assembly-download-error.txt + exit + else + # Verify Assembly accession + check-assembly-accession.py !{sample} > accession.txt 2> ${LOG_DIR}/check-assembly-accession.txt + + if [ -s "accession.txt" ]; then + # Download from NCBI assembly and simulate reads + mkdir fasta/ + ncbi-genome-download bacteria -o ./ -F fasta -p !{task.cpus} \ + -s !{section} -A accession.txt -r 50 !{no_cache} > ${LOG_DIR}/ncbi-genome-download.out 2> ${LOG_DIR}/ncbi-genome-download.err + find . -name "*!{sample}*.fna.gz" | xargs -I {} mv {} fasta/ + rename 's/(GC[AF]_\d+).*/$1.fna.gz/' fasta/* + gzip -cd fasta/!{sample}.fna.gz > !{sample}-art.fna + else + cp ${LOG_DIR}/check-assembly-accession.txt !{sample}-assembly-accession-error.txt + exit + fi + fi + elif [ "!{sample_type}" == "assembly" ]; then + if [ "!{is_compressed}" == "true" ]; then + gzip -cd !{extra} > !{sample}-art.fna + else + cat !{extra} > !{sample}-art.fna + fi + fi + # ART Version + echo "# ART Version" >> ${LOG_DIR}/!{task.process}.versions + art_illumina --help | head -n 6 | tail -n 5 >> ${LOG_DIR}/!{task.process}.versions 2>&1 + + # Simulate reads from assembly, reads are 250bp without errors + art_illumina -p -ss MSv3 -l 250 -m 400 -s 30 --fcov !{fcov} \ + -ir 0 -ir2 0 -dr 0 -dr2 0 -rs !{params.sampleseed} \ + -na -qL 33 -qU 40 -o !{sample}_R \ + --id !{sample} -i !{sample}-art.fna > ${LOG_DIR}/art.out 2> ${LOG_DIR}/art.err + + mv !{sample}_R1.fq fastqs/!{sample}_R1.fastq + mv !{sample}_R2.fq fastqs/!{sample}_R2.fastq + pigz -p !{task.cpus} --fast fastqs/*.fastq + cp !{sample}-art.fna extra/!{sample}.fna + pigz -p !{task.cpus} --best extra/!{sample}.fna +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/utilities/gather_fastqs/test_params.yaml b/modules/utilities/gather_fastqs/test_params.yaml new file mode 100644 index 000000000..49cc22f37 --- /dev/null +++ b/modules/utilities/gather_fastqs/test_params.yaml @@ -0,0 +1,54 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +sample_type: + "paired-end" + +single_end: + "false" + +r1: + "test_data/SRR2838702_R1.fastq.gz" + +r2: + "test_data/SRR2838702_R2.fastq.gz" + +extra: + "test_data/extra.fastq.gz" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + "some_value" + +no_cache: + " " + +use_ena: + " " + +coverage: + "1" + +max_retry: + " " + 
+sampleseed: + " " + +skip_logs: + " " + + + + diff --git a/modules/utilities/quality_control/assembly_qc/README.md b/modules/utilities/quality_control/assembly_qc/README.md new file mode 100644 index 000000000..470948b13 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/README.md @@ -0,0 +1,17 @@ +# assembly_qc process testing: + +This process assesses the quality of the assembly using QUAST and CheckM. + +## About testing this process: + +Using DSL2, each module can be tested separately with a test workflow inside the process.nf file. Testing requires three items: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run assembly_qc.nf -params-file test_params.yaml -profile test,docker -entry test + + +If you've used `bactopia conda activate`, you can also swap `docker` for `conda` to test with Conda. diff --git a/modules/utilities/quality_control/assembly_qc/assembly_qc.nf b/modules/utilities/quality_control/assembly_qc/assembly_qc.nf new file mode 100644 index 000000000..0b1dd4361 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/assembly_qc.nf @@ -0,0 +1,48 @@ +nextflow.enable.dsl = 2 + +process ASSEMBLY_QC { + /* Assess the quality of the assembly using QUAST and CheckM */ + tag "${sample} - ${method}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}/assembly", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${method}/*" + + input: + tuple val(sample), path(fasta), path(genome_size) + each method + + output: + file "${method}/*" + file "${task.process}/*" optional true + + shell: + //CheckM Related + full_tree = params.full_tree ? '' : '--reduced_tree' + checkm_ali = params.checkm_ali ? '--ali' : '' + checkm_nt = params.checkm_nt ? '--nt' : '' + force_domain = params.force_domain ? '--force_domain' : '' + no_refinement = params.no_refinement ? '--no_refinement' : '' + individual_markers = params.individual_markers ? '--individual_markers' : '' + skip_adj_correction = params.skip_adj_correction ? '--skip_adj_correction' : '' + skip_pseudogene_correction = params.skip_pseudogene_correction ? '--skip_pseudogene_correction' : '' + ignore_thresholds = params.ignore_thresholds ?
'--ignore_thresholds' : '' + template "assembly_qc.sh" + +} + +//############### +//Module testing +//############### + + +workflow test{ + + TEST_PARAMS_CH = Channel.of([ + params.sample, + path(params.fasta), + path(params.genome_size) + ]) + TEST_PARAMS_CH2 = Channel.of('checkm', 'quast') + + assembly_qc(TEST_PARAMS_CH,TEST_PARAMS_CH2) +} diff --git a/modules/utilities/quality_control/assembly_qc/bin/build-containers.sh b/modules/utilities/quality_control/assembly_qc/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git 
a/modules/utilities/quality_control/assembly_qc/bin/check-assembly-accession.py b/modules/utilities/quality_control/assembly_qc/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/utilities/quality_control/assembly_qc/bin/check-fastqs.py b/modules/utilities/quality_control/assembly_qc/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. 
This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/utilities/quality_control/assembly_qc/bin/check-staging.py b/modules/utilities/quality_control/assembly_qc/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/check-staging.py @@ -0,0 +1,59 @@ +#! 
/usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/utilities/quality_control/assembly_qc/bin/cleanup-coverage.py b/modules/utilities/quality_control/assembly_qc/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. 
+ +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig=<ID=accession,length=total_length> + contig = re.search(r'contig=<ID=(.*?),length=([0-9]+)>', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Reduce redundancy in per-base coverage.' + ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Output from genomeBedCoverage') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=<ID={accession},length={vals["length"]}>') + for cov in vals['positions']: + print(cov) diff --git a/modules/utilities/quality_control/assembly_qc/bin/create-tool.sh b/modules/utilities/quality_control/assembly_qc/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# create-tool +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguments" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." 
+ exit 1 +fi diff --git a/modules/utilities/quality_control/assembly_qc/bin/gh-actions/free-disk-space.sh b/modules/utilities/quality_control/assembly_qc/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. +# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. +# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/utilities/quality_control/assembly_qc/bin/gh-actions/setup-bactopia-env.sh b/modules/utilities/quality_control/assembly_qc/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! 
/bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/utilities/quality_control/assembly_qc/bin/gh-actions/setup-docker-builds.py b/modules/utilities/quality_control/assembly_qc/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker build --rm -t {image} -f {recipe} .') + 
docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, 
adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-build.py b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + 
stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = 
f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-citations.py b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line = line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-datasets.py b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--assembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
+ ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean': 0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(mean(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' + ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + 
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-prepare.py b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. 
Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. + Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + 
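# Illustrative note on the FOFN "runtype" column (file names here are hypothetical,
# not from the patch): after validation, the code below maps each sample's read sets
# roughly as follows:
#   r1=[S_R1.fastq.gz], r2=[S_R2.fastq.gz]                 -> "paired-end"
#   r1=[a_R1, b_R1], r2=[a_R2, b_R2] with --merge          -> "merge-pe" (R1s/R2s comma-joined)
#   se=[S.fastq.gz] only                                    -> "single-end" ("merge-se" with --merge)
#   r1/r2 pair plus se=[S.fastq.gz] with --long_reads       -> "hybrid" (long reads written to "extra")
#   assembly=[S.fna.gz] and no reads                        -> "assembly" (path written to "extra")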
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-pull.py b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-search.py b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from 
the ENA results.""" + accessions = [] + filtered = {'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
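# Worked example (hypothetical numbers) of the filtering performed by parse_accessions() above:
# a run reporting base_count=450,000,000 and read_count=1,500,000 across 2 FASTQ files gets an
# estimated mean read length of 450e6 / (1.5e6 * 2) = 150 bp, so it survives --min_read_length 100;
# with --min_coverage 50 and --genome_size 2800000, the derived base-count cutoff is
# 50 * 2,800,000 = 140,000,000 bp, which this run also clears. Runs lacking 'fastq_bytes'
# are tallied under 'technical' (missing FASTQs) rather than being filtered by these thresholds.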
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-summary.py b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-tools.py b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-versions.py b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/utilities/quality_control/assembly_qc/bin/mask-consensus.py b/modules/utilities/quality_control/assembly_qc/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/utilities/quality_control/assembly_qc/bin/merge-blast-json.py b/modules/utilities/quality_control/assembly_qc/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/utilities/quality_control/assembly_qc/bin/mlst-blast.py b/modules/utilities/quality_control/assembly_qc/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/utilities/quality_control/assembly_qc/bin/select-references.py b/modules/utilities/quality_control/assembly_qc/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. 
Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/utilities/quality_control/assembly_qc/bin/split-coverages.py b/modules/utilities/quality_control/assembly_qc/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/utilities/quality_control/assembly_qc/bin/update-conda.sh b/modules/utilities/quality_control/assembly_qc/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. 
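+#
+# Roughly, for each named environment the update_environment function below does the
+# following (shown here for the qc_reads environment; paths are relative to the
+# Bactopia repo and purely illustrative):
+#
+#   conda create --quiet -y -n bactopia-qc_reads -c conda-forge -c bioconda bbmap fastqc fastq-scan lighter pigz
+#   conda env export --no-builds -n bactopia-qc_reads > conda/linux/qc_reads.yml
+#   md5sum conda/linux/qc_reads.yml | cut -d " " -f 1 > conda/linux/qc_reads.md5
+#   conda env remove -n bactopia-qc_reads
+#
+# On Linux the resulting .md5 is also written into the matching Dockerfile's
+# "LABEL conda.md5" so that container rebuilds can be skipped when nothing has changed.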
+set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/quality_control/assembly_qc/bin/update-docker.sh b/modules/utilities/quality_control/assembly_qc/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo 
"Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . + + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/utilities/quality_control/assembly_qc/bin/update-tools.sh b/modules/utilities/quality_control/assembly_qc/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/quality_control/assembly_qc/bin/update-version.sh b/modules/utilities/quality_control/assembly_qc/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/utilities/quality_control/assembly_qc/nextflow.config b/modules/utilities/quality_control/assembly_qc/nextflow.config new file mode 100644 index 000000000..32bc55f82 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/nextflow.config @@ -0,0 +1,52 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' + mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + + conda { + process { + withName: assembly_qc { + conda = "${baseDir}/../../../../conda/envs/assembly_qc-1.7.x"} + } + } + + docker { + process { + withName: assembly_qc { + container = "ghcr.io/bactopia/assembly_qc:1.6.0"} + + } + } + + test { + process { + echo = true + withName: assembly_qc { + cpus = 2 + queue = 'long' + } + + } + env { + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + LOG_DIR = "qc_reads/" + final_sample_type = "paired-end" + single_end = "false" + run_type = "fastqs" + uname = "null" + GENOME_SIZE = 20 + est_ref_size = 10 + } + + } +} diff --git a/modules/utilities/quality_control/assembly_qc/templates/assembly_qc.sh b/modules/utilities/quality_control/assembly_qc/templates/assembly_qc.sh new file mode 100644 index 000000000..96ee0b03d --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/templates/assembly_qc.sh @@ -0,0 +1,72 @@ +#!/bin/bash +set -e +set -u +OUTDIR=!{method} +LOG_DIR="!{task.process}" +mkdir -p ${LOG_DIR} +echo "# Timestamp" >> ${LOG_DIR}/!{task.process}-!{method}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}-!{method}.versions + +# Print captured STDERR incase of exit +function print_stderr { + cat .command.err 1>&2 + ls ${LOG_DIR}/ | grep ".err" | xargs -I {} cat ${LOG_DIR}/{} 1>&2 +} +trap print_stderr EXIT + +# Verify AWS files were staged +if [[ ! 
-L "!{fasta}" ]]; then + check-staging.py --assembly !{fasta} --genome_size !{genome_size} +fi + +if [ "!{method}" == "checkm" ]; then + # CheckM + mkdir checkm/ + if [ "$(uname)" = Darwin ]; then + echo "checkm is not available due to pplacer not being available on MacOSX (via BioConda)" > checkm/checkm-not-available-on-macosx.txt + elif [[ "!{params.skip_checkm}" == "true" ]]; then + echo "checkm was skipped due to '--skip_checkm'" > checkm/checkm-was-skipped.txt + else + echo "# CheckM Version" >> ${LOG_DIR}/!{task.process}-!{method}.versions + checkm -h | grep ":::" >> ${LOG_DIR}/!{task.process}-!{method}.versions 2>&1 + + checkm lineage_wf ./ checkm/ \ + !{full_tree} --alignment_file checkm/checkm-genes.aln \ + --tab_table \ + --file checkm/checkm-results.txt \ + --threads !{task.cpus} \ + !{checkm_ali} !{checkm_nt} --pplacer_threads !{task.cpus} \ + !{force_domain} !{no_refinement} --unique !{params.checkm_unique} \ + !{individual_markers} !{skip_adj_correction} --multi !{params.checkm_multi} \ + !{skip_pseudogene_correction} !{ignore_thresholds} --aai_strain !{params.aai_strain} \ + --length !{params.checkm_length} > ${LOG_DIR}/checkm.out 2> ${LOG_DIR}/checkm.err + + if [[ !{params.compress} == "true" ]]; then + find . -name "*.faa" -or -name "*hmmer.analyze.txt" | xargs -I {} pigz -n --best -p !{task.cpus} {} + fi + fi +else + # QUAST + echo "# QUAST Version" >> ${LOG_DIR}/!{task.process}-!{method}.versions + quast --version >> ${LOG_DIR}/!{task.process}-!{method}.versions 2>&1 + GENOME_SIZE=`head -n 1 !{genome_size}` + est_ref_size="" + if [ "${GENOME_SIZE}" != "0" ]; then + est_ref_size="--est-ref-size ${GENOME_SIZE}" + fi + quast !{fasta} ${est_ref_size} \ + -o quast \ + --threads !{task.cpus} \ + --glimmer \ + --contig-thresholds !{params.contig_thresholds} \ + --plots-format !{params.plots_format} > ${LOG_DIR}/quast.out 2> ${LOG_DIR}/quast.err +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}-!{method}.err + cp .command.out ${LOG_DIR}/!{task.process}-!{method}.out + cp .command.sh ${LOG_DIR}/!{task.process}-!{method}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}-!{method}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/utilities/quality_control/assembly_qc/test_params.yaml b/modules/utilities/quality_control/assembly_qc/test_params.yaml new file mode 100644 index 000000000..ec5347150 --- /dev/null +++ b/modules/utilities/quality_control/assembly_qc/test_params.yaml @@ -0,0 +1,83 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +sample_type: + "paired-end" + +single_end: + "false" + +fasta: + "test_data/SRR2838702.fna" + +genome_size: + "test_data/genome-size.txt" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + false + +no_refinement: + null + +individual_markers: + null + +checkm_nt: + null + +force_domain: + null + +skip_pseudogene_correction: + null + +ignore_thresholds: + null + +full_tree: + null + +skip_adj_correction: + null + +checkm_ali: + null + +skip_checkm: + false + +checkm_unique: + 10 + +checkm_multi: + 10 + +aai_strain: + 0.9 + +checkm_length: + 0.7 + +compress: + false + +contig_thresholds: + '0,1000,10000,100000,250000,1000000' + +plots_format: + 'pdf' + +skip_logs: + false diff --git a/modules/utilities/quality_control/qc_final_summary/README.md b/modules/utilities/quality_control/qc_final_summary/README.md new file mode 100644 index 000000000..1d6dd7063 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/README.md @@ -0,0 
+1,17 @@ +# qc_final_summary process testing: + +This process run FASTQC on the input FASTQ files + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run qc_final_summary.nf -params-file test_params.yaml -profile test,docker -entry test + + +if you've used `bactopia conda activate` you can also trade `docker` by conda to test with conda. diff --git a/modules/utilities/quality_control/qc_final_summary/bin/build-containers.sh b/modules/utilities/quality_control/qc_final_summary/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + 
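+        # Per-tool Docker builds are left commented out above; only the Singularity
+        # image for each Bactopia Tool is built and pushed from this loop.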
singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/utilities/quality_control/qc_final_summary/bin/check-assembly-accession.py b/modules/utilities/quality_control/qc_final_summary/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/utilities/quality_control/qc_final_summary/bin/check-fastqs.py b/modules/utilities/quality_control/qc_final_summary/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. 
+""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/utilities/quality_control/qc_final_summary/bin/check-staging.py b/modules/utilities/quality_control/qc_final_summary/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/utilities/quality_control/qc_final_summary/bin/cleanup-coverage.py b/modules/utilities/quality_control/qc_final_summary/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/modules/utilities/quality_control/qc_final_summary/bin/create-tool.sh b/modules/utilities/quality_control/qc_final_summary/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# build-containers +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguement" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/utilities/quality_control/qc_final_summary/bin/gh-actions/free-disk-space.sh b/modules/utilities/quality_control/qc_final_summary/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. 
+# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. +# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/utilities/quality_control/qc_final_summary/bin/gh-actions/setup-bactopia-env.sh b/modules/utilities/quality_control/qc_final_summary/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/utilities/quality_control/qc_final_summary/bin/gh-actions/setup-docker-builds.py b/modules/utilities/quality_control/qc_final_summary/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! 
/usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker 
pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, 
latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-build.py b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + 
default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + 
build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-citations.py b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. + --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line = line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name': name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-datasets.py b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! 
/usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. + +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). 
+ --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. + +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command 
found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + 
logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up {request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + 
logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! 
+ Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes 
found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' + ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' 
+ ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + 
logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + 
new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 
'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. (Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. 
AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. (Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' 
+ logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + 
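    # The near-identical argument checks earlier in this block (--include_genus,
    # --prodigal_tf, --accessions) all enforce the same "exactly one --species" rule.
    # A hypothetical helper (illustrative only, not used by the script) makes the
    # shared logic explicit:
    #
    #   import logging, os, sys
    #
    #   def require_single_species(option, num_species, path=None):
    #       """Exit unless exactly one --species was given (and any given path exists)."""
    #       if path and not os.path.exists(path):
    #           logging.error(f'Unable to locate {path}, please verify path')
    #           sys.exit(1)
    #       if not num_species:
    #           logging.error(f'A single species (--species) must be given to use {option}')
    #           sys.exit(1)
    #       if num_species > 1:
    #           logging.error(f'Only a single species (given {num_species}) can be used with {option}')
    #           sys.exit(1)
    #
    #   # e.g. require_single_species('--prodigal_tf', num_species, path=args.prodigal_tf)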
create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-prepare.py b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. + Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. 
Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more 
than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-pull.py b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-search.py b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment 
accessions from the ENA results.""" + accessions = [] + filtered = {'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-summary.py b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-tools.py b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-versions.py b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/utilities/quality_control/qc_final_summary/bin/mask-consensus.py b/modules/utilities/quality_control/qc_final_summary/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/utilities/quality_control/qc_final_summary/bin/merge-blast-json.py b/modules/utilities/quality_control/qc_final_summary/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/utilities/quality_control/qc_final_summary/bin/mlst-blast.py b/modules/utilities/quality_control/qc_final_summary/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
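example usage (paths and filenames are illustrative):
  mlst-blast.py assembly.fna.gz /path/to/mlst/blastdb mlst-blast.json --cpu 4 --compressed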
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/utilities/quality_control/qc_final_summary/bin/select-references.py b/modules/utilities/quality_control/qc_final_summary/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
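+    This is the fallback used when the Entrez (eutils) lookup fails (e.g. NCBI returns "Bad Gateway").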
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. 
Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/utilities/quality_control/qc_final_summary/bin/split-coverages.py b/modules/utilities/quality_control/qc_final_summary/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/utilities/quality_control/qc_final_summary/bin/update-conda.sh b/modules/utilities/quality_control/qc_final_summary/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. 
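+# Example: update-conda.sh /home/bactopia/bactopia 1.6.0      (Linux yamls)
+# Example: update-conda.sh /home/bactopia/bactopia 1.6.0 1    (macOS yamls; pass 1 as IS_MAC)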
+set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/quality_control/qc_final_summary/bin/update-docker.sh b/modules/utilities/quality_control/qc_final_summary/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} 
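+    # $1: Dockerfile recipe, $2: image tag to build and push, $3: optional 'latest' tag (0 = skip)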
+ + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . + + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/utilities/quality_control/qc_final_summary/bin/update-tools.sh b/modules/utilities/quality_control/qc_final_summary/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/quality_control/qc_final_summary/bin/update-version.sh b/modules/utilities/quality_control/qc_final_summary/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/utilities/quality_control/qc_final_summary/nextflow.config b/modules/utilities/quality_control/qc_final_summary/nextflow.config new file mode 100644 index 000000000..768764102 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/nextflow.config @@ -0,0 +1,48 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' + mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + + conda { + process { + withName: qc_final_summary { + conda = "${baseDir}/../../../../conda/envs/qc_reads-1.7.x"} + } + } + + docker { + process { + withName: qc_final_summary { + container = "ghcr.io/bactopia/qc_reads:1.6.0"} + + } + } + + test { + process { + echo = true + withName: qc_final_summary { + cpus = 2 + queue = 'long' + } + + } + env { + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "false" + run_type = "fastqs" + } + + } +} diff --git a/modules/utilities/quality_control/qc_final_summary/qc_final_summary.nf b/modules/utilities/quality_control/qc_final_summary/qc_final_summary.nf new file mode 100644 index 000000000..bf933dd9d --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/qc_final_summary.nf @@ -0,0 +1,44 @@ +nextflow.enable.dsl = 2 + +process QC_FINAL_SUMMARY { + /* Run FASTQC on the cleaned up FASTQ files. 
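+       Also summarizes the final reads with fastq-scan (see templates/qc_final_summary.sh).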
*/ + tag "${sample}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "quality-control/*" + + input: + tuple val(sample), val(single_end), path(fq), path(genome_size) + + output: + file "quality-control/*" + file "${task.process}/*" optional true + + shell: + + template "qc_final_summary.sh" + + stub: + """ + mkdir quality-control + mkdir ${task.process} + touch quality-control/${sample} + touch ${task.process}/${sample} + """ +} + +//############### +//Module testing +//############### + +workflow test{ + + TEST_PARAMS_CH = Channel.of([ + params.sample, + params.single_end, + path(params.fq), + path(params.genome_size) + ]) + + qc_final_summary(TEST_PARAMS_CH) +} diff --git a/modules/utilities/quality_control/qc_final_summary/templates/qc_final_summary.sh b/modules/utilities/quality_control/qc_final_summary/templates/qc_final_summary.sh new file mode 100644 index 000000000..e1763b0ff --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/templates/qc_final_summary.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -e +set -u +LOG_DIR="!{task.process}" +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions + +echo "# FastQC Version" >> ${LOG_DIR}/!{task.process}.versions +fastqc -version>> ${LOG_DIR}/!{task.process}.versions 2>&1 + +echo "# fastq-scan Version" >> ${LOG_DIR}/!{task.process}.versions +fastq-scan -v >> ${LOG_DIR}/!{task.process}.versions 2>&1 + +# Verify AWS files were staged +if [[ ! -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --genome_size !{genome_size} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} --genome_size !{genome_size} + fi +fi + +GENOME_SIZE=`head -n 1 !{genome_size}` +if [ "!{single_end}" == "false" ]; then + # Paired-End Reads + gzip -cd !{fq[0]} | fastq-scan -g ${GENOME_SIZE} > !{sample}_R1-final.json + gzip -cd !{fq[1]} | fastq-scan -g ${GENOME_SIZE} > !{sample}_R2-final.json + ln -s !{fq[0]} !{sample}_R1-final.fastq.gz + ln -s !{fq[1]} !{sample}_R2-final.fastq.gz + fastqc --noextract -f fastq -t !{task.cpus} !{sample}_R1-final.fastq.gz !{sample}_R2-final.fastq.gz +else + # Single-End Reads + gzip -cd !{fq[0]} | fastq-scan -g ${GENOME_SIZE} > !{sample}-final.json + ln -s !{fq[0]} !{sample}-final.fastq.gz + fastqc --noextract -f fastq -t !{task.cpus} !{sample}-final.fastq.gz +fi + +mkdir -p quality-control/summary-final +mv *.json quality-control/summary-final +mv *fastqc.html quality-control/summary-final +mv *fastqc.zip quality-control/summary-final + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/utilities/quality_control/qc_final_summary/test_params.yaml b/modules/utilities/quality_control/qc_final_summary/test_params.yaml new file mode 100644 index 000000000..0869dffc7 --- /dev/null +++ b/modules/utilities/quality_control/qc_final_summary/test_params.yaml @@ -0,0 +1,113 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +sample_type: + "paired-end" + +single_end: + "false" + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" + +extra: + 
"test_data/empty.fna.gz" + +genome_size: + "test_data/genome-size.txt" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + "some_value" + +no_cache: + "false" + +use_ena: + "false" + +coverage: + "100" + +max_retry: + " " + +sampleseed: + "42" + +skip_logs: + false + +adapters: + "null" + +phix: + "null" + +skip_qc: + false + +adapter_k: + "23" + +ktrim: + "r" + +mink: + "11" + +hdist: + "1" + +tpe: + "t" + +tbo: + "t" + +ftm: + "5" + +phix_k: + "null" + +qtrim: + "rl" + +trimq: + "6" + +minlength: + "35" + +maq: + "10" + +qout: + "33" + +tossjunk: + "t" + +skip_error_correction: + false + +keep_all_files: + "false" + +min_basepairs: + "2241820" + +min_reads: + "7472" diff --git a/modules/utilities/quality_control/qc_original_summary/README.md b/modules/utilities/quality_control/qc_original_summary/README.md new file mode 100644 index 000000000..0a5f2c701 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/README.md @@ -0,0 +1,17 @@ +# qc_original_summary process testing: + +This process run FASTQC on the input FASTQ files + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run qc_original_summary.nf -params-file test_params.yaml -profile test,docker -entry test + + +if you've used `bactopia conda activate` you can also trade `docker` by conda to test with conda. \ No newline at end of file diff --git a/modules/utilities/quality_control/qc_original_summary/bin/build-containers.sh b/modules/utilities/quality_control/qc_original_summary/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . 
+ docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/utilities/quality_control/qc_original_summary/bin/check-assembly-accession.py b/modules/utilities/quality_control/qc_original_summary/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/utilities/quality_control/qc_original_summary/bin/check-fastqs.py b/modules/utilities/quality_control/qc_original_summary/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). 
Please \n" + "investigate these FASTQs. Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/utilities/quality_control/qc_original_summary/bin/check-staging.py b/modules/utilities/quality_control/qc_original_summary/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. 
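+Exits with a non-zero code (80-92) to indicate which expected input is missing.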
+""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/utilities/quality_control/qc_original_summary/bin/cleanup-coverage.py b/modules/utilities/quality_control/qc_original_summary/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/modules/utilities/quality_control/qc_original_summary/bin/create-tool.sh b/modules/utilities/quality_control/qc_original_summary/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# build-containers +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguement" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/utilities/quality_control/qc_original_summary/bin/gh-actions/free-disk-space.sh b/modules/utilities/quality_control/qc_original_summary/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories. 
+# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. +# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/utilities/quality_control/qc_original_summary/bin/gh-actions/setup-bactopia-env.sh b/modules/utilities/quality_control/qc_original_summary/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/utilities/quality_control/qc_original_summary/bin/gh-actions/setup-docker-builds.py b/modules/utilities/quality_control/qc_original_summary/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! 
/usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker 
pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, 
latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-build.py b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + 
default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + 
build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-citations.py b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. + --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-datasets.py b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! 
/usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. + +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). 
+ --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. + +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command 
found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + 
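# Aside (minimal sketch, not part of the patch): the ENA Taxonomy lookup performed by
# validate_species() above, for a single name; error handling trimmed for brevity.
import requests

ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name'
name = 'Staphylococcus aureus'
r = requests.get(f'{ENDPOINT}/{name}?limit=1')
if r.status_code == requests.codes.ok and r.text != "No results.":
    print(r.json()[0]['scientificName'])  # exact scientific name as recorded by ENA
else:
    print(f'{name} not found in ENA Taxonomy, check spelling')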
logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up {request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + 
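# Aside (orientation only): the command sequence setup_ariba() runs above for one dataset,
# e.g. vfdb_core:
#   ariba getref vfdb_core vfdb_core        # download reference FASTA + metadata TSV
#   ariba prepareref -f vfdb_core.fa -m vfdb_core.tsv vfdb_core
#   tar -zcvf vfdb_core.tar.gz vfdb_core/   # packaged for later use by Bactopia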
logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! 
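# Aside (minimal sketch, not part of the patch): the per-schema BLAST database step from
# setup_mlst() above, run against the FASTA files fetched by "ariba pubmlstget".
import glob
import os
import subprocess

for fasta in glob.glob('ariba/pubmlst_download/*.tfa'):
    out = os.path.splitext(fasta)[0]
    subprocess.run(['makeblastdb', '-in', fasta, '-dbtype', 'nucl', '-out', out], check=True)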
+ Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes 
found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' + ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' 
+ ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + 
logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + 
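# Aside (illustrative shape, values hypothetical): create_summary() writes summary.json with
# one top-level key per dataset type, roughly:
#   {
#     "antimicrobial-resistance": [{"name": "amrfinderdb.tar.gz", "last_update": "..."}],
#     "ariba": [{"name": "card.tar.gz", "last_update": "..."}],
#     "minmer": {"sketches": ["refseq-k21-s1000.msh"], "last_update": "..."},
#     "plasmid": {"sketches": "plsdb.msh", "blastdb": "plsdb.fna", "last_update": "..."},
#     "species-specific": {"staphylococcus-aureus": {"mlst": {}, "annotation": {}, "...": {}}}
#   }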
new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 
'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. (Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. 
AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. (Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' 
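# Aside (hypothetical invocation, file names are placeholders): the optional user-provided
# datasets each require exactly one --species, e.g.
#   bactopia datasets pubmlst-mapping.txt --species 'Staphylococcus aureus' \
#       --prodigal_tf saureus.trn --reference saureus.gbk --cpus 4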
+ logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + 
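# Aside (orientation only): the layout produced under --outdir by the steps above, which
# create_summary() below indexes into summary.json:
#   datasets/
#     ariba/                       <dataset>.tar.gz, <dataset>-updated.txt
#     minmer/                      precomputed Mash/Sourmash sketches, minmer-updated.txt
#     plasmid/                     plsdb.msh, plsdb BLAST files, plsdb-updated.txt
#     antimicrobial-resistance/    amrfinderdb.tar.gz, amrfinderdb-updated.txt
#     species-specific/<species>/  mlst/, annotation/, minmer/, optional/
#     summary.json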
create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-prepare.py b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. + Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. 
Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more 
than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-pull.py b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
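The FOFN logic in bactopia-prepare.py above splits each FASTQ file name on the separator and matches the trailing token against the --pe1_pattern/--pe2_pattern regexes to decide whether a file is R1, R2, or single-end. A minimal standalone sketch of that decision, assuming the default separator "_" and the default patterns; the file names below are hypothetical:

import re

PE1 = re.compile(r"[Aa]|[Rr]1")   # default --pe1_pattern
PE2 = re.compile(r"[Bb]|[Rr]2")   # default --pe2_pattern

def classify(fastq_name, separator="_"):
    """Return (sample_name, read_set) where read_set is 'r1', 'r2' or 'se'."""
    parts = fastq_name.rsplit(separator, 1)
    if len(parts) == 1:
        # No separator suffix, so the file is treated as single-end
        return parts[0], "se"
    sample, suffix = parts
    if PE1.match(suffix):
        return sample, "r1"
    if PE2.match(suffix):
        return sample, "r2"
    # bactopia-prepare.py prints an error and exits here; a ValueError keeps the sketch simple
    raise ValueError(f"Could not determine read set for {fastq_name}")

# Hypothetical file names, extensions already stripped as in bactopia-prepare.py
for name in ["SAMPLE01_R1", "SAMPLE01_R2", "SAMPLE02"]:
    print(name, classify(name))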
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-search.py b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina 
experiment accessions from the ENA results.""" + accessions = [] + filtered = {'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
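bactopia-search.py turns --min_coverage and --genome_size into a minimum base-count filter, and estimates the mean read length from ENA's base_count and read_count fields divided across the FASTQ files in a run. A small sketch of that arithmetic with made-up numbers:

# Hypothetical ENA record values, for illustration only
base_count = 450_000_000      # total sequenced bases reported by ENA
read_count = 1_500_000        # total reads reported by ENA
total_fastqs = 2              # paired-end run, so reads are spread over two files

# Mean read length as computed in parse_accessions()
read_length = int(float(base_count) / (float(read_count) * total_fastqs))
print(read_length)            # 150

# --min_coverage 30 with --genome_size 2_800_000 becomes a base-count filter
min_base_count = 30 * 2_800_000
print(base_count >= min_base_count)   # True (450 Mbp >= 84 Mbp required)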
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-summary.py b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-tools.py b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
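get_output_files() in bactopia-summary.py above is still a stub; one way it might be fleshed out, assuming only the per-sample directory layout listed in its docstring. The paths and helper name are illustrative, not part of the patch:

import os

EXPECTED_DIRS = [
    "annotation", "antimicrobial_resistance", "ariba", "assembly", "blast",
    "kmers", "logs", "mapping", "minmers", "mlst", "quality-control", "variants",
]

def get_output_files(sample_dir):
    """Map each expected Bactopia output directory to the files it contains."""
    sample = os.path.basename(os.path.normpath(sample_dir))
    outputs = {}
    for subdir in EXPECTED_DIRS:
        path = os.path.join(sample_dir, subdir)
        if os.path.isdir(path):
            outputs[subdir] = sorted(f.name for f in os.scandir(path) if f.is_file())
    genome_size = os.path.join(sample_dir, f"{sample}-genome-size.txt")
    if os.path.exists(genome_size):
        outputs["genome-size"] = [os.path.basename(genome_size)]
    return outputs

# Hypothetical usage
# print(get_output_files("./bactopia-outputs/SAMPLE01"))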
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-versions.py b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
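bactopia tools decides whether to rebuild a Conda environment by comparing the md5 recorded next to the tool's environment YAML against the md5 copied into env-built.txt when the environment was last created. A compact sketch of that decision; the file paths in the comments are hypothetical:

import os

def read_first_line(path):
    with open(path) as fh:
        return fh.readline().rstrip()

def needs_rebuild(expected_md5_file, envbuilt_file, force=False):
    """True when the env was never built, is out of sync, or a forced rebuild was requested."""
    if force or not os.path.exists(envbuilt_file):
        return True
    return read_first_line(expected_md5_file) != read_first_line(envbuilt_file)

# Hypothetical layout mirroring validate_args() above
# expected = ".../tools/roary/environment-linux.md5"
# built    = ".../conda/envs/tools-roary-1.6.x/env-built.txt"
# if needs_rebuild(expected, built):
#     create the env with `conda env create -f environment-linux.yml --prefix <condadir>`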
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/utilities/quality_control/qc_original_summary/bin/mask-consensus.py b/modules/utilities/quality_control/qc_original_summary/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
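bactopia versions pulls program versions straight out of the exported Conda YAMLs by splitting each "- name=version" dependency line on "=", as read_yaml() does above. A short sketch of that parsing over an inline example; the YAML snippet is illustrative:

def parse_versions(lines):
    """Map conda package name -> version from `conda env export --no-builds` lines."""
    versions = {}
    for line in lines:
        line = line.strip()
        if '=' in line:
            program, version = line.replace('- ', '').split('=')[0:2]
            versions[program] = version
    return versions

example_yaml = """
dependencies:
  - prokka=1.14.6
  - pigz=2.6
""".splitlines()

print(parse_versions(example_yaml))   # {'prokka': '1.14.6', 'pigz': '2.6'}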
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/utilities/quality_control/qc_original_summary/bin/merge-blast-json.py b/modules/utilities/quality_control/qc_original_summary/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/utilities/quality_control/qc_original_summary/bin/mlst-blast.py b/modules/utilities/quality_control/qc_original_summary/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/utilities/quality_control/qc_original_summary/bin/select-references.py b/modules/utilities/quality_control/qc_original_summary/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. 
Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/utilities/quality_control/qc_original_summary/bin/split-coverages.py b/modules/utilities/quality_control/qc_original_summary/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/utilities/quality_control/qc_original_summary/bin/update-conda.sh b/modules/utilities/quality_control/qc_original_summary/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. 
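split-coverages.py above regroups genomeCoverageBed's flat per-base output by the original reference FASTA each contig belongs to. A minimal sketch of that grouping step using in-memory rows instead of files; the names are made up and the contig header format is illustrative (the original header line is truncated in this patch):

from collections import defaultdict

# entry -> original reference FASTA, as in the tab-delimited mapping file
mapping = {"contig_1": "GCF_000000000", "contig_2": "GCF_000000000"}

# genomeCoverageBed -d style rows: (entry, 1-based position, depth)
rows = [("contig_1", 1, 12), ("contig_1", 2, 15), ("contig_2", 1, 0)]

coverages = defaultdict(lambda: defaultdict(list))
for entry, _position, depth in rows:
    coverages[mapping[entry]][entry].append(depth)

for fasta, entries in coverages.items():
    print(f"##total={len(entries)}")          # one block per original FASTA
    for entry, depths in entries.items():
        print(f"##contig=<ID={entry},length={len(depths)}>")  # header format assumed
        for depth in depths:
            print(depth)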
+set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/quality_control/qc_original_summary/bin/update-docker.sh b/modules/utilities/quality_control/qc_original_summary/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + 
latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . + + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/utilities/quality_control/qc_original_summary/bin/update-tools.sh b/modules/utilities/quality_control/qc_original_summary/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
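+        # As a rough example, with ${1}="fastani" the export below is expected to write
+        # tools/fastani/environment-osx.yml (a "version:" line is injected by sed) and
+        # tools/fastani/environment-osx.md5, the checksum used to detect changed environments.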
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/quality_control/qc_original_summary/bin/update-version.sh b/modules/utilities/quality_control/qc_original_summary/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/utilities/quality_control/qc_original_summary/nextflow.config b/modules/utilities/quality_control/qc_original_summary/nextflow.config new file mode 100644 index 000000000..57c28be18 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/nextflow.config @@ -0,0 +1,47 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' + mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + + conda { + process { + withName: qc_original_summary { + conda = "${baseDir}/../../../../conda/envs/qc_reads-1.7.x"} + } + } + + docker { + process { + withName: qc_original_summary { + container = "ghcr.io/bactopia/qc_reads:1.6.0"} + + } + } + + test { + process { + echo = true + withName: qc_original_summary { + cpus = 2 + queue = 'long' + } + + } + env { + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "false" + run_type = "fastqs" + } + } +} diff --git a/modules/utilities/quality_control/qc_original_summary/qc_original_summary.nf b/modules/utilities/quality_control/qc_original_summary/qc_original_summary.nf new file mode 100644 index 000000000..3ba8da9b7 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/qc_original_summary.nf @@ -0,0 +1,47 @@ +nextflow.enable.dsl = 2 + +process QC_ORIGINAL_SUMMARY { + /* Run FASTQC on the input FASTQ files. 
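+       Also runs fastq-scan on the original FASTQs; the JSON summaries and FastQC reports
+       are published under quality-control/summary-original/ (see templates/qc_original_summary.sh).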
*/
+    tag "${sample}"
+
+    publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*"
+    publishDir "${outdir}/${sample}", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "quality-control/*"
+
+    input:
+    tuple val(sample), val(sample_type), val(single_end), path(fq), path(extra), path(genome_size)
+
+    output:
+    file "quality-control/*"
+    file "${task.process}/*" optional true
+
+    shell:
+
+    template "qc_original_summary.sh"
+
+    stub:
+    """
+    mkdir quality-control
+    mkdir ${task.process}
+    touch quality-control/${sample}
+    touch ${task.process}/${sample}
+    """
+}
+
+
+//###############
+//Module testing
+//###############
+
+workflow test {
+
+    TEST_PARAMS_CH = Channel.of([
+        params.sample,
+        params.sample_type,
+        params.single_end,
+        file(params.fq),
+        file(params.extra),
+        file(params.genome_size)
+    ])
+
+    QC_ORIGINAL_SUMMARY(TEST_PARAMS_CH)
+}
diff --git a/modules/utilities/quality_control/qc_original_summary/templates/qc_original_summary.sh b/modules/utilities/quality_control/qc_original_summary/templates/qc_original_summary.sh
new file mode 100644
index 000000000..e780d65d5
--- /dev/null
+++ b/modules/utilities/quality_control/qc_original_summary/templates/qc_original_summary.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+set -e
+set -u
+LOG_DIR="!{task.process}"
+mkdir -p ${LOG_DIR}
+echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions
+date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions
+
+echo "# FastQC Version" >> ${LOG_DIR}/!{task.process}.versions
+fastqc -version >> ${LOG_DIR}/!{task.process}.versions 2>&1
+
+echo "# fastq-scan Version" >> ${LOG_DIR}/!{task.process}.versions
+fastq-scan -v >> ${LOG_DIR}/!{task.process}.versions 2>&1
+
+# Verify AWS files were staged
+if [[ !
-L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --extra !{extra} --genome_size !{genome_size} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} --extra !{extra} --genome_size !{genome_size} + fi +fi + +GENOME_SIZE=`head -n 1 !{genome_size}` +if [ "!{single_end}" == "false" ]; then + # Paired-End Reads + gzip -cd !{fq[0]} | fastq-scan -g ${GENOME_SIZE} > !{sample}_R1-original.json + gzip -cd !{fq[1]} | fastq-scan -g ${GENOME_SIZE} > !{sample}_R2-original.json + ln -s !{fq[0]} !{sample}_R1-original.fastq.gz + ln -s !{fq[1]} !{sample}_R2-original.fastq.gz + fastqc --noextract -f fastq -t !{task.cpus} !{sample}_R1-original.fastq.gz !{sample}_R2-original.fastq.gz +else + # Single-End Reads + gzip -cd !{fq[0]} | fastq-scan -g ${GENOME_SIZE} > !{sample}-original.json + ln -s !{fq[0]} !{sample}-original.fastq.gz + fastqc --noextract -f fastq -t !{task.cpus} !{sample}-original.fastq.gz +fi + +mkdir -p quality-control/summary-original +mv *.json quality-control/summary-original +mv *fastqc.html quality-control/summary-original +mv *fastqc.zip quality-control/summary-original + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/utilities/quality_control/qc_original_summary/test_params.yaml b/modules/utilities/quality_control/qc_original_summary/test_params.yaml new file mode 100644 index 000000000..0869dffc7 --- /dev/null +++ b/modules/utilities/quality_control/qc_original_summary/test_params.yaml @@ -0,0 +1,113 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +sample_type: + "paired-end" + +single_end: + "false" + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" + +extra: + "test_data/empty.fna.gz" + +genome_size: + "test_data/genome-size.txt" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + "some_value" + +no_cache: + "false" + +use_ena: + "false" + +coverage: + "100" + +max_retry: + " " + +sampleseed: + "42" + +skip_logs: + false + +adapters: + "null" + +phix: + "null" + +skip_qc: + false + +adapter_k: + "23" + +ktrim: + "r" + +mink: + "11" + +hdist: + "1" + +tpe: + "t" + +tbo: + "t" + +ftm: + "5" + +phix_k: + "null" + +qtrim: + "rl" + +trimq: + "6" + +minlength: + "35" + +maq: + "10" + +qout: + "33" + +tossjunk: + "t" + +skip_error_correction: + false + +keep_all_files: + "false" + +min_basepairs: + "2241820" + +min_reads: + "7472" diff --git a/modules/utilities/quality_control/qc_reads/README.md b/modules/utilities/quality_control/qc_reads/README.md new file mode 100644 index 000000000..cf9948d2b --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/README.md @@ -0,0 +1,14 @@ +# fastq_status process testing: + +This process Cleans the reads using Illumina-Cleanup + +## About testing this process: + +Using DSL2 each module can be tested separately, using a test workflow inside the process.nf file, testing requires 3 itens: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run qc_reads.nf -entry test -params-file test_params.yaml -profile test diff --git a/modules/utilities/quality_control/qc_reads/bin/build-containers.sh b/modules/utilities/quality_control/qc_reads/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 
--- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/utilities/quality_control/qc_reads/bin/check-assembly-accession.py b/modules/utilities/quality_control/qc_reads/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/utilities/quality_control/qc_reads/bin/check-fastqs.py b/modules/utilities/quality_control/qc_reads/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). 
Please \n" + "investigate these FASTQs. Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/utilities/quality_control/qc_reads/bin/check-staging.py b/modules/utilities/quality_control/qc_reads/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. 
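+Exits non-zero with an input-specific code (80-92) so the missing file can be identified.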
+""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/utilities/quality_control/qc_reads/bin/cleanup-coverage.py b/modules/utilities/quality_control/qc_reads/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+        )
+    )
+    parser.add_argument('coverage', metavar="COVERAGE", type=str,
+                        help='Output from genomeCoverageBed')
+    parser.add_argument('--version', action='version',
+                        version=f'{PROGRAM} {VERSION}')
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(0)
+
+    args = parser.parse_args()
+
+    coverages = read_coverage(args.coverage)
+    for accession, vals in coverages.items():
+        print(f'##contig=<ID={accession},length={vals["length"]}>')
+        for cov in vals['positions']:
+            print(cov)
diff --git a/modules/utilities/quality_control/qc_reads/bin/create-tool.sh b/modules/utilities/quality_control/qc_reads/bin/create-tool.sh
new file mode 100755
index 000000000..d629191ba
--- /dev/null
+++ b/modules/utilities/quality_control/qc_reads/bin/create-tool.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# create-tool
+#
+# Create a blank tool.
+VERSION=1.6.0
+
+if [[ $# == 0 ]]; then
+    echo ""
+    echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION"
+    echo ""
+    echo "Example Command"
+    echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' "
+    echo ""
+    exit
+fi
+
+BACTOPIA_DIR=$1
+TOOL=$2
+DESCRIPTION=$3
+if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then
+    echo "Got ${#} arguments"
+    echo "Must give a path to Bactopia repository, tool name and tool description."
+    exit 1
+fi
+
+if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then
+    cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL}
+    sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile
+    sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity
+    sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config
+    sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config
+else
+    echo "${TOOL} exists already, please verify. Not going to replace, exiting..."
+    exit 1
+fi
diff --git a/modules/utilities/quality_control/qc_reads/bin/gh-actions/free-disk-space.sh b/modules/utilities/quality_control/qc_reads/bin/gh-actions/free-disk-space.sh
new file mode 100755
index 000000000..3ebc27d75
--- /dev/null
+++ b/modules/utilities/quality_control/qc_reads/bin/gh-actions/free-disk-space.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Author: Robert Metzger
+# Github: https://github.com/rmetzger
+# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh
+
+#
+# The Azure provided machines typically have the following disk allocation:
+# Total space: 85GB
+# Allocated: 67 GB
+# Free: 17 GB
+# This script frees up 28 GB of disk space by deleting unneeded packages and
+# large directories.
+# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. +# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/utilities/quality_control/qc_reads/bin/gh-actions/setup-bactopia-env.sh b/modules/utilities/quality_control/qc_reads/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/utilities/quality_control/qc_reads/bin/gh-actions/setup-docker-builds.py b/modules/utilities/quality_control/qc_reads/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! 
/usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker 
pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, 
latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-build.py b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + 
default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + 
build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-citations.py b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. + --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-datasets.py b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! 
/usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. + +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). 
+ --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. + +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command 
found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + 
logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up {request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + 
logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! 
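As a quick illustration of the header format produced by process_cds() above, here is a minimal sketch; the accession, gene, product, and translation values are invented, and the call assumes process_cds() from this script is in scope:

    # Minimal sketch of process_cds() output; every qualifier value below is
    # invented, and process_cds() is assumed to be the function defined above.
    qualifiers = {
        'protein_id': ['WP_000000001.1'],            # hypothetical accession
        'gene': ['mecA'],
        'product': ['penicillin-binding protein'],   # hypothetical product
        'translation': ['MKKIKIVPLILIVVVVGFGIYFYASK'],
    }
    header, seq = process_cds(qualifiers)
    # header -> '>WP_000000001.1 ~~~mecA~~~penicillin-binding protein'
    #           (ec_number is empty, so nothing appears between the ID and the first '~~~')
    # seq    -> 'MKKIKIVPLILIVVVVGFGIYFYASK'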
+ Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes 
found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' + ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' 
+ ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + 
logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + 
new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 
'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. (Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. 
AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. (Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' 
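For orientation, the CD-HIT options defined in the argument groups above (--identity, --overlap, --max_memory, --fast_cluster, --cpus) feed directly into the cd-hit call made in setup_prokka(); below is a minimal sketch using the documented defaults, with placeholder input and output paths:

    # Sketch of the cd-hit command assembled in setup_prokka() from the CLI
    # defaults above; the file paths are placeholders, not real outputs.
    identity, overlap, max_memory, cpus = 0.9, 0.8, 0, 1
    fast_cluster = False
    g = 0 if fast_cluster else 1   # -g 0 = CD-HIT fast clustering, -g 1 = accurate
    passing_cds = 'annotation/passing-cds.faa'
    cdhit_cds = 'annotation/proteins.faa'
    cmd = (f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} '
           f'-g {g} -c {identity} -T {cpus} -M {max_memory}')
    print(cmd)
    # cd-hit -i annotation/passing-cds.faa -o annotation/proteins.faa -s 0.8 -g 1 -c 0.9 -T 1 -M 0

With -M 0, no memory limit is passed to CD-HIT, matching the "(Default: unlimited)" note in the help text.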
+ logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + 
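Before create_summary() runs below, it may help to see the shape of the summary it writes; the following is a hand-written sketch of a minimal {outdir}/summary.json based on the keys assembled in create_summary() above, with an invented species, file names, schema name, and timestamps:

    # Hand-written sketch of a minimal summary.json; every value is illustrative,
    # only the key structure mirrors create_summary() above.
    minimal_summary = {
        'antimicrobial-resistance': [
            {'name': 'amrfinderdb.tar.gz', 'last_update': '2021-02-18T12:00:00Z'}
        ],
        'ariba': [
            {'name': 'card.tar.gz', 'last_update': '2021-02-18T12:00:00Z'},
            {'name': 'vfdb_core.tar.gz', 'last_update': '2021-02-18T12:00:00Z'}
        ],
        'minmer': {
            'sketches': ['genbank-k21.json.gz', 'genbank-k31.json.gz',
                         'genbank-k51.json.gz', 'refseq-k21-s1000.msh'],
            'last_update': '2021-02-18T12:00:00Z'
        },
        'plasmid': {
            'sketches': 'plsdb.msh', 'blastdb': 'plsdb.fna',
            'last_update': '2021-02-18T12:00:00Z'
        },
        'species-specific': {
            'staphylococcus-aureus': {
                'minmer': {'mash': 'species-specific/staphylococcus-aureus/minmer/refseq-genomes.msh',
                           'last_updated': '2021-02-18T12:00:00Z'},
                'annotation': {'proteins': 'species-specific/staphylococcus-aureus/annotation/proteins.faa',
                               'last_updated': '2021-02-18T12:00:00Z'},
                'genome_size': {'min': 2600000, 'median': 2800000, 'mean': 2800000,
                                'max': 3000000, 'total': 100,
                                'description': 'Genome size values are based on 100 completed genomes (RefSeq).'},
                'mlst': {'saureus': {   # hypothetical schema name
                    'ariba': 'species-specific/staphylococcus-aureus/mlst/saureus/saureus-ariba.tar.gz',
                    'blast': 'species-specific/staphylococcus-aureus/mlst/saureus/saureus-blastdb.tar.gz',
                    'last_updated': '2021-02-18T12:00:00Z'}},
                'optional': {
                    'blast': ['species-specific/staphylococcus-aureus/optional/blast/genes',
                              'species-specific/staphylococcus-aureus/optional/blast/primers',
                              'species-specific/staphylococcus-aureus/optional/blast/proteins'],
                    'mapping-sequences': 'species-specific/staphylococcus-aureus/optional/mapping-sequences',
                    'reference-genomes': 'species-specific/staphylococcus-aureus/optional/reference-genomes'
                }
            }
        }
    }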
create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-prepare.py b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. + Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. 
Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more 
than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-pull.py b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
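Looking back at the FOFN assembled by bactopia prepare above, a short illustration of the tab-separated output it prints may help; the sample names and paths below are invented:

    # Invented FOFN rows in the format printed by bactopia prepare above;
    # columns are sample, runtype, r1, r2, extra.
    print('sample\truntype\tr1\tr2\textra')
    print('\t'.join(['sample01', 'paired-end', '/data/sample01_R1.fastq.gz', '/data/sample01_R2.fastq.gz', '']))
    print('\t'.join(['sample02', 'single-end', '/data/sample02.fastq.gz', '', '']))
    print('\t'.join(['sample03', 'assembly', '', '', '/data/sample03.fna.gz']))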
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-search.py b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA 
results.""" + accessions = [] + filtered = {'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
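+
+        # `query_filtered` holds the filter counts for the current query only;
+        # the running totals across every query are accumulated in `filtered`.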
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-summary.py b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-tools.py b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-versions.py b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/utilities/quality_control/qc_reads/bin/mask-consensus.py b/modules/utilities/quality_control/qc_reads/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/utilities/quality_control/qc_reads/bin/merge-blast-json.py b/modules/utilities/quality_control/qc_reads/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/utilities/quality_control/qc_reads/bin/mlst-blast.py b/modules/utilities/quality_control/qc_reads/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
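+
+example usage (file names are illustrative):
+  mlst-blast.py assembly.fna mlst-blastdb/ mlst-results.json --cpu 4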
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/utilities/quality_control/qc_reads/bin/select-references.py b/modules/utilities/quality_control/qc_reads/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
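+    The numeric part of the accession is split into groups of three digits to
+    build the FTP directory path, and the returned directory listing is scraped
+    for an entry that matches the accession.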
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/utilities/quality_control/qc_reads/bin/split-coverages.py b/modules/utilities/quality_control/qc_reads/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/utilities/quality_control/qc_reads/bin/update-conda.sh b/modules/utilities/quality_control/qc_reads/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
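+        # The three commands below create a throwaway conda env, export it
+        # (without build strings) to this environment's YAML, and record the
+        # YAML's md5 so later runs can tell whether the environment changed.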
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/quality_control/qc_reads/bin/update-docker.sh b/modules/utilities/quality_control/qc_reads/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
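+
+    # After the build, the image is pushed to Docker Hub, then re-tagged and
+    # pushed to any extra registries listed in REPOSITORY (plus a "latest" tag
+    # when one is given).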
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/utilities/quality_control/qc_reads/bin/update-tools.sh b/modules/utilities/quality_control/qc_reads/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
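+        # Note: "${6}" is an optional sixth argument passed straight through to
+        # `conda create` (it is empty for the calls at the bottom of this script).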
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/quality_control/qc_reads/bin/update-version.sh b/modules/utilities/quality_control/qc_reads/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/utilities/quality_control/qc_reads/nextflow.config b/modules/utilities/quality_control/qc_reads/nextflow.config new file mode 100644 index 000000000..1262ab0ff --- /dev/null +++ b/modules/utilities/quality_control/qc_reads/nextflow.config @@ -0,0 +1,50 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' 
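+    // Module-local manifest; the profiles below choose how the qc_reads
+    // process is run (conda environment, Docker container, or test settings).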
+    mainScript = 'main.nf'
+    version = '1.6.0'
+    nextflowVersion = '>=19'
+}
+
+
+profiles {
+
+    conda {
+        process {
+            withName: QC_READS {
+                conda = "${baseDir}/../../../../conda/envs/qc_reads-1.7.x"
+            }
+        }
+    }
+
+    docker {
+        process {
+            withName: QC_READS {
+                container = "ghcr.io/bactopia/qc_reads:1.6.0"
+            }
+
+        }
+    }
+
+    test {
+        process {
+            echo = true
+            withName: QC_READS {
+                cpus = 2
+                memory = "5 GB"
+                queue = 'long'
+            }
+
+        }
+        env {
+            VERSION = "1.6.0"
+            outdir = "test_output"
+            sample = "SRR2838702"
+            LOG_DIR = "qc_reads/"
+            final_sample_type = "paired-end"
+            single_end = "false"
+            run_type = "fastqs"
+        }
+
+    }
+}
diff --git a/modules/utilities/quality_control/qc_reads/qc_reads.nf b/modules/utilities/quality_control/qc_reads/qc_reads.nf
new file mode 100644
index 000000000..f6c1d35fc
--- /dev/null
+++ b/modules/utilities/quality_control/qc_reads/qc_reads.nf
@@ -0,0 +1,65 @@
+
+nextflow.enable.dsl = 2
+
+process QC_READS {
+    /* Cleanup the reads using Illumina-Cleanup */
+    tag "${sample}"
+
+    publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*"
+    publishDir "${outdir}/${sample}", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "quality-control/*"
+    publishDir "${outdir}/${sample}", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "*error.txt"
+
+    input:
+    tuple val(sample), val(sample_type), val(single_end), path(fq), path(extra), path(genome_size)
+
+    output:
+    file "*-error.txt" optional true
+    file "quality-control/*"
+    tuple val(sample), val(single_end),
+        path("quality-control/${sample}*.fastq.gz"), emit: READS, optional: true //, emit: COUNT_31MERS, ARIBA_ANALYSIS, MINMER_SKETCH, CALL_VARIANTS, MAPPING_QUERY optional true
+    tuple val(sample), val(sample_type), val(single_end),
+        path("quality-control/${sample}*.fastq.gz"), path(extra),
+        path(genome_size), emit: ASSEMBLY, optional: true
+
+    tuple val(sample), val(single_end),
+        path("quality-control/${sample}*.{fastq,error-fq}.gz"),
+        path(genome_size), emit: QC_FINAL_SUMMARY, optional: true
+    file "${task.process}/*" optional true
+
+    shell:
+    qc_ram = task.memory.toString().split(' ')[0]
+    is_assembly = sample_type.startsWith('assembly') ? true : false
+    qin = sample_type.startsWith('assembly') ? 'qin=33' : 'qin=auto'
+    adapters = params.adapters ? file(params.adapters) : 'adapters'
+    phix = params.phix ? file(params.phix) : 'phix'
+
+    template "qc_reads.sh"
+
+    stub:
+    """
+    mkdir quality-control
+    mkdir ${task.process}
+    touch ${sample}-error.txt
+    touch quality-control/${sample}.fastq.gz
+    touch quality-control/${sample}.error-fq.gz
+    touch ${task.process}/${sample}
+    """
+}
+
+
+//###############
+//Module testing
+//###############
+
+workflow test {
+
+    TEST_PARAMS_CH = Channel.of([
+        params.sample,
+        params.sample_type,
+        params.single_end,
+        file(params.fq),
+        file(params.extra),
+        file(params.genome_size)
+    ])
+    QC_READS(TEST_PARAMS_CH)
+}
diff --git a/modules/utilities/quality_control/qc_reads/templates/qc_reads.sh b/modules/utilities/quality_control/qc_reads/templates/qc_reads.sh
new file mode 100755
index 000000000..3ac43f544
--- /dev/null
+++ b/modules/utilities/quality_control/qc_reads/templates/qc_reads.sh
@@ -0,0 +1,229 @@
+#!/bin/bash
+set -e
+set -u
+LOG_DIR="qc_reads"
+mkdir -p quality-control
+mkdir -p ${LOG_DIR}
+ERROR=0
+GENOME_SIZE=`head -n 1 !{genome_size}`
+TOTAL_BP=$(( !{params.coverage}*${GENOME_SIZE} ))
+
+# Print captured STDERR in case of exit
+function print_stderr {
+    cat .command.err 1>&2
+    ls ${LOG_DIR}/ | grep ".err" | xargs -I {} cat ${LOG_DIR}/{} 1>&2
+}
+trap print_stderr EXIT
+
+echo "# Timestamp" > ${LOG_DIR}/qc_reads.versions
+date --iso-8601=seconds >> ${LOG_DIR}/qc_reads.versions
+echo "# BBMap (bbduk.sh, reformat.sh) Version" >> ${LOG_DIR}/qc_reads.versions
+bbduk.sh --version 2>&1 | grep " version" >> ${LOG_DIR}/qc_reads.versions 2>&1
+
+# Verify AWS files were staged
+if [[ ! -L "!{fq[0]}" ]]; then
+    if [ "!{single_end}" == "true" ]; then
+        check-staging.py --fq1 !{fq[0]} --extra !{extra} --genome_size !{genome_size} --is_single
+    else
+        check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} --extra !{extra} --genome_size !{genome_size}
+    fi
+fi
+
+if [ "!{params.skip_qc}" == "true" ]; then
+    echo "Sequence QC was skipped for !{sample}" > quality-control/!{sample}-qc-skipped.txt
+    if [[ -L "!{fq[0]}" ]]; then
+        if [ "!{single_end}" == "false" ]; then
+            # Paired-End Reads
+            ln -s `readlink !{fq[0]}` quality-control/!{sample}_R1.fastq.gz
+            ln -s `readlink !{fq[1]}` quality-control/!{sample}_R2.fastq.gz
+        else
+            # Single-End Reads
+            ln -s `readlink !{fq[0]}` quality-control/!{sample}.fastq.gz
+        fi
+    else
+        if [ "!{single_end}" == "false" ]; then
+            # Paired-End Reads
+            cp !{fq[0]} quality-control/!{sample}_R1.fastq.gz
+            cp !{fq[1]} quality-control/!{sample}_R2.fastq.gz
+        else
+            # Single-End Reads
+            cp !{fq[0]} quality-control/!{sample}.fastq.gz
+        fi
+    fi
+else
+    if [ "!{single_end}" == "false" ]; then
+        # Paired-End Reads
+        # Remove Adapters
+        bbduk.sh -Xmx!{qc_ram}g \
+            in=!{fq[0]} in2=!{fq[1]} \
+            out=adapter-r1.fq out2=adapter-r2.fq \
+            ref=!{adapters} \
+            k=!{params.adapter_k} \
+            ktrim=!{params.ktrim} \
+            mink=!{params.mink} \
+            hdist=!{params.hdist} \
+            tpe=!{params.tpe} \
+            tbo=!{params.tbo} \
+            threads=!{task.cpus} \
+            ftm=!{params.ftm} \
+            !{qin} ordered=t \
+            stats=${LOG_DIR}/bbduk-adapter.log 1> ${LOG_DIR}/bbduk-adapter.out 2> ${LOG_DIR}/bbduk-adapter.err
+
+        # Remove PhiX
+        bbduk.sh -Xmx!{qc_ram}g \
+            in=adapter-r1.fq in2=adapter-r2.fq \
+            out=phix-r1.fq out2=phix-r2.fq \
+            ref=!{phix} \
+            k=!{params.phix_k} \
+            hdist=!{params.hdist} \
+            tpe=!{params.tpe} \
+            tbo=!{params.tbo} \
+            qtrim=!{params.qtrim} \
+            trimq=!{params.trimq} \
+            minlength=!{params.minlength} \
+            minavgquality=!{params.maq} \
+            !{qin} qout=!{params.qout} \
+            tossjunk=!{params.tossjunk} \
+            threads=!{task.cpus} \
+            ordered=t \
+            stats=${LOG_DIR}/bbduk-phix.log 1>
${LOG_DIR}/bbduk-phix.out 2> ${LOG_DIR}/bbduk-phix.err + + # Error Correction + if [ "!{params.skip_error_correction}" == "false" ]; then + echo "# Lighter Version" >> ${LOG_DIR}/qc_reads.versions + lighter -v >> ${LOG_DIR}/qc_reads.versions 2>&1 + lighter -od . -r phix-r1.fq -r phix-r2.fq -K 31 ${GENOME_SIZE} -maxcor 1 -zlib 0 -t !{task.cpus} 1> ${LOG_DIR}/lighter.out 2> ${LOG_DIR}/lighter.err + else + echo "Skipping error correction" + ln -s phix-r1.fq phix-r1.cor.fq + ln -s phix-r2.fq phix-r2.cor.fq + fi + + # Reduce Coverage + if (( ${TOTAL_BP} > 0 )); then + reformat.sh -Xmx!{qc_ram}g \ + in=phix-r1.cor.fq in2=phix-r2.cor.fq \ + out=subsample-r1.fq out2=subsample-r2.fq \ + samplebasestarget=${TOTAL_BP} \ + sampleseed=!{params.sampleseed} \ + overwrite=t 1> ${LOG_DIR}/reformat.out 2> ${LOG_DIR}/reformat.err + else + echo "Skipping coverage reduction" + ln -s phix-r1.cor.fq subsample-r1.fq + ln -s phix-r2.cor.fq subsample-r2.fq + fi + + # Compress + pigz -p !{task.cpus} -c -n subsample-r1.fq > quality-control/!{sample}_R1.fastq.gz + pigz -p !{task.cpus} -c -n subsample-r2.fq > quality-control/!{sample}_R2.fastq.gz + else + # Single-End Reads + # Remove Adapters + bbduk.sh -Xmx!{qc_ram}g \ + in=!{fq[0]} \ + out=adapter-r1.fq \ + ref=!{adapters} \ + k=!{params.adapter_k} \ + ktrim=!{params.ktrim} \ + mink=!{params.mink} \ + hdist=!{params.hdist} \ + tpe=!{params.tpe} \ + tbo=!{params.tbo} \ + threads=!{task.cpus} \ + ftm=!{params.ftm} \ + ordered=t \ + stats=${LOG_DIR}/bbduk-adapter.log 1> ${LOG_DIR}/bbduk-adapter.out 2> ${LOG_DIR}/bbduk-adapter.err + + # Remove PhiX + bbduk.sh -Xmx!{qc_ram}g \ + in=adapter-r1.fq \ + out=phix-r1.fq \ + ref=!{phix} \ + k=!{params.phix_k} \ + hdist=!{params.hdist} \ + tpe=!{params.tpe} \ + tbo=!{params.tbo} \ + qtrim=!{params.qtrim} \ + trimq=!{params.trimq} \ + minlength=!{params.minlength} \ + minavgquality=!{params.maq} \ + qout=!{params.qout} \ + tossjunk=!{params.tossjunk} \ + threads=!{task.cpus} \ + ordered=t \ + stats=${LOG_DIR}/bbduk-phix.log 1> ${LOG_DIR}/bbduk-phix.out 2> ${LOG_DIR}/bbduk-phix.err + + # Error Correction + if [ "!{params.skip_error_correction}" == "false" ]; then + echo "# Lighter Version" >> ${LOG_DIR}/qc_reads.versions + lighter -v >> ${LOG_DIR}/qc_reads.versions 2>&1 + lighter -od . -r phix-r1.fq -K 31 ${GENOME_SIZE} -maxcor 1 -zlib 0 -t !{task.cpus} 1> ${LOG_DIR}/lighter.out 2> ${LOG_DIR}/lighter.err + else + echo "Skipping error correction" + ln -s phix-r1.fq phix-r1.cor.fq + fi + + # Reduce Coverage + if (( ${TOTAL_BP} > 0 )); then + reformat.sh -Xmx!{qc_ram}g \ + in=phix-r1.cor.fq \ + out=subsample-r1.fq \ + samplebasestarget=${TOTAL_BP} \ + sampleseed=!{params.sampleseed} \ + overwrite=t 1> ${LOG_DIR}/reformat.out 2> ${LOG_DIR}/reformat.err + else + echo "Skipping coverage reduction" + ln -s phix-r1.cor.fq subsample-r1.fq + fi + + # Compress + pigz -p !{task.cpus} -c -n subsample-r1.fq > quality-control/!{sample}.fastq.gz + fi + + if [ "!{params.keep_all_files}" == "false" ]; then + # Remove intermediate FASTQ files + rm *.fq + fi +fi + +echo "# fastq-scan Version" >> ${LOG_DIR}/qc_reads.versions +fastq-scan -v >> ${LOG_DIR}/qc_reads.versions 2>&1 +FINAL_BP=`gzip -cd quality-control/*.gz | fastq-scan | grep "total_bp" | sed -r 's/.*:[ ]*([0-9]+),/\1/'` +if [ ${FINAL_BP} -lt "!{params.min_basepairs}" ]; then + ERROR=1 + echo "After QC, !{sample} FASTQ(s) contain ${FINAL_BP} total basepairs. This does + not exceed the required minimum !{params.min_basepairs} bp. Further analysis + is discontinued." 
| \
+        sed 's/^\s*//' > !{sample}-low-sequence-depth-error.txt
+fi
+
+FINAL_READS=`gzip -cd quality-control/*.gz | fastq-scan | grep "read_total" | sed -r 's/.*:[ ]*([0-9]+),/\1/'`
+if [ ${FINAL_READS} -lt "!{params.min_reads}" ]; then
+    ERROR=1
+    echo "After QC, !{sample} FASTQ(s) contain ${FINAL_READS} total reads. This does
+    not exceed the required minimum !{params.min_reads} read count. Further analysis
+    is discontinued." | \
+        sed 's/^\s*//' > !{sample}-low-read-count-error.txt
+fi
+
+if [ "!{is_assembly}" == "true" ]; then
+    touch quality-control/reads-simulated-from-assembly.txt
+fi
+
+if [ "${ERROR}" -eq "1" ]; then
+    if [ "!{single_end}" == "false" ]; then
+        mv quality-control/!{sample}_R1.fastq.gz quality-control/!{sample}_R1.error-fq.gz
+        mv quality-control/!{sample}_R2.fastq.gz quality-control/!{sample}_R2.error-fq.gz
+    else
+        mv quality-control/!{sample}.fastq.gz quality-control/!{sample}.error-fq.gz
+    fi
+fi
+
+if [ "!{params.skip_logs}" == "false" ]; then
+    cp .command.err ${LOG_DIR}/qc_reads.err
+    cp .command.out ${LOG_DIR}/qc_reads.out
+    cp .command.sh ${LOG_DIR}/qc_reads.sh || :
+    cp .command.trace ${LOG_DIR}/qc_reads.trace || :
+else
+    rm -rf ${LOG_DIR}/
+fi
diff --git a/modules/utilities/quality_control/qc_reads/test_params.yaml b/modules/utilities/quality_control/qc_reads/test_params.yaml
new file mode 100644
index 000000000..28fe9ab83
--- /dev/null
+++ b/modules/utilities/quality_control/qc_reads/test_params.yaml
@@ -0,0 +1,119 @@
+outdir:
+  "test_output"
+
+sample:
+  "SRR2838702"
+
+sample_type:
+  "paired-end"
+
+single_end:
+  "false"
+
+fq:
+  "test_data/SRR2838702_R{1,2}.fastq.gz"
+
+extra:
+  "test_data/empty.fna.gz"
+
+genome_size:
+  "test_data/genome-size.txt"
+
+publish_mode:
+  "copy"
+
+run_type:
+  "fastqs"
+
+version:
+  "1.6.0"
+
+overwrite:
+  false
+
+no_cache:
+  false
+
+use_ena:
+  false
+
+coverage:
+  '100'
+
+max_retry:
+  3
+
+sampleseed:
+  42
+
+skip_logs:
+  false
+
+adapters:
+  null
+
+phix:
+  null
+
+skip_qc:
+  false
+
+adapter_k:
+  23
+
+ktrim:
+  'r'
+
+mink:
+  11
+
+hdist:
+  1
+
+tpe:
+  't'
+
+tbo:
+  't'
+
+ftm:
+  5
+
+phix_k:
+  31
+
+qtrim:
+  'rl'
+
+trimq:
+  6
+
+minlength:
+  35
+
+maq:
+  10
+
+qout:
+  33
+
+tossjunk:
+  't'
+
+skip_error_correction:
+  false
+
+keep_all_files:
+  false
+
+min_basepairs:
+  '2241820'
+
+min_reads:
+  '7472'
+
+
+
+
+
+
diff --git a/modules/utilities/sequence_type/README.md b/modules/utilities/sequence_type/README.md
new file mode 100644
index 000000000..af3d40654
--- /dev/null
+++ b/modules/utilities/sequence_type/README.md
@@ -0,0 +1,16 @@
+# sequence_type process testing:
+
+This process determines MLST types using ARIBA and BLAST.
+## About testing this process:
+
+Using DSL2, each module can be tested separately using a test workflow inside the process `.nf` file. Testing requires 3 items:
+- the local files in `test_data`
+- params in `test_params.yaml`
+- `test` profile in `nextflow.config`
+
+## How to test it:
+
+$ nextflow run sequence_type.nf -params-file test_params.yaml -profile test,docker -entry test
+
+
+If you've used `bactopia conda activate`, you can also swap `docker` for `conda` to test with Conda.
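+
+## Testing with Conda instead of Docker:
+
+A minimal sketch of the Conda variant mentioned above, assuming the module's Conda environment has already been built (for example with `bactopia build`) and that this module's `nextflow.config` defines a `conda` profile like the other modules in this patch (e.g. `qc_reads`) do:
+
+$ nextflow run sequence_type.nf -params-file test_params.yaml -profile test,conda -entry test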
diff --git a/modules/utilities/sequence_type/bin/build-containers.sh b/modules/utilities/sequence_type/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/utilities/sequence_type/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} . + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/utilities/sequence_type/bin/check-assembly-accession.py b/modules/utilities/sequence_type/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/utilities/sequence_type/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/utilities/sequence_type/bin/check-fastqs.py b/modules/utilities/sequence_type/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/utilities/sequence_type/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. 
Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/utilities/sequence_type/bin/check-staging.py b/modules/utilities/sequence_type/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/utilities/sequence_type/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+        )
+    )
+
+    parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.')
+    parser.add_argument('--fq2', metavar="STR", type=str, help='R2 Fastq.')
+    parser.add_argument('--extra', metavar="STR", type=str, help='Extra files')
+    parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file')
+    parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.')
+    parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end')
+    parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}')
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(0)
+
+    args = parser.parse_args()
+
+    if not args.is_single and args.fq2 == "null":
+        # This is an issue, both files are not present
+        sys.exit(80)
+
+    if args.fq1:
+        if not os.path.exists(args.fq1):
+            sys.exit(81)
+
+    if args.fq2:
+        if not os.path.exists(args.fq2):
+            sys.exit(82)
+
+    if args.extra:
+        if args.extra != "empty.fna.gz":
+            if not os.path.exists(args.extra):
+                sys.exit(90)
+
+    if args.genome_size:
+        if not os.path.exists(args.genome_size):
+            sys.exit(91)
+
+    if args.assembly:
+        if not os.path.exists(args.assembly):
+            sys.exit(92)
diff --git a/modules/utilities/sequence_type/bin/cleanup-coverage.py b/modules/utilities/sequence_type/bin/cleanup-coverage.py
new file mode 100755
index 000000000..98b131cd8
--- /dev/null
+++ b/modules/utilities/sequence_type/bin/cleanup-coverage.py
@@ -0,0 +1,75 @@
+#! /usr/bin/env python3
+"""
+usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE
+
+cleanup-coverage - Reduce redundancy in per-base coverage.
+
+positional arguments:
+  COVERAGE    Output from genomeBedCoverage
+
+optional arguments:
+  -h, --help  show this help message and exit
+  --version   show program's version number and exit
+"""
+PROGRAM = "cleanup-coverage"
+VERSION = "1.6.0"
+import sys
+
+def read_coverage(coverage):
+    """Read the per-base coverage input."""
+    import re
+    accession = None
+    length = None
+    first_line = True
+    coverages = {}
+    with open(coverage, 'rt') as coverage_fh:
+        for line in coverage_fh:
+            line = line.rstrip()
+            if line.startswith('##'):
+                # e.g. ##contig=<ID=accession,length=length>
+                contig = re.search(r'contig=<ID=(.*),length=(.*)>', line)
+                if contig:
+                    accession = contig.group(1)
+                    length = contig.group(2)
+                    coverages[accession] = {'length':int(length), 'positions': []}
+                else:
+                    print(f'{line} is an unexpected format.', file=sys.stderr)
+                    sys.exit(1)
+            else:
+                accession, position, coverage = line.split('\t')
+                coverages[accession]['positions'].append(int(coverage))
+
+    for accession, vals in coverages.items():
+        if len(vals['positions']) != vals['length']:
+            print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr)
+            sys.exit(1)
+
+    return coverages
+
+if __name__ == '__main__':
+    import argparse as ap
+    import sys
+
+    parser = ap.ArgumentParser(
+        prog=PROGRAM,
+        conflict_handler='resolve',
+        description=(
+            f'{PROGRAM} (v{VERSION}) - Reduce redundancy in per-base coverage.'
+        )
+    )
+    parser.add_argument('coverage', metavar="COVERAGE", type=str,
+                        help='Output from genomeBedCoverage')
+    parser.add_argument('--version', action='version',
+                        version=f'{PROGRAM} {VERSION}')
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(0)
+
+    args = parser.parse_args()
+
+    coverages = read_coverage(args.coverage)
+    for accession, vals in coverages.items():
+        print(f'##contig=<ID={accession},length={vals["length"]}>')
+        for cov in vals['positions']:
+            print(cov)
diff --git a/modules/utilities/sequence_type/bin/create-tool.sh b/modules/utilities/sequence_type/bin/create-tool.sh
new file mode 100755
index 000000000..d629191ba
--- /dev/null
+++ b/modules/utilities/sequence_type/bin/create-tool.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# create-tool
+#
+# Create a blank tool.
+VERSION=1.6.0
+
+if [[ $# == 0 ]]; then
+    echo ""
+    echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION"
+    echo ""
+    echo "Example Command"
+    echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' "
+    echo ""
+    exit
+fi
+
+BACTOPIA_DIR=$1
+TOOL=$2
+DESCRIPTION=$3
+if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then
+    echo "Got ${#} arguments"
+    echo "Must give a path to Bactopia repository, tool name and tool description."
+    exit 1
+fi
+
+if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then
+    cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL}
+    sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile
+    sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity
+    sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config
+    sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config
+else
+    echo "${TOOL} exists already, please verify. Not going to replace, exiting..."
+    exit 1
+fi
diff --git a/modules/utilities/sequence_type/bin/gh-actions/free-disk-space.sh b/modules/utilities/sequence_type/bin/gh-actions/free-disk-space.sh
new file mode 100755
index 000000000..3ebc27d75
--- /dev/null
+++ b/modules/utilities/sequence_type/bin/gh-actions/free-disk-space.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Author: Robert Metzger
+# Github: https://github.com/rmetzger
+# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh
+
+#
+# The Azure provided machines typically have the following disk allocation:
+# Total space: 85GB
+# Allocated: 67 GB
+# Free: 17 GB
+# This script frees up 28 GB of disk space by deleting unneeded packages and
+# large directories.
+# The Flink end to end tests download and generate more than 17 GB of files,
+# causing unpredictable behavior and build failures.
+# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/utilities/sequence_type/bin/gh-actions/setup-bactopia-env.sh b/modules/utilities/sequence_type/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/utilities/sequence_type/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/utilities/sequence_type/bin/gh-actions/setup-docker-builds.py b/modules/utilities/sequence_type/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/utilities/sequence_type/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. 
+ --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + 
logging.info(f'Building on {image}') + execute(f'docker build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not 
args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/utilities/sequence_type/bin/helpers/bactopia-build.py b/modules/utilities/sequence_type/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/utilities/sequence_type/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, 
directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + 
envbuilt_file = f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/utilities/sequence_type/bin/helpers/bactopia-citations.py b/modules/utilities/sequence_type/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/utilities/sequence_type/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/sequence_type/bin/helpers/bactopia-datasets.py b/modules/utilities/sequence_type/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/utilities/sequence_type/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! /usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. 
+ +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. 
This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up 
{request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', 
directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! + Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + 
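The step below shells out to ncbi-genome-download with --dry-run, which prints one tab-separated "accession<TAB>organism name" line per candidate assembly; the loop that follows keeps every accession while separately tracking the ones whose organism name matches the requested species, and --limit then takes a random subset that is forced to contain at least one genome of that species. A minimal, self-contained sketch of that parsing step, using a fabricated listing and made-up accessions:

import random

# Fabricated stand-in for `ncbi-genome-download ... --dry-run` output.
dry_run_output = (
    "Considering the following 3 assemblies for download:\n"
    "GCF_000000001.1\tStaphylococcus aureus strain A\n"
    "GCF_000000002.1\tStaphylococcus aureus strain B\n"
    "GCF_000000003.1\tStaphylococcus argenteus strain C"
)
species = "Staphylococcus aureus"

all_accessions = {}       # accession -> organism name
species_accessions = []   # accessions matching the requested species
for line in dry_run_output.split("\n"):
    if line and not line.startswith("Considering"):
        accession, name = line.split("\t", 1)
        all_accessions[accession] = name
        if name.startswith(species):
            species_accessions.append(accession)

# With --limit, a random subset is taken, but at least one genome of the
# requested species is kept so genome-size statistics can still be computed.
limit = 2
subset = random.sample(list(all_accessions), limit)
if not any(acc in species_accessions for acc in subset):
    subset.append(random.sample(species_accessions, 1)[0])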
results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' 
+                                )
+
+            total_genome = len(genome_sizes)
+            if not skip_genome_size:
+                median_genome = int(median(genome_sizes))
+                logging.info(
+                    f'Median genome size: {median_genome} (n={total_genome})'
+                )
+            cdhit_cds = f'{prokka_dir}/proteins.faa'
+            logging.info(f'Running CD-HIT on {count} proteins')
+            g = 0 if fast_cluster else 1
+            execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} '
+                     f'-g {g} -c {identity} -T {cpus} -M {max_memory}'))
+
+            # Make sketch/signatures
+            execute(
+                f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn',
+                directory=minmer_dir
+            )
+
+            # Finish up
+            with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh:
+                gs_dict = {
+                    'min': 0, 'median': 0, 'mean': 0, 'max': 0, 'total': 0,
+                    'description': 'No available completed genomes.'
+                }
+                if not skip_genome_size:
+                    gs_dict = {
+                        'min': min(genome_sizes),
+                        'median': int(median(genome_sizes)),
+                        'mean': int(mean(genome_sizes)),
+                        'max': max(genome_sizes),
+                        'total': total_genome,
+                        'description': (
+                            f'Genome size values are based on {total_genome} '
+                            'completed genomes (RefSeq).'
+                        )
+                    }
+                json.dump(gs_dict, genome_size_fh, indent=4)
+            execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt',
+                    directory=prokka_dir)
+            execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt',
+                    directory=prokka_dir)
+            execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt',
+                    directory=prokka_dir)
+            execute(
+                f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt',
+                directory=prokka_dir
+            )
+            execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt',
+                    directory=minmer_dir)
+
+            # Clean up
+            if not keep_files:
+                execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/')
+
+    else:
+        logging.info("No valid species to setup, skipping")
+
+
+def setup_amr(outdir, force=False):
+    """Download the latest antimicrobial resistance datasets."""
+    datasets = ['amrfinder']
+    amr_dir = f'{outdir}/antimicrobial-resistance'
+    update_timestamp = False
+    execute(f'mkdir -p {amr_dir}')
+
+    for dataset in datasets:
+        dataset_file = f'{amr_dir}/{dataset}.tar.gz'
+        if os.path.exists(dataset_file):
+            if force:
+                logging.info(f'--force, removing existing {dataset_file} setup')
+                execute(f'rm -f {dataset_file}')
+                update_timestamp = True
+            else:
+                logging.info(f'{dataset_file} exists, skipping')
+                continue
+
+        if dataset == 'amrfinder':
+            logging.info(f'Setting up latest AMRFinder+ database')
+            prefix = 'amrfinderdb'
+            execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir)
+            execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir)
+            execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir)
+            latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest')
+            execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir)
+            execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir)
+            execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir)
+            execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir)
+            logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz')
+
+
+def setup_minmer(outdir, force=False):
+    """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets."""
+    datasets = {
+        # Last updated: 2019-03-04
+        'genbank-k21.json.gz': 'https://osf.io/d7rv8/download',
+        'genbank-k31.json.gz': 'https://osf.io/4f8n3/download',
+        'genbank-k51.json.gz': 'https://osf.io/nemkw/download',
+        'refseq-k21-s1000.msh': (
+            'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh'
+        )
+    }
+
+    minmer_dir = f'{outdir}/minmer'
+    update_timestamp = False
+    if force:
+
logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if 
os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return 
logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. 
(Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. 
(Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + 
args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/utilities/sequence_type/bin/helpers/bactopia-prepare.py b/modules/utilities/sequence_type/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/utilities/sequence_type/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. 
Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. + Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. 
Default: Use absolute path'
+    )
+
+    parser.add_argument('--version', action='version',
+                        version=f'{PROGRAM} {VERSION}')
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(0)
+
+    args = parser.parse_args()
+
+    # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob
+    abspath = os.path.abspath(args.path)
+    SAMPLES = {}
+
+    # Match FASTQs
+    for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive):
+        fastq_name = fastq.name.replace(args.fastq_ext, "")
+        # Split the FASTQ file name on the separator
+        # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE)
+        # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE)
+        split_vals = fastq_name.rsplit(args.fastq_seperator, 1)
+        sample_name = split_vals[0]
+        if sample_name not in SAMPLES:
+            SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []}
+
+        if len(split_vals) == 1:
+            # single-end
+            SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix))
+        else:
+            # paired-end
+            pe1 = re.compile(args.pe1_pattern)
+            pe2 = re.compile(args.pe2_pattern)
+            if pe1.match(split_vals[1]):
+                SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix))
+            elif pe2.match(split_vals[1]):
+                SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix))
+            else:
+                print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr)
+                print(f'ERROR: Found {split_vals[1]}, expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr)
+                print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr)
+                sys.exit(1)
+
+    # Match assemblies
+    for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive):
+        sample_name = os.path.basename(assembly).replace(args.assembly_ext, "")
+        # Use the assembly file name (minus its extension) as the sample name
+        if sample_name not in SAMPLES:
+            SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []}
+        SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix))
+
+    FOFN = []
+    for sample, vals in sorted(SAMPLES.items()):
+        r1_reads = vals['pe']['r1']
+        r2_reads = vals['pe']['r2']
+        se_reads = vals['se']
+        assembly = vals['assembly']
+        errors = []
+        is_single_end = False
+        multiple_read_sets = False
+        pe_count = len(r1_reads) + len(r2_reads)
+
+        # Validate everything
+        if len(assembly) > 1:
+            # Can't have multiple assemblies for the same sample
+            errors.append(f'ERROR: "{sample}" cannot have more than one assembly FASTA, please check.')
+        elif len(assembly) == 1 and (pe_count or len(se_reads)):
+            # Can't have an assembly and reads for a sample
+            errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.')
+
+        if len(r1_reads) != len(r2_reads):
+            # PE reads must be a pair
+            errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}), please check.')
+        elif pe_count > 2:
+            # PE reads must be a pair
+            if args.merge:
+                multiple_read_sets = True
+            else:
+                errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.')
+
+        if args.long_reads:
+            if not pe_count and len(se_reads):
+                # Long reads must also have short PE reads
+                print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr)
+                is_single_end = True
+
else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/utilities/sequence_type/bin/helpers/bactopia-pull.py b/modules/utilities/sequence_type/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/utilities/sequence_type/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/utilities/sequence_type/bin/helpers/bactopia-search.py b/modules/utilities/sequence_type/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/utilities/sequence_type/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + accessions = [] + 
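The filtering below keeps Illumina runs only, and estimates a mean read length from ENA's base_count and read_count fields, dividing by the number of FASTQ files so paired-end runs are not counted twice. A small worked sketch of that estimate, with fabricated ENA-style field values (real values come from the TSV returned by ena_search):

# Fabricated ENA-style record for illustration only.
record = {
    'fastq_bytes': '250000000;250000000',  # one entry per FASTQ file (paired-end here)
    'read_count': '2000000',
    'base_count': '1000000000',
}

total_fastqs = len(record['fastq_bytes'].rstrip(';').split(';'))  # 2
read_length = int(float(record['base_count']) / (float(record['read_count']) * total_fastqs))
print(read_length)  # 250; a --min_read_length above 250 would drop this run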
filtered = {'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
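+        # Note: results and accessions are merged across queries with set(), so records returned by overlapping queries are only counted once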
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/utilities/sequence_type/bin/helpers/bactopia-summary.py b/modules/utilities/sequence_type/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/utilities/sequence_type/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/utilities/sequence_type/bin/helpers/bactopia-tools.py b/modules/utilities/sequence_type/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/utilities/sequence_type/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/utilities/sequence_type/bin/helpers/bactopia-versions.py b/modules/utilities/sequence_type/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/utilities/sequence_type/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/utilities/sequence_type/bin/mask-consensus.py b/modules/utilities/sequence_type/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/utilities/sequence_type/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' 
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/utilities/sequence_type/bin/merge-blast-json.py b/modules/utilities/sequence_type/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/utilities/sequence_type/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/utilities/sequence_type/bin/mlst-blast.py b/modules/utilities/sequence_type/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/utilities/sequence_type/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
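+ +example usage (illustrative; paths are placeholders): + mlst-blast.py assembly.fna.gz mlst-blastdb/ results.json --cpu 4 --compressed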
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/utilities/sequence_type/bin/select-references.py b/modules/utilities/sequence_type/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/utilities/sequence_type/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse an accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select a random genome on matching Mash distances. ' + '(Default: Earliest accession)' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it is no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/utilities/sequence_type/bin/split-coverages.py b/modules/utilities/sequence_type/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/utilities/sequence_type/bin/split-coverages.py @@ -0,0 +1,69 @@ +#!
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/utilities/sequence_type/bin/update-conda.sh b/modules/utilities/sequence_type/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/utilities/sequence_type/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
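+        # (GNU coreutils and sed are added to the environment so Linux-style command flags keep working on macOS)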
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/sequence_type/bin/update-docker.sh b/modules/utilities/sequence_type/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/utilities/sequence_type/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
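+    # The trailing '.' uses the current working directory as the Docker build context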
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/utilities/sequence_type/bin/update-tools.sh b/modules/utilities/sequence_type/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/utilities/sequence_type/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
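+        # (the exported YAML below drops the machine-specific 'prefix:' line and is stamped with the Bactopia version)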
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/utilities/sequence_type/bin/update-version.sh b/modules/utilities/sequence_type/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/utilities/sequence_type/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/utilities/sequence_type/nextflow.config b/modules/utilities/sequence_type/nextflow.config new file mode 100644 index 000000000..5dcd5b005 --- /dev/null +++ b/modules/utilities/sequence_type/nextflow.config @@ -0,0 +1,48 @@ +manifest { + author = 'Robert A. Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' + mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + conda { + process { + withName: sequence_type { + conda = "${baseDir}/../../../conda/envs/sequence_type-1.7.x"} + } + } + + docker { + process { + withName: sequence_type { + container = "ghcr.io/bactopia/sequence_type:1.6.0"} + + } + } + + test { + process { + echo = true + withName: sequence_type { + cpus = 2 + queue = 'long' + } + + } + env { + MLST_DATABASES = ["MLST"] + VERSION = "1.6.0" + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "false" + run_type = "fastqs" + } + + } +} diff --git a/modules/utilities/sequence_type/sequence_type.nf b/modules/utilities/sequence_type/sequence_type.nf new file mode 100644 index 000000000..a6aa2247c --- /dev/null +++ b/modules/utilities/sequence_type/sequence_type.nf @@ -0,0 +1,60 @@ +nextflow.enable.dsl = 2 + +process SEQUENCE_TYPE { + /* Determine MLST types using ARIBA and BLAST */ + tag "${sample} - ${schema} - ${method}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}/mlst/${schema}", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${method}/*" + + input: + tuple val(sample), val(single_end), path(fq), path(assembly) + each path(dataset) + + output: + file "${method}/*" + file "${task.process}/*" optional true + + when: + MLST_DATABASES.isEmpty() == false + + shell: + method = dataset =~ /.*blastdb.*/ ? 
'blast' : 'ariba' + dataset_tarball = path(dataset).getName() + dataset_name = dataset_tarball.replace('.tar.gz', '').split('-')[1] + schema = dataset_tarball.split('-')[0] + noclean = params.ariba_no_clean ? "--noclean" : "" + spades_options = params.spades_options ? "--spades_options '${params.spades_options}'" : "" + + template "sequence_type.sh" + + stub: + method = dataset =~ /.*blastdb.*/ ? 'blast' : 'ariba' + dataset_tarball = path(dataset).getName() + schema = dataset_tarball.split('-')[0] + """ + mkdir ${method} + mkdir ${task.process} + touch ${method}/${sample} + touch ${task.process}/${sample} + """ +} + +//############### +//Module testing +//############### + +workflow test{ + + TEST_PARAMS_CH = Channel.of([ + params.sample, + params.single_end, + path(params.fq), + path(params.assembly) + ]) + TEST_PARAMS_CH2 = Channel.of( + path(params.dataset_blast) + path(params.dataset_ariba)) + + sequence_type(TEST_PARAMS_CH,TEST_PARAMS_CH2.collect()) +} diff --git a/modules/utilities/sequence_type/templates/sequence_type.sh b/modules/utilities/sequence_type/templates/sequence_type.sh new file mode 100644 index 000000000..812b8487d --- /dev/null +++ b/modules/utilities/sequence_type/templates/sequence_type.sh @@ -0,0 +1,60 @@ +#!/bin/bash +set -e +set -u +LOG_DIR="!{task.process}" +tar -xzvf !{dataset_tarball} +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}-!{method}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}-!{method}.versions + +# Verify AWS files were staged +if [[ ! -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --assembly !{assembly} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} --assembly !{assembly} + fi +fi + +if [ "!{method}" == "blast" ]; then + echo "# mlst-blast.py Version" >> ${LOG_DIR}/!{task.process}-!{method}.versions + mlst-blast.py --version >> ${LOG_DIR}/!{task.process}-!{method}.versions 2>&1 + mkdir -p blast + if [[ !{params.compress} == "true" ]]; then + mlst-blast.py !{assembly} !{dataset_name} blast/!{sample}-blast.json \ + --cpu !{task.cpus} --compressed + else + mlst-blast.py !{assembly} !{dataset_name} blast/!{sample}-blast.json \ + --cpu !{task.cpus} + fi +elif [ "!{method}" == "ariba" ]; then + if [ "!{single_end}" == "false" ]; then + echo "# Ariba Version" >> ${LOG_DIR}/!{task.process}-!{method}.versions + ariba version >> ${LOG_DIR}/!{task.process}-!{method}.versions 2>&1 + mv !{dataset_name}/ref_db ./ + ariba run ref_db !{fq[0]} !{fq[1]} ariba \ + --nucmer_min_id !{params.nucmer_min_id} \ + --nucmer_min_len !{params.nucmer_min_len} \ + --nucmer_breaklen !{params.nucmer_breaklen} \ + --assembly_cov !{params.assembly_cov} \ + --min_scaff_depth !{params.min_scaff_depth} \ + --assembled_threshold !{params.assembled_threshold} \ + --gene_nt_extend !{params.gene_nt_extend} \ + --unique_threshold !{params.unique_threshold} \ + --threads !{task.cpus} \ + --force \ + --verbose !{noclean} !{spades_options} + else + mkdir -p ariba + echo "Ariba cannot be run on single end reads" > ariba/ariba-not-run.txt + fi +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}-!{method}.err + cp .command.out ${LOG_DIR}/!{task.process}-!{method}.out + cp .command.sh ${LOG_DIR}/!{task.process}-!{method}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}-!{method}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/utilities/sequence_type/test_params.yaml b/modules/utilities/sequence_type/test_params.yaml new 
file mode 100644 index 000000000..045f3ce4c --- /dev/null +++ b/modules/utilities/sequence_type/test_params.yaml @@ -0,0 +1,71 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +sample_type: + "paired-end" + +single_end: + "false" + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" + +dataset_blast: + "test_data/default-blastdb.tar.gz" + +dataset_ariba: + "test_data/default-ariba.tar.gz" + +assembly: + "test_data/SRR2838702.fna.gz" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + false + +spades_options: + null + +ariba_no_clean: + false + +compress: + false + +nucmer_min_id: + 90 + +nucmer_min_len: + 20 + +nucmer_breaklen: + 200 + +assembly_cov: + 50 + +min_scaff_depth: + 10 + +assembled_threshold: + 0.95 + +gene_nt_extend: + 30 + +unique_threshold: + 0.03 + +skip_logs: + false diff --git a/modules/variant_calling/call_variants/README.md b/modules/variant_calling/call_variants/README.md new file mode 100644 index 000000000..b6ca747ab --- /dev/null +++ b/modules/variant_calling/call_variants/README.md @@ -0,0 +1,17 @@ +# call_variants process testing: + +This process identifies variants (SNPs/InDels) against a set of reference genomes using Snippy. + +## About testing this process: + +Using DSL2, each module can be tested separately with a test workflow inside the process's .nf file. Testing requires 3 items: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run call_variants.nf -params-file test_params.yaml -profile test,docker -entry test + + +If you've used `bactopia conda activate`, you can also replace `docker` with `conda` to test with Conda. diff --git a/modules/variant_calling/call_variants/bin/build-containers.sh b/modules/variant_calling/call_variants/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} .
+ docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/variant_calling/call_variants/bin/check-assembly-accession.py b/modules/variant_calling/call_variants/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/variant_calling/call_variants/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/variant_calling/call_variants/bin/check-fastqs.py b/modules/variant_calling/call_variants/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/variant_calling/call_variants/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please \n" + "investigate these FASTQs. 
Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/variant_calling/call_variants/bin/check-staging.py b/modules/variant_calling/call_variants/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/variant_calling/call_variants/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" + + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' 
+ ) + ) + + parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.') + parser.add_argument('--fq2', metavar="STR", type=str, help='R2 Fastq.') + parser.add_argument('--extra', metavar="STR", type=str, help='Extra files') + parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file') + parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.') + parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + if not args.is_single and args.fq2 == "null": + # This is an issue, both files are not present + sys.exit(80) + + if args.fq1: + if not os.path.exists(args.fq1): + sys.exit(81) + + if args.fq2: + if not os.path.exists(args.fq2): + sys.exit(82) + + if args.extra: + if args.extra != "empty.fna.gz": + if not os.path.exists(args.extra): + sys.exit(90) + + if args.genome_size: + if not os.path.exists(args.genome_size): + sys.exit(91) + + if args.assembly: + if not os.path.exists(args.assembly): + sys.exit(92) diff --git a/modules/variant_calling/call_variants/bin/cleanup-coverage.py b/modules/variant_calling/call_variants/bin/cleanup-coverage.py new file mode 100755 index 000000000..98b131cd8 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "1.6.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig=<ID=accession,length=length> + contig = re.search(r'contig=<ID=(.*),length=([0-9]+)>', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Reduce redundancy in per-base coverage.'
+ ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Output from genomeBedCoverage') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=<ID={accession},length={vals["length"]}>') + for cov in vals['positions']: + print(cov) diff --git a/modules/variant_calling/call_variants/bin/create-tool.sh b/modules/variant_calling/call_variants/bin/create-tool.sh new file mode 100755 index 000000000..d629191ba --- /dev/null +++ b/modules/variant_calling/call_variants/bin/create-tool.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# create-tool +# +# Create a blank tool. +VERSION=1.6.0 + +if [[ $# == 0 ]]; then + echo "" + echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION" + echo "" + echo "Example Command" + echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +TOOL=$2 +DESCRIPTION=$3 +if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then + echo "Got ${#} arguments" + echo "Must give a path to Bactopia repository, tool name and tool description." + exit 1 +fi + +if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then + cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL} + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity + sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config + sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config +else + echo "${TOOL} exists already, please verify. Not going to replace, exiting..." + exit 1 +fi diff --git a/modules/variant_calling/call_variants/bin/gh-actions/free-disk-space.sh b/modules/variant_calling/call_variants/bin/gh-actions/free-disk-space.sh new file mode 100755 index 000000000..3ebc27d75 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/gh-actions/free-disk-space.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Robert Metzger +# Github: https://github.com/rmetzger +# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh + +# +# The Azure provided machines typically have the following disk allocation: +# Total space: 85GB +# Allocated: 67 GB +# Free: 17 GB +# This script frees up 28 GB of disk space by deleting unneeded packages and +# large directories.
+# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. +# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/variant_calling/call_variants/bin/gh-actions/setup-bactopia-env.sh b/modules/variant_calling/call_variants/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/variant_calling/call_variants/bin/gh-actions/setup-docker-builds.py b/modules/variant_calling/call_variants/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. 
+ --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} 
{tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = 
f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/variant_calling/call_variants/bin/helpers/bactopia-build.py b/modules/variant_calling/call_variants/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/variant_calling/call_variants/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 
* retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') 
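For quick reference, a hedged usage sketch of the bactopia-build.py helper above, based on its own usage text and flags; the repository and install paths are placeholder examples, not part of this patch:

# Build the bundled Conda environments into a local prefix (paths are hypothetical placeholders)
$ bactopia-build.py /path/to/bactopia/conda /opt/bactopia-envs --ext yml --include_tools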
diff --git a/modules/variant_calling/call_variants/bin/helpers/bactopia-citations.py b/modules/variant_calling/call_variants/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. + --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/variant_calling/call_variants/bin/helpers/bactopia-datasets.py b/modules/variant_calling/call_variants/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! 
/usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. + +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). 
+ --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. + +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command 
found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + 
logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up {request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + 
logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! 
+ Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes 
found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' + ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' 
+ ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + 
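# An existing PLSDB setup is reused unless --force is given +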
logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + 
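# A clustered proteins.faa exists for this species, record its path and last-update timestamp +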
new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 
'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. (Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. 
AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. (Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' 
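+ # Configure logging before the dependency checks so --silent/--verbose apply to everything below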
+ logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + 
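# Finish by writing summary.json, a catalog of every dataset now available under --outdir +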
create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/variant_calling/call_variants/bin/helpers/bactopia-prepare.py b/modules/variant_calling/call_variants/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/variant_calling/call_variants/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. + Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. 
Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more 
than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/variant_calling/call_variants/bin/helpers/bactopia-pull.py b/modules/variant_calling/call_variants/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/variant_calling/call_variants/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/variant_calling/call_variants/bin/helpers/bactopia-search.py b/modules/variant_calling/call_variants/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA results.""" + 
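# Keep only ILLUMINA runs with FASTQ files that pass the optional read-length/base-count filters, rejected runs are tallied in 'filtered' +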
accessions = [] + filtered = {'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/variant_calling/call_variants/bin/helpers/bactopia-summary.py b/modules/variant_calling/call_variants/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/variant_calling/call_variants/bin/helpers/bactopia-tools.py b/modules/variant_calling/call_variants/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/variant_calling/call_variants/bin/helpers/bactopia-versions.py b/modules/variant_calling/call_variants/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/variant_calling/call_variants/bin/mask-consensus.py b/modules/variant_calling/call_variants/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/variant_calling/call_variants/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+ --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "1.6.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig=<ID=name,length=length> + contig = re.search(r'contig=<ID=(.*),length=([0-9]+)>', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)}) for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.'
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/variant_calling/call_variants/bin/merge-blast-json.py b/modules/variant_calling/call_variants/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/variant_calling/call_variants/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/variant_calling/call_variants/bin/mlst-blast.py b/modules/variant_calling/call_variants/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/variant_calling/call_variants/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
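+ +example (illustrative only; the FASTA, BLAST database directory and output paths below are placeholders, not files shipped with Bactopia): + mlst-blast.py assembly.fna.gz mlst/blastdb results.json --cpu 4 --compressed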
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/variant_calling/call_variants/bin/select-references.py b/modules/variant_calling/call_variants/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/variant_calling/call_variants/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse an accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select a random genome on matching Mash distances. ' + '(Default: Earliest accession)' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it is no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/variant_calling/call_variants/bin/split-coverages.py b/modules/variant_calling/call_variants/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/split-coverages.py @@ -0,0 +1,69 @@ +#!
/usr/bin/env python3 +""" +""" +PROGRAM = "split-coverages" +VERSION = "1.6.0" + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry' + ) + ) + + parser.add_argument( + 'mapping', metavar="FILE", type=str, + help='Tab-delimited file used to map entry names to original fasta file.' + ) + parser.add_argument( + 'coverage', metavar="FILE", type=str, + help='genomeCoverageBed output file' + ) + parser.add_argument( + '--outdir', metavar="STR", type=str, default='coverages', + help='Directory to output split coverages into. (Default: coverages)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mappings = {} + with open(args.mapping, 'rt') as mapping_fh: + for line in mapping_fh: + fasta, entry = line.rstrip().split('\t') + mappings[entry] = fasta + + coverages = {} + with open(args.coverage, 'rt') as coverage_fh: + for line in coverage_fh: + entry, position, depth = line.rstrip().split('\t') + if mappings[entry] not in coverages: + coverages[mappings[entry]] = {} + + if entry not in coverages[mappings[entry]]: + coverages[mappings[entry]][entry] = [] + + coverages[mappings[entry]][entry].append(depth) + + if not os.path.exists(args.outdir): + os.makedirs(args.outdir) + + for fasta in coverages: + with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out: + total_entries = len(coverages[fasta]) + coverage_out.write(f'##total={total_entries}\n') + for entry, depths in coverages[fasta].items(): + coverage_out.write(f'##contig=<ID={entry},length={len(depths)}>\n') + for depth in depths: + coverage_out.write(f'{depth}\n') + \ No newline at end of file diff --git a/modules/variant_calling/call_variants/bin/update-conda.sh b/modules/variant_calling/call_variants/bin/update-conda.sh new file mode 100755 index 000000000..5ef7f31c4 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/update-conda.sh @@ -0,0 +1,67 @@ +#! /bin/bash +# Updates the conda environment yamls to bump to latest software versions. +set -x +set -e +if [[ $# == 0 ]]; then + echo "" + echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-conda.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi + + +CONDA_DIR=$1/conda +DOCKER_DIR=$1/containers +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + CONDA_DIR="${CONDA_DIR}/mac" + IS_MAC=1 +else + echo "Creating Linux yamls" + CONDA_DIR="${CONDA_DIR}/linux" +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac + echo "Working on ${1}" + + if [ "$6" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...)
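+ # (descriptive note: "coreutils" and "sed" are appended to the package list below so the GNU versions from conda-forge are used on macOS)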
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/variant_calling/call_variants/bin/update-docker.sh b/modules/variant_calling/call_variants/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/variant_calling/call_variants/bin/update-tools.sh b/modules/variant_calling/call_variants/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/variant_calling/call_variants/bin/update-version.sh b/modules/variant_calling/call_variants/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/variant_calling/call_variants/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/variant_calling/call_variants/call_variants.nf b/modules/variant_calling/call_variants/call_variants.nf new file mode 100644 index 000000000..03f040027 --- /dev/null +++ b/modules/variant_calling/call_variants/call_variants.nf @@ -0,0 +1,56 @@ +nextflow.enable.dsl = 2 + +process CALL_VARIANTS { + /* + Identify variants (SNPs/InDels) against a set of reference genomes + using Snippy. + */ + tag "${sample} - ${reference_name}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}/variants/user", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${reference_name}/*" + + input: + tuple val(sample), val(single_end), path(fq) + each path(reference) + + output: + path "${reference_name}/*" + path "${task.process}/*" optional true + + when: + REFERENCES.isEmpty() == false + + shell: + snippy_ram = task.memory.toString().split(' ')[0] + reference_name = reference.getSimpleName() + fastq = single_end ? "--se ${fq[0]}" : "--R1 ${fq[0]} --R2 ${fq[1]}" + bwaopt = params.bwaopt ? "--bwaopt 'params.bwaopt'" : "" + fbopt = params.fbopt ? "--fbopt 'params.fbopt'" : "" + template "call_variants.sh" + + stub: + reference_name = reference.getSimpleName() + """ + mkdir ${reference_name} + mkdir ${task.process} + touch ${reference_name}/* + touch ${task.process}/* + """ +} + +//############### +//Module testing +//############### + +workflow test { + TEST_PARAMS_CH = Channel.of([ + params.sample, + params.single_end, + path(params.fq), + ]) + TEST_PARAMS_CH2 = Channel.of( + path(params.reference) + ) + call_variants(TEST_PARAMS_CH,TEST_PARAMS_CH2.collect) +} diff --git a/modules/variant_calling/call_variants/nextflow.config b/modules/variant_calling/call_variants/nextflow.config new file mode 100644 index 000000000..5a5ff9ffb --- /dev/null +++ b/modules/variant_calling/call_variants/nextflow.config @@ -0,0 +1,49 @@ +manifest { + author = 'Robert A. 
Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' + mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + + conda { + process { + withName: call_variants { + conda = "${baseDir}/../../../conda/envs/call_variants-1.7.x"} + } + } + + docker { + process { + withName: call_variants { + container = "ghcr.io/bactopia/call_variants:1.6.0"} + + } + } + test { + process { + withName: call_variants { + cpus = 2 + memory = "10 GB" + queue = 'long' + } + + } + env { + REFERENCES = ["reference"] + VERSION = "1.6.0" + snippy_ram = 2 + outdir = "test_output" + sample = "TEST_SAMPLE" + final_sample_type = "paired-end" + single_end = "test" + run_type = "fastqs" + } + + } +} diff --git a/modules/variant_calling/call_variants/templates/call_variants.sh b/modules/variant_calling/call_variants/templates/call_variants.sh new file mode 100644 index 000000000..4c72123ad --- /dev/null +++ b/modules/variant_calling/call_variants/templates/call_variants.sh @@ -0,0 +1,76 @@ +#!/bin/bash +set -e +set -u +LOG_DIR="!{task.process}" +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions +echo "# Snippy Version" >> ${LOG_DIR}/!{task.process}.versions +snippy --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + +# Print captured STDERR incase of exit +function print_stderr { + cat .command.err 1>&2 + ls ${LOG_DIR}/ | grep ".err" | xargs -I {} cat ${LOG_DIR}/{} 1>&2 +} +trap print_stderr EXIT + +# Verify AWS files were staged +if [[ ! -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --extra !{reference} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} --extra !{reference} + fi +fi + +snippy !{fastq} \ + --ref !{reference} \ + --cpus !{task.cpus} \ + --ram !{snippy_ram} \ + --outdir !{reference_name} \ + --prefix !{sample} \ + --mapqual !{params.mapqual} \ + --basequal !{params.basequal} \ + --mincov !{params.mincov} \ + --minfrac !{params.minfrac} \ + --minqual !{params.minqual} \ + --maxsoft !{params.maxsoft} !{bwaopt} !{fbopt} > ${LOG_DIR}/snippy.out 2> ${LOG_DIR}/snippy.err + +# Add GenBank annotations +echo "# vcf-annotator Version" >> ${LOG_DIR}/!{task.process}.versions +vcf-annotator --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 +vcf-annotator !{reference_name}/!{sample}.vcf !{reference} > !{reference_name}/!{sample}.annotated.vcf 2> ${LOG_DIR}/vcf-annotator.err + +# Get per-base coverage +echo "# bedtools Version" >> ${LOG_DIR}/!{task.process}.versions +bedtools --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 +grep "^##contig" !{reference_name}/!{sample}.vcf > !{reference_name}/!{sample}.full-coverage.txt +genomeCoverageBed -ibam !{reference_name}/!{sample}.bam -d >> !{reference_name}/!{sample}.full-coverage.txt 2> ${LOG_DIR}/genomeCoverageBed.err +cleanup-coverage.py !{reference_name}/!{sample}.full-coverage.txt > !{reference_name}/!{sample}.coverage.txt +rm !{reference_name}/!{sample}.full-coverage.txt + +# Mask low coverage regions +mask-consensus.py !{sample} !{reference_name} \ + !{reference_name}/!{sample}.consensus.subs.fa \ + !{reference_name}/!{sample}.subs.vcf \ + !{reference_name}/!{sample}.coverage.txt \ + --mincov !{params.mincov} > !{reference_name}/!{sample}.consensus.subs.masked.fa 2> ${LOG_DIR}/mask-consensus.err + +# Clean Up +rm -rf !{reference_name}/reference 
!{reference_name}/ref.fa* !{reference_name}/!{sample}.vcf.gz* + +if [[ !{params.compress} == "true" ]]; then + find !{reference_name}/ -type f -not -name "*.bam*" -and -not -name "*.log*" -and -not -name "*.txt*" | \ + xargs -I {} pigz -n --best -p !{task.cpus} {} + pigz -n --best -p !{task.cpus} !{reference_name}/!{sample}.coverage.txt +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/variant_calling/call_variants/test_params.yaml b/modules/variant_calling/call_variants/test_params.yaml new file mode 100644 index 000000000..ad573aaf3 --- /dev/null +++ b/modules/variant_calling/call_variants/test_params.yaml @@ -0,0 +1,59 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +single_end: + false + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" + +reference: + "test_data/SRR2838702.gbk" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + false + +snippy_ram: + 2 + +mapqual: + 60 + +basequal: + 13 + +mincov: + 10 + +minfrac: + 0 + +minqual: + 100 + +maxsoft: + 10 + +bwaopt: + null + +fbopt: + null + +compress: + false + +skip_logs: + false diff --git a/modules/variant_calling/call_variants_auto/README.md b/modules/variant_calling/call_variants_auto/README.md new file mode 100644 index 000000000..6ae21983e --- /dev/null +++ b/modules/variant_calling/call_variants_auto/README.md @@ -0,0 +1,17 @@ +# call_variants_auto process testing: + +This process identifies variants (SNPs/InDels) against one or more reference genomes selected based on their Mash distance from the input. + +## About testing this process: + +Using DSL2, each module can be tested separately using a test workflow inside the process's `.nf` file. Testing requires 3 items: +- the local files in `test_data` +- params in `test_params.yaml` +- `test` profile in `nextflow.config` + +## How to test it: + +$ nextflow run call_variants_auto.nf -params-file test_params.yaml -profile test,docker -entry test + + +If you've used `bactopia conda activate`, you can also swap `docker` for `conda` to test with Conda. diff --git a/modules/variant_calling/call_variants_auto/bin/build-containers.sh b/modules/variant_calling/call_variants_auto/bin/build-containers.sh new file mode 100755 index 000000000..b5a900295 --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/build-containers.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# build-containers +# +# Automate the building of Bactopia related containers +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function singularity_build { + recipe=$1 + name=$2 + image=$3 + version=$4 + latest=${5:-0} + + echo "Working on ${recipe}" + singularity build -F ${image} ${recipe} + singularity sign ${image} + singularity push ${image} library://rpetit3/bactopia/${name}:${version} + + if [[ "${latest}" == "1" ]]; then + singularity push ${image} library://rpetit3/bactopia/${name}:latest + fi +} + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${recipe}" + docker build --rm -t ${image} -f ${recipe} .
+ docker push ${image} + + if [[ "${latest}" != "0" ]]; then + docker tag ${image} ${latest} + docker push ${latest} + fi +} + + +if [[ $# == 0 ]]; then + echo "" + echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR" + echo "" + echo "Example Command" + echo "build-containers.sh /home/bactopia/bactopia container-images/ " + echo "" + exit +fi + +BACTOPIA_DIR=$1 +OUTPUT_DIR=${2:-"./"} +if [ -z ${BACTOPIA_DIR} ]; then + echo "Got ${#} arguement" + echo "Must give the path to Bactopia repository" + exit 1 +fi +MAJOR_VERSION=${3:-"0"} + +mkdir -p ${OUTPUT_DIR} + +# Build Bactopia containers +#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1 +#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +if [ "${MAJOR_VERSION}" == "1" ]; then + # Build Singularity + for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do + recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Singularity//') + recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg" + singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION} + done + + # Build Docker + docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest + for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + #docker_build ${recipe_path} ${recipe_image} + done + + # Build Bactopia Tools containers + for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + #docker_build ${docker_file} ${docker_image} + + singularity_file="${recipe_path}/Singularity" + singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg" + singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION} + done +fi diff --git a/modules/variant_calling/call_variants_auto/bin/check-assembly-accession.py b/modules/variant_calling/call_variants_auto/bin/check-assembly-accession.py new file mode 100755 index 000000000..4201849ef --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/check-assembly-accession.py @@ -0,0 +1,79 @@ +#! 
/usr/bin/env python3 +""" +""" +PROGRAM = "check-assembly-accession" +VERSION = "1.6.0" + + +def check_assembly_version(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaCheckAssemblyAccession" + + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + if len(record["IdList"]): + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + else: + + return [f'No records found for {accession}', True] + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available' + ) + ) + + parser.add_argument( + 'reference', metavar="STR", type=str, + help='NCBI Assembly accession to be tested.' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + reference = args.reference.split('.')[0] + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}. Reason: {current_accession}', + file=sys.stderr + ) + else: + print(f'Using {current_accession} for {args.reference}', file=sys.stderr) + print(current_accession) diff --git a/modules/variant_calling/call_variants_auto/bin/check-fastqs.py b/modules/variant_calling/call_variants_auto/bin/check-fastqs.py new file mode 100755 index 000000000..a4188745b --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/check-fastqs.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. +""" +PROGRAM = "check-staging" +VERSION = "1.6.0" +import sys + + +def read_json(json_file): + import json + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + + +def write_error(filename, error_msg): + print(error_msg, file=sys.stderr) + with open(filename, "wt") as fh_out: + fh_out.write(error_msg) + return 1 + + +def check_reads(fq1, sample, min_reads, fq2=None): + error = 0 + total_reads = fq1 + fq2 if fq2 else fq1 + + if total_reads < min_reads: + error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not \n" + f"exceed the required minimum {min_reads} read count. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-read-count-error.txt', error_msg) + + if fq2: + if fq1 != fq2: + # different number of reads in the pair + error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). 
Please \n" + "investigate these FASTQs. Further analysis is discontinued.\n") + error += write_error(f'{sample}-different-read-count-error.txt', error_msg) + + return error + + +def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None): + error = 0 + total_bp= fq1 + fq2 if fq2 else fq1 + + if total_bp < min_basepairs: + error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not \n" + f"exceed the required minimum {min_basepairs} bp. Further analysis is \n" + "discontinued.\n") + error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg) + + if fq2: + proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1) + if proportion < min_proportion: + # More basepairs in one sample that exceeds minimum proportion + error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepairs ({min_proportion}). \n" + f"They shared {proportion:.4f} basepairs, with R1 having {fq1} bp and \n" + f"R2 having {fq2} bp. Further analysis is discontinued.\n") + error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg) + + return error + + +if __name__ == '__main__': + import argparse as ap + import os + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.' + ) + ) + + parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.') + parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.') + parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.') + parser.add_argument('--min_proportion', metavar="FLOAT", type=float, + help='The proportion of sequenced basepairs that R1 and R2 must be') + parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.') + parser.add_argument('--min_basepairs',metavar="INT", type=int, help='Minimum number of seqeunced basepairs') + parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + error = 0 + if args.fq1 and args.fq2: + # Paired end + r1 = read_json(args.fq1) + r2 = read_json(args.fq2) + error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads, + fq2=r2["qc_stats"]["read_total"]) + error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs, + fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion) + + else: + se = read_json(args.fq1) + error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads) + error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs) + + sys.exit(error) diff --git a/modules/variant_calling/call_variants_auto/bin/check-staging.py b/modules/variant_calling/call_variants_auto/bin/check-staging.py new file mode 100755 index 000000000..2396b944f --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/check-staging.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python3 +""" +Sometimes with AWS, files might fail to download but not cause an error. +This script checks to verify all expected inputs are staged. 
+"""
+PROGRAM = "check-staging"
+VERSION = "1.6.0"
+
+
+if __name__ == '__main__':
+    import argparse as ap
+    import os
+    import sys
+    parser = ap.ArgumentParser(
+        prog=PROGRAM,
+        conflict_handler='resolve',
+        description=(
+            f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.'
+        )
+    )
+
+    parser.add_argument('--fq1', metavar="STR", type=str, help='Either SE or R1 Fastq.')
+    parser.add_argument('--fq2', metavar="STR", type=str, help='R2 Fastq.')
+    parser.add_argument('--extra', metavar="STR", type=str, help='Extra files')
+    parser.add_argument('--genome_size', metavar="STR", type=str, help='Genome size text file')
+    parser.add_argument('--assembly', metavar="STR", type=str, help='Genome assembly.')
+    parser.add_argument('--is_single', action='store_true', help='Input FASTQ is single end')
+    parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}')
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(0)
+
+    args = parser.parse_args()
+
+    if not args.is_single and args.fq2 == "null":
+        # This is an issue, both files are not present
+        sys.exit(80)
+
+    if args.fq1:
+        if not os.path.exists(args.fq1):
+            sys.exit(81)
+
+    if args.fq2:
+        if not os.path.exists(args.fq2):
+            sys.exit(82)
+
+    if args.extra:
+        if args.extra != "empty.fna.gz":
+            if not os.path.exists(args.extra):
+                sys.exit(90)
+
+    if args.genome_size:
+        if not os.path.exists(args.genome_size):
+            sys.exit(91)
+
+    if args.assembly:
+        if not os.path.exists(args.assembly):
+            sys.exit(92)
diff --git a/modules/variant_calling/call_variants_auto/bin/cleanup-coverage.py b/modules/variant_calling/call_variants_auto/bin/cleanup-coverage.py
new file mode 100755
index 000000000..98b131cd8
--- /dev/null
+++ b/modules/variant_calling/call_variants_auto/bin/cleanup-coverage.py
@@ -0,0 +1,75 @@
+#! /usr/bin/env python3
+"""
+usage: cleanup-coverage [-h] [--version] COVERAGE
+
+cleanup-coverage - Reduce redundancy in per-base coverage.
+
+positional arguments:
+  COVERAGE    Output from genomeBedCoverage
+
+optional arguments:
+  -h, --help  show this help message and exit
+  --version   show program's version number and exit
+"""
+PROGRAM = "cleanup-coverage"
+VERSION = "1.6.0"
+import sys
+
+def read_coverage(coverage):
+    """Read the per-base coverage input."""
+    import re
+    accession = None
+    length = None
+    first_line = True
+    coverages = {}
+    with open(coverage, 'rt') as coverage_fh:
+        for line in coverage_fh:
+            line = line.rstrip()
+            if line.startswith('##'):
+                # ##contig=<ID=accession,length=length>
+                contig = re.search(r'contig=<ID=(.*),length=(.*)>', line)
+                if contig:
+                    accession = contig.group(1)
+                    length = contig.group(2)
+                    coverages[accession] = {'length': int(length), 'positions': []}
+                else:
+                    print(f'{line} is an unexpected format.', file=sys.stderr)
+                    sys.exit(1)
+            else:
+                accession, position, coverage = line.split('\t')
+                coverages[accession]['positions'].append(int(coverage))
+
+    for accession, vals in coverages.items():
+        if len(vals['positions']) != vals['length']:
+            print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr)
+            sys.exit(1)
+
+    return coverages
+
+if __name__ == '__main__':
+    import argparse as ap
+    import sys
+
+    parser = ap.ArgumentParser(
+        prog=PROGRAM,
+        conflict_handler='resolve',
+        description=(
+            f'{PROGRAM} (v{VERSION}) - Reduce redundancy in per-base coverage.'
+        )
+    )
+    parser.add_argument('coverage', metavar="COVERAGE", type=str,
+                        help='Per-base coverage output from genomeBedCoverage')
+    parser.add_argument('--version', action='version',
+                        version=f'{PROGRAM} {VERSION}')
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(0)
+
+    args = parser.parse_args()
+
+    coverages = read_coverage(args.coverage)
+    for accession, vals in coverages.items():
+        print(f'##contig=<ID={accession},length={vals["length"]}>')
+        for cov in vals['positions']:
+            print(cov)
diff --git a/modules/variant_calling/call_variants_auto/bin/create-tool.sh b/modules/variant_calling/call_variants_auto/bin/create-tool.sh
new file mode 100755
index 000000000..d629191ba
--- /dev/null
+++ b/modules/variant_calling/call_variants_auto/bin/create-tool.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# create-tool
+#
+# Create a blank tool.
+VERSION=1.6.0
+
+if [[ $# == 0 ]]; then
+    echo ""
+    echo "create-tool.sh BACTOPIA_DIR TOOL_NAME TOOL_DESCRIPTION"
+    echo ""
+    echo "Example Command"
+    echo "create-tool.sh /home/bactopia/bactopia roary 'Create a pan-genome with Roary and an optional core-genome phylogeny with IQTree.' "
+    echo ""
+    exit
+fi
+
+BACTOPIA_DIR=$1
+TOOL=$2
+DESCRIPTION=$3
+if [ -z "${BACTOPIA_DIR}" ] || [ -z "${TOOL}" ] || [ -z "${DESCRIPTION}" ]; then
+    echo "Got ${#} arguments"
+    echo "Must give a path to the Bactopia repository, a tool name and a tool description."
+    exit 1
+fi
+
+if [ ! -d "${BACTOPIA_DIR}/tools/${TOOL}" ]; then
+    cp -r ${BACTOPIA_DIR}/tools/.skeleton ${BACTOPIA_DIR}/tools/${TOOL}
+    sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Dockerfile
+    sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/Singularity
+    sed -i -r 's/TOOL_NAME/'"${TOOL}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config
+    sed -i -r 's/DESCRIPTION/'"${DESCRIPTION}"'/' ${BACTOPIA_DIR}/tools/${TOOL}/nextflow.config
+else
+    echo "${TOOL} exists already, please verify. Not going to replace, exiting..."
+    exit 1
+fi
diff --git a/modules/variant_calling/call_variants_auto/bin/gh-actions/free-disk-space.sh b/modules/variant_calling/call_variants_auto/bin/gh-actions/free-disk-space.sh
new file mode 100755
index 000000000..3ebc27d75
--- /dev/null
+++ b/modules/variant_calling/call_variants_auto/bin/gh-actions/free-disk-space.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Author: Robert Metzger
+# Github: https://github.com/rmetzger
+# Link: https://github.com/apache/flink/blob/master/tools/azure-pipelines/free_disk_space.sh
+
+#
+# The Azure provided machines typically have the following disk allocation:
+# Total space: 85GB
+# Allocated: 67 GB
+# Free: 17 GB
+# This script frees up 28 GB of disk space by deleting unneeded packages and
+# large directories.
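+# Note: the package removals below assume an apt/dpkg-based runner image;
+# adjust the package list for other images.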
+# The Flink end to end tests download and generate more than 17 GB of files, +# causing unpredictable behavior and build failures. +# +echo "==============================================================================" +echo "Freeing up disk space on CI system" +echo "==============================================================================" + +echo "Listing 100 largest packages" +dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 +df -h +echo "Removing large packages" +sudo apt-get remove -y '^ghc-8.*' +sudo apt-get remove -y '^dotnet-.*' +sudo apt-get remove -y '^llvm-.*' +sudo apt-get remove -y 'php.*' +sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel +sudo apt-get autoremove -y +sudo apt-get clean +df -h +echo "Removing large directories" +# deleting 15GB +rm -rf /usr/share/dotnet/ +df -h diff --git a/modules/variant_calling/call_variants_auto/bin/gh-actions/setup-bactopia-env.sh b/modules/variant_calling/call_variants_auto/bin/gh-actions/setup-bactopia-env.sh new file mode 100755 index 000000000..682bf0508 --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/gh-actions/setup-bactopia-env.sh @@ -0,0 +1,66 @@ +#! /bin/bash +# Setup Bactopia environment +# ./setup-bactopia-env.sh /path/to/conda/ /path/to/bactopia is_github_action +set -e +set -x +CONDA_PATH=${1:-"/opt/conda"} +WORK_DIR=${2:-"/bactopia"} +IS_GITHUB=${3:-"0"} +IS_GITLAB=${4:-"0"} +ENV=${5:-"bactopia"} +CONDA_CMD="create -n ${ENV}" +if [[ "${IS_GITHUB}" == "1" ]]; then + CONDA_CMD="install" +elif [[ "${IS_GITLAB}" != "0" ]]; then + CONDA_CMD="create --prefix ${IS_GITLAB}" +fi + +# Create environment +conda ${CONDA_CMD} --quiet -y -c conda-forge -c bioconda \ + ariba \ + beautifulsoup4 \ + biopython \ + "blast>=2.10.0" \ + "bowtie2<2.4.0" \ + cd-hit \ + conda \ + coreutils \ + executor \ + lxml \ + mash \ + ncbi-amrfinderplus \ + ncbi-genome-download \ + nextflow \ + "pysam>=0.15.3" \ + "python>3.6" \ + requests \ + sed \ + unzip \ + wget + +# Setup variables +BACTOPIA=${CONDA_PATH}/envs/${ENV} +chmod 755 ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* +cp ${WORK_DIR}/bactopia ${WORK_DIR}/bin/helpers/* ${BACTOPIA}/bin +VERSION=`${BACTOPIA}/bin/bactopia version | cut -d " " -f 2` +BACTOPIA_VERSION="${VERSION%.*}.x" +BACTOPIA_SHARE="${BACTOPIA}/share/bactopia-${BACTOPIA_VERSION}/" +mkdir -p ${BACTOPIA_SHARE} + +# Copy files +cp -R \ + ${WORK_DIR}/bin \ + ${WORK_DIR}/conda \ + ${WORK_DIR}/conf \ + ${WORK_DIR}/data \ + ${WORK_DIR}/templates \ + ${WORK_DIR}/tools \ + ${WORK_DIR}/main.nf \ + ${WORK_DIR}/nextflow.config \ + ${BACTOPIA_SHARE} + +# Clean up +if [[ "${IS_GITHUB}" == "0" && "${IS_GITLAB}" == "0" ]]; then + rm -rf /bactopia + conda clean -y -a +fi diff --git a/modules/variant_calling/call_variants_auto/bin/gh-actions/setup-docker-builds.py b/modules/variant_calling/call_variants_auto/bin/gh-actions/setup-docker-builds.py new file mode 100755 index 000000000..a10c60944 --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/gh-actions/setup-docker-builds.py @@ -0,0 +1,249 @@ +#! 
/usr/bin/env python3 +""" +usage: bactopia build [-h] [--github] [--quay] [--max_retry INT] [--force] + [--verbose] [--silent] [--version] + STR STR + +setup-docker-builds.py - Build Docker containers for use by Bactopia + +positional arguments: + STR Directory containing Bactopia repository + STR JSON file with latest releases + +optional arguments: + -h, --help show this help message and exit + --github Push to GitHub container registry. + --quay Push to Quay.io container registry. + --max_retry INT Maximum times to attemp creating Conda environment. + (Default: 5) + --force Force rebuild of Docker containers. + --verbose Print debug related text. + --silent Only critical errors will be printed. + --version show program's version number and exit +""" +import glob +import json +import logging +import os +import sys + +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "setup-docker-builds.py" +VERSION = "1.6.0" +REPO = "bactopia" +MAX_RETRY = 5 +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + logging.log(STDERR, e) + return None + + +def get_previous_version(json_file): + """Get the previous version of Bactopia.""" + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + + for node in json_data['repository']['releases']['nodes']: + this_version = node['name'].lstrip('v') + if this_version != VERSION: + return this_version + + +def check_md5sum(current_md5, image): + """Compare the two md5 files to see if a rebuild is needed.""" + current = None + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + previous = None + data = json.loads(execute(f'skopeo inspect docker://docker.io/{image}', capture=True)) + if data: + if 'conda.md5' in data['Labels']: + previous = data['Labels']['conda.md5'] + logging.info(f'Found {previous} from {image}') + + logging.info(f'Testing {current} == {previous}') + return previous == current + + +def docker_push(image): + """Push Docker image, with multiple attempts incase of failure.""" + import time + retry = 0 + allow_fail = False + success = False + logging.info(f'Push on {image}') + while not success: + result = execute(f'docker push {image}') + if not result: + if retry > MAX_RETRY: + allow_fail = True + retry += 1 + logging.log(STDERR, "Retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return True + + +def docker_retag(previous, current, github=False, quay=False): + """Pull previous version's container, apply current versions to tag.""" + execute(f'docker 
pull {previous}') + execute(f'docker tag {previous} {current}') + docker_push(current) + + if github: + execute(f'docker tag {previous} ghcr.io/{current}') + docker_push(f'ghcr.io/{current}') + if quay: + execute(f'docker tag {previous} quay.io/{current}') + docker_push(f'quay.io/{current}') + + +def docker_tag(image, tag): + """Tag and push Docker container.""" + logging.info(f'Tagging {tag} to {image}') + execute(f'docker tag {image} {tag}') + docker_push(f'{tag}') + + +def docker_build(recipe, image, latest=None, github=False, quay=False): + """Build and push latest Docker container.""" + logging.info(f'Building on {image}') + execute(f'docker build --rm -t {image} -f {recipe} .') + docker_push(f'{image}') + + if latest: + docker_tag(image, latest) + + if github: + docker_tag(image, f'ghcr.io/{image}') + if latest: + docker_tag(image, f'ghcr.io/{latest}') + + if quay: + docker_tag(image, f'quay.io/{image}') + if latest: + docker_tag(image, f'quay.io/{latest}') + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Docker containers for use by Bactopia' + ) + ) + + parser.add_argument('bactopia', metavar="STR", type=str, + help='Directory containing Bactopia repository') + parser.add_argument('releases', metavar="STR", type=str, + help='JSON file with latest releases') + parser.add_argument('--github', action='store_true', + help='Push to GitHub container registry.') + parser.add_argument('--quay', action='store_true', + help='Push to Quay.io container registry.') + parser.add_argument('--force', action='store_true', + help='Force rebuild of Docker containers.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + major, minor, patch = VERSION.split('.') + previous_version = get_previous_version(args.releases) + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + bactopia_path = args.bactopia.rstrip("/") + + # Bactopia Dockerfile + logging.info(f'Working on Bactopia Dockerfile') + docker_build(f'{bactopia_path}/Dockerfile', f'{REPO}/bactopia:{VERSION}', latest=f'{REPO}/bactopia:latest', + github=args.github, quay=args.quay) + + # Bactopia Process Dockerfiles + process_files = sorted(glob.glob(f'{bactopia_path}/containers/*.Dockerfile')) + for i, dockerfile in enumerate(process_files): + logging.info(f'Working on {dockerfile} ({i+1} of {len(process_files)})') + process_name = os.path.splitext(os.path.basename(dockerfile))[0] + latest_image = f'{REPO}/{process_name}:{VERSION}' + previous_image = f'{REPO}/{process_name}:{previous_version}' + if check_md5sum(f"{bactopia_path}/conda/linux/{process_name}.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, 
latest_image, github=args.github, quay=args.quay) + + # Bactopia Tools Dockerfiles + tools = sorted(glob.glob(f'{bactopia_path}/tools/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + tool_path = f"{bactopia_path}/tools/{tool}" + dockerfile = f'{tool_path}/Dockerfile' + latest_image = f'{REPO}/tools-{tool}:{VERSION}' + previous_image = f'{REPO}/tools-{tool}:{previous_version}' + logging.info(f'Working on {dockerfile} ({i+1} of {len(tools)})') + if check_md5sum(f"{tool_path}/environment-linux.md5", previous_image) and not args.force: + # MD5s match, just need to retag + logging.info(f'Conda environment did not change, adding tag to previous version') + docker_retag(previous_image, latest_image, github=args.github, quay=args.quay) + else: + # Need to rebuild + logging.info(f'Conda environment changed, will need to rebuild container') + docker_build(dockerfile, latest_image, github=args.github, quay=args.quay) diff --git a/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-build.py b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-build.py new file mode 100755 index 000000000..b2ca50eda --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-build.py @@ -0,0 +1,239 @@ +#! /usr/bin/env python3 +""" +usage: bactopia build [-h] [-e STR] [--force] [--verbose] [--silent] + [--version] + STR STR + +bactopia build - Build Conda environments for use by Bactopia + +positional arguments: + STR Directory containing Conda environment files to build. + STR Directory to install Conda environments to. + +optional arguments: + -h, --help show this help message and exit + -e STR, --ext STR Extension of the Conda environment files. Default: .yml + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia build" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def check_needs_build(observed_md5, expected_md5, prefix, force=False, is_bactopia=False): + """Check if a new environment needs to be built.""" + needs_build = False + if os.path.exists(observed_md5) and not force: + if check_md5sum(expected_md5, observed_md5): + if not is_bactopia: + logging.info(f'Existing env ({prefix}) found, skipping unless --force is used') + else: + needs_build = True + logging.info(f'Existing env ({prefix}) is out of sync, it will be updated') + else: + needs_build = True + return needs_build + + +def build_conda_env(env_file, prefix, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'conda env create -f {env_file} --prefix {prefix} {force}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating Conda environment, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + print(e, file=sys.stderr) + sys.exit(e.returncode) + else: + return None + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + + parser = ap.ArgumentParser( + prog='bactopia build', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Conda environments for use by Bactopia' + ) + ) + + parser.add_argument('conda_envs', metavar="STR", type=str, + help='Directory containing Conda environment files to build.') + + parser.add_argument('install_path', metavar="STR", type=str, + help='Directory to install Conda environments to.') + parser.add_argument( + '-e', '--ext', metavar='STR', type=str, + 
default="yml", + help='Extension of the Conda environment files. Default: .yml' + ) + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Conda environment with the given name') + parser.add_argument('--default', action='store_true', + help='Builds Conda environments to the default Bactopia location.') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. (Default: 5)') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--include_tools', action='store_true', + help='Builds Conda environments for Bactopia tools as well.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + env_path = f'{os.path.abspath(os.path.expanduser(args.conda_envs))}/{ostype}' + install_path = os.path.abspath(os.path.expanduser(args.install_path)) + finish_file = f'{install_path}/envs-built-{CONTAINER_VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Conda environments in {install_path}, if a complete rebuild is needed please use --force') + + env_files = sorted(glob.glob(f'{env_path}/*.{args.ext}')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.splitext(os.path.basename(env_file))[0] + md5_file = env_file.replace('.yml', '.md5') + prefix = f'{install_path}/{envname}-{CONTAINER_VERSION}' + envbuilt_file = f'{install_path}/{envname}-{CONTAINER_VERSION}/env-built.txt' + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(env_files)}), begin build to {prefix}') + + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') + execute(f'touch {install_path}/envs-built-{CONTAINER_VERSION}.txt') + else: + logging.error(f'Unable to find Conda *.{args.ext} files in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + env_file = f'{tool_path}/{tool}/environment-{ostype}.yml' + if os.path.exists(env_file): + md5_file = f'{tool_path}/{tool}/environment-{ostype}.md5' + prefix = f'{install_path}/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{prefix}/env-built.txt' + force = '--force' if args.force else '' + 
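+                # NOTE: this '--force' string is unused in this branch; build_conda_env() receives the boolean args.force below and builds its own flag.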
build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(envbuilt_file, md5_file, prefix, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {env_file} ({i+1} of {len(tools)}), begin build to {prefix}') + built = build_conda_env(env_file, prefix, max_retry=args.max_retry, force=args.force, is_bactopia=args.is_bactopia) + if built: + execute(f'cp {md5_file} {envbuilt_file}') diff --git a/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-citations.py b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-citations.py new file mode 100755 index 000000000..56531a9e7 --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-citations.py @@ -0,0 +1,69 @@ +#! /usr/bin/env python3 +""" +usage: bactopia citations [-h] [--bactopia STR] [--version] STR + +bactopia citations - Prints the citations of datasets and tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. + --version show program's version number and exit +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia citations" +DESCRIPTION = 'Prints the citations of datasets and tools used by Bactopia' + +def validate_args(bactopia_repo): + bactopia_citations = f'{bactopia_repo}/data/bactopia-datasets-software.txt' + if not os.path.exists(bactopia_citations): + print(f"cannot access '{bactopia_citations}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + citations = {} + with open(bactopia_citations, 'rt') as citation_fh: + for line in citation_fh: + line.rstrip() + if not line.startswith('name'): + name, ref_type, citation = line.split('\t') + if ref_type not in citations: + citations[ref_type] = [] + citations[ref_type].append({'name':name, 'citation': citation}) + return citations + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-datasets.py b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-datasets.py new file mode 100755 index 000000000..dc52ef623 --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-datasets.py @@ -0,0 +1,1293 @@ +#! 
/usr/bin/env python3 +""" +usage: bactopia datasets [-h] [--outdir STR] [--skip_ariba] [--ariba STR] + [--species STR] [--skip_mlst] [--skip_prokka] + [--include_genus] + [--asssembly_level {all,complete,chromosome,scaffold,contig}] + [--limit INT] [--accessions STR] [--identity FLOAT] + [--overlap FLOAT] [--max_memory INT] [--fast_cluster] + [--skip_minmer] [--skip_plsdb] [--prodigal_tf STR] + [--reference STR] [--mapping STR] [--genes STR] + [--proteins STR] [--primers STR] [--force_optional] + [--cpus INT] [--clear_cache] [--force] + [--force_ariba] [--force_mlst] [--force_prokka] + [--force_minmer] [--force_plsdb] [--keep_files] + [--available_datasets] [--depends] [--version] + [--verbose] [--silent] + PUBMLST + +bactopia datasets - Setup public datasets for Bactopia + +positional arguments: + PUBMLST Bactopia config file with PubMLST schema mappings for + Ariba. + +optional arguments: + -h, --help show this help message and exit + --outdir STR Directory to write output. (Default ./datasets) + +Ariba Reference Datasets: + --skip_ariba Skip setup of Ariba datasets + --ariba STR Comma separated list of Ariba datasets to download and + setup. Available datasets include: argannot, card, + ncbi, megares, plasmidfinder, resfinder, + srst2_argannot, vfdb_core, vfdb_full, virulencefinder + (Default: "vfdb_core,card") Use --available_datasets + to see the full list. + +Bacterial Species: + --species STR Download available MLST schemas and completed genomes + for a given species or a list of species in a text + file. + --skip_mlst Skip setup of MLST schemas for each species + +Custom Prokka Protein FASTA: + --skip_prokka Skip creation of a Prokka formatted fasta for each + species + --include_genus Include all genus members in the Prokka proteins FASTA + --assembly_level {all,complete,chromosome,scaffold,contig} + Assembly levels of genomes to download (Default: + complete). + --limit INT If available completed genomes exceeds a given limit, + a random subsample will be taken. (Default 1000) + --accessions STR A list of RefSeq accessions to download. + --identity FLOAT CD-HIT (-c) sequence identity threshold. (Default: + 0.9) + --overlap FLOAT CD-HIT (-s) length difference cutoff. (Default: 0.8) + --max_memory INT CD-HIT (-M) memory limit (in MB). (Default: unlimited + --fast_cluster Use CD-HIT's (-g 0) fast clustering algorithm, instead + of the accurate but slow algorithm. + +Minmer Datasets: + --skip_minmer Skip download of pre-computed minmer datasets (mash, + sourmash) + +PLSDB (Plasmid) BLAST/Sketch: + --skip_plsdb Skip download of pre-computed PLSDB datbases (blast, + mash) + +Optional User Provided Datasets: + --prodigal_tf STR A pre-built Prodigal training file to add to the + species annotation folder. Requires a single species + (--species) and will replace existing training files. + --reference STR A reference genome (FASTA/GenBank (preferred)) file or + directory to be added to the optional folder for + variant calling. Requires a single species + (--species). + --mapping STR A reference sequence (FASTA) file or directory to be + added to the optional folder for mapping. Requires a + single species (--species). + --genes STR A gene sequence (FASTA) file or directory to be added + to the optional folder for BLAST. Requires a single + species (--species). + --proteins STR A protein sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). 
+ --primers STR A primer sequence (FASTA) file or directory to be + added to the optional folder for BLAST. Requires a + single species (--species). + --force_optional Overwrite any existing files in the optional folders + +Custom Options: + --cpus INT Number of cpus to use. (Default: 1) + --clear_cache Remove any existing cache. + --force Forcibly overwrite existing datasets. + --force_ariba Forcibly overwrite existing Ariba datasets. + --force_mlst Forcibly overwrite existing MLST datasets. + --force_prokka Forcibly overwrite existing Prokka datasets. + --force_minmer Forcibly overwrite existing minmer datasets. + --force_plsdb Forcibly overwrite existing PLSDB datasets. + --keep_files Keep all downloaded and intermediate files. + --available_datasets List Ariba reference datasets and MLST schemas + available for setup. + --depends Verify dependencies are installed. + +Adjust Verbosity: + --version show program's version number and exit + --verbose Print debug related text. + --silent Only critical errors will be printed. + +example usage: + bactopia datasets + bactopia datasets --ariba 'vfdb_core' + bactopia datasets --species 'Staphylococcus aureus' --include_genus +""" +import glob +import json +import logging +import os +import sys + +from Bio import SeqIO +from executor import ExternalCommand, ExternalCommandFailed + +PROGRAM = "bactopia datasets" +VERSION = "1.6.0" +STDOUT = 11 +STDERR = 12 +CACHE_DIR = f'{os.path.expanduser("~")}/.bactopia' +CACHE_JSON = f'{CACHE_DIR}/datasets.json' +EXPIRATION = 15 # Refresh db info if cache is older than 15 days +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def check_cache(clear_cache=False): + """Check if a local cache exists to avoid re-downloads.""" + import time + + logging.debug(f'Checking for existing cache') + if not os.path.exists(CACHE_DIR): + logging.debug(f'Creating cache directory ({CACHE_DIR})') + execute(f'mkdir -p {CACHE_DIR}') + + cache_data = {} + if os.path.exists(CACHE_JSON): + logging.debug(f'Found existing dataset cache ({CACHE_JSON})') + days_old = (time.time() - os.path.getctime(CACHE_JSON)) // (24 * 3600) + if days_old >= EXPIRATION or clear_cache: + logging.debug((f'Deleting {CACHE_JSON}, Reason: older than ' + f'{EXPIRATION} days or "--clear_cache" used')) + execute(f'rm {CACHE_JSON}') + else: + with open(CACHE_JSON, 'r') as cache_fh: + cache_data = json.load(cache_fh) + + return cache_data + + +def get_available_datasets(pubmlst_file, clear_cache): + """Get a list of available datasets to be set up.""" + data = check_cache(clear_cache=clear_cache) + expected = ['ariba', 'pubmlst'] + if sum([k in data for k in expected]) != len(expected): + logging.debug((f'Existing dataset cache ({CACHE_JSON}) is missing ' + 'expected fields, refreshing.')) + data = { + 'ariba': sorted(ariba_datasets()), + 'pubmlst': pubmlst_schemas(pubmlst_file) + } + + with open(CACHE_JSON, 'w') as cache_fh: + logging.debug(f'Created dataset cache ({CACHE_JSON})') + json.dump(data, cache_fh, indent=4, sort_keys=True) + + return [data['ariba'], data['pubmlst']] + + +def validate_requirements(): + """Validate the required programs are available, if not exit (1).""" + from shutil import which + programs = { + 'ariba': which('ariba'), 'makeblastdb': which('makeblastdb'), + 'cd-hit': which('cd-hit'), 'wget': which('wget'), + 'unzip': which('unzip'), 'gzip': which('gzip') + # 'mentalist': which('mentalist') + } + + missing = False + for prog, path in programs.items(): + if path: + logging.debug(f'{prog}: command 
found.') + else: + logging.error(f'{prog}: command not found.') + missing = True + + if missing: + logging.error("Requirement missing, exiting") + sys.exit(1) + + +def validate_species(species): + """Query input species against ENA to determine if it exists.""" + import requests + ENDPOINT = 'https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name' + checks = [] + + if os.path.exists(species): + with open(species, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + checks.append(line) + elif "," in species: + checks = species.split(',') + else: + checks.append(species) + + species_key = {} + for species in checks: + species = species.strip() + r = requests.get(f'{ENDPOINT}/{species}?limit=1') + if r.status_code == requests.codes.ok: + try: + json_data = r.json() + if json_data[0]['scientificName'].lower() != species.lower(): + # Error! Species/Organism found, but doesn't match input. This shouldn't + # (query is case-insensitive exact match) happen, but my grandma could " + # probably trigger it, so here it is! + logging.error((f'Input species ({species}) does not match return result ' + f'({json_data[0]["scientificName"]}), please check spelling.')) + sys.exit(1) + + species_key[species.lower()] = json_data[0]['scientificName'] + logging.info(f'{species} verified in ENA Taxonomy database') + except json.decoder.JSONDecodeError: + if r.text == "No results.": + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + else: + # Error! Species/Organism not found. Check spelling? + # TODO: Implement"Did you mean?" function + logging.error(f'Input species ({species}) not found, please check spelling.') + sys.exit(1) + + return species_key + + +def ariba_datasets(): + """Print a list of datasets available with 'ariba getref'.""" + getref_usage = ' '.join([ + line.strip() for line in + execute('ariba getref --help', capture=True).strip().split('\n') + ]) + datasets = getref_usage.split('of: ')[1].split(' outprefix')[0] + return datasets.split() + + +def pubmlst_schemas(pubmlst_file): + """Read the PubMLST mappings and return a dict.""" + pubmlst = {} + with open(pubmlst_file, 'rt') as pubmlst_fh: + for line in pubmlst_fh: + line = line.rstrip() + if line and not line.startswith('ariba'): + ariba, species, schema = line.split('\t') + if species not in pubmlst: + pubmlst[species] = {} + pubmlst[species][schema] = ariba + return pubmlst + + +def available_datasets(ariba, pubmlst, missing=False): + """Print available Ariba references, MLST schemas, and exit.""" + print_to = sys.stderr if missing else sys.stdout + print("Ariba reference datasets available:", file=print_to) + print("\n".join(sorted(ariba)), file=print_to) + + print("\nMLST schemas available from pubMLST.org:", file=print_to) + for k,v in sorted(pubmlst.items()): + if len(v) > 1: + print(f'{k} ({len(v)} shemas)', file=print_to) + else: + print(f'{k}', file=print_to) + sys.exit(1 if missing else 0) + + +def setup_requests(request, available_datasets, title, skip_check=False): + """Return a list of setup requests.""" + datasets = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + dataset = line.rstrip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + logging.error(f'{dataset} is not available from {title}') + elif "," in request: + for dataset in request.split(','): + dataset = dataset.strip() + if dataset in available_datasets or skip_check: + datasets.append(dataset) + else: + 
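+                    # An unknown dataset is only logged; setup continues with the remaining requests.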
logging.error(f'{dataset} is not available from {title}') + elif request in available_datasets or skip_check: + datasets.append(request) + else: + logging.error(f'{request} is not available from {title}') + + return datasets + + +def setup_ariba(request, available_datasets, outdir, force=False, + keep_files=False): + """Setup each of the requested datasets using Ariba.""" + requests = setup_requests(request, available_datasets, 'ariba') + if requests: + ariba_dir = f'{outdir}/ariba' + for request in requests: + prefix = f'{ariba_dir}/{request}' + if os.path.exists(f'{prefix}-updated.txt'): + if force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + else: + logging.info(f'{request} ({prefix}) exists, skipping') + continue + elif force: + logging.info(f'--force, removing existing {request} setup') + execute(f'rm -rf {prefix}*') + + # Setup Ariba dataset + logging.info(f'Setting up {request} Ariba dataset') + fa = f'{prefix}.fa' + tsv = f'{prefix}.tsv' + execute(f'mkdir -p {ariba_dir}') + with open(f'{prefix}-log.txt', 'w') as ariba_log: + execute( + f'ariba getref {request} {request}', + stdout_file=ariba_log, stderr_file=ariba_log, + directory=ariba_dir + ) + execute(f'ariba prepareref -f {fa} -m {tsv} {prefix}') + + # Clean up + if not keep_files: + execute(f'rm {fa} {tsv}') + execute(f'mv {request}*.* {request}/', directory=ariba_dir) + execute(f'tar -zcvf {request}.tar.gz {request}/', + directory=ariba_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {request}-updated.txt', + directory=ariba_dir) + execute(f'rm -rf {request}', directory=ariba_dir) + else: + logging.info("No valid Ariba datasets to setup, skipping") + + +def setup_mlst_request(request, available_schemas, species_key=None): + """Return a list of mlst schemas to build.""" + requests = [] + if os.path.exists(request): + with open(request, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + requests.append(line) + elif "," in request: + for dataset in request.split(','): + requests.append(dataset.capitalize().strip()) + else: + requests.append(request.capitalize()) + + schemas = [] + for species in requests: + species = species_key[species.lower()] + genus = species.split()[0] + if species in available_schemas: + for schema, ariba_name in available_schemas[species].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + elif genus in available_schemas: + # MLST schema is for a genus not just species + for schema, ariba_name in available_schemas[genus].items(): + schemas.append({'ariba': ariba_name, 'schema': schema, 'species': species}) + else: + logging.error(f'{species} is not available from pubMLST.org, skipping') + + return schemas + +def setup_mlst(request, available_datasets, outdir, force=False, species_key=None): + """Setup MLST datasets for each requested schema.""" + import re + requests = setup_mlst_request(request, available_datasets, species_key=species_key) + if requests: + for request in requests: + schema = request['schema'] + species = request['species'] + + species = re.sub(r'[ /()]', "-", species.lower()) + species = species.replace('--', '-').strip('-') + mlst_dir = f'{outdir}/{species}/mlst/{schema}' + if os.path.exists(f'{mlst_dir}/mlst-updated.txt'): + if force: + logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + else: + logging.info((f'{request["species"]} MLST Schema ({mlst_dir}) exists' + ', skipping')) + continue + elif force: + 
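+                # No mlst-updated.txt marker was found; with force set, clear any partial setup before rebuilding.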
logging.info(f'--force, removing existing {request["species"]} setup') + execute(f'rm -rf {mlst_dir}') + + # Setup MLST dataset + logging.info(f'Setting up {schema} MLST schema for {request["species"]}') + execute(f'mkdir -p {mlst_dir}') + + # Ariba + species_request = request['ariba'] + logging.info(f'Creating Ariba MLST dataset') + ariba_dir = f'{mlst_dir}/ariba' + execute(f'ariba pubmlstget "{species_request}" {ariba_dir}') + + # BLAST + logging.info(f'Creating BLAST MLST dataset') + blast_dir = f'{mlst_dir}/blastdb' + for fasta in glob.glob(f'{ariba_dir}/pubmlst_download/*.tfa'): + output = os.path.splitext(fasta)[0] + execute(f'makeblastdb -in {fasta} -dbtype nucl -out {output}') + execute(f'mv {ariba_dir}/pubmlst_download {blast_dir}') + + # Tarball directories + execute(f'tar -zcvf {schema}-ariba.tar.gz ariba/', directory=mlst_dir) + execute(f'rm -rf {ariba_dir}') + execute(f'tar -zcvf {schema}-blastdb.tar.gz blastdb/', directory=mlst_dir) + execute(f'rm -rf {blast_dir}') + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > mlst-updated.txt', + directory=mlst_dir) + else: + logging.info("No valid MLST schemas to setup, skipping") + + +def process_cds(cds): + """Look over the CDS attributes and return passing CDS.""" + header = None + seq = None + qualifiers = cds.keys() + ec_number = '' + gene = '' + product = '' + is_pseudo = ('pseudo' in qualifiers or 'pseudogene' in qualifiers) + is_hypothetical = (product.lower() == "hypothetical protein") + if not is_pseudo and not is_hypothetical: + if 'ec_number' in qualifiers: + ec_number = cds['ec_number'][0] + if 'gene' in qualifiers: + gene = cds['gene'][0] + if 'product' in qualifiers: + product = cds['product'][0] + if 'protein_id' in qualifiers: + protein_id = cds['protein_id'][0] + elif 'locus_tag' in qualifiers: + protein_id = cds['locus_tag'][0] + + header = f'>{protein_id} {ec_number}~~~{gene}~~~{product}' + seq = cds['translation'][0] + + + return [header, seq] + + +def setup_prokka(request, available_datasets, outdir, force=False, + include_genus=False, limit=None, user_accessions=None, identity=0.9, + overlap=0.8, max_memory=0, fast_cluster=False, keep_files=False, + cpus=1, species_key=None, assembly_level='complete'): + """ + Setup a Prokka compatible protein fasta file based on completed genomes. + + Implemented similar approach as Thanh Lê's "make_prokka_db". Check out + his version for a standalone implementation! 
+ Github Repo: https://github.com/thanhleviet/make_prokka_db + """ + import gzip + import re + import random + from statistics import median, mean + requests = None + if os.path.exists(request): + requests = setup_requests(request, available_datasets, 'Prokka Proteins', + skip_check=True) + else: + requests = setup_requests(request.capitalize(), available_datasets, 'Prokka Proteins', + skip_check=True) + if requests: + for request in requests: + species = re.sub(r'[ /()]', "-", request.lower()) + species = species.replace('--', '-').strip('-') + prokka_dir = f'{outdir}/{species}/annotation' + minmer_dir = f'{outdir}/{species}/minmer' + clean_up = False + genome_sizes = [] + skip_genome_size = False + + if os.path.exists(f'{prokka_dir}/proteins.faa'): + if force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + else: + logging.info((f'{prokka_dir} exists, skipping')) + continue + elif os.path.exists(f'{prokka_dir}/'): + logging.info(f'Incomplete setup, deleting {prokka_dir} to start over') + clean_up = True + elif force: + logging.info(f'--force, delete existing {prokka_dir}') + clean_up = True + + if clean_up: + execute(f'rm -rf {prokka_dir}') + execute(f'rm -rf {minmer_dir}') + + # Setup Prokka proteins file + logging.info(f'Setting up custom Prokka proteins for {request}') + execute(f'mkdir -p {prokka_dir}') + execute(f'mkdir -p {minmer_dir}') + + # Download completed genomes + logging.info(f'Downloading genomes (assembly level: {assembly_level})') + genome_dir = f'{prokka_dir}/genomes' + genus = species_key[request.lower()] + execute(f'mkdir {genome_dir}') + species_accession = [] + all_accessions = {} + accessions = [] + accession_file = f'{genome_dir}/accessions.txt' + if user_accessions: + execute(f'cp {user_accessions} {accession_file}') + if include_genus: + logging.info(f'Ignoring `--include_genus` since a file of accessions was given.') + if limit: + logging.info(f'Ignoring `--limit {limit}` since a file of accessions was given.') + else: + if include_genus: + genus = genus.split()[0] + + results = execute((f'ncbi-genome-download bacteria -g "{genus}" ' + f'-l {assembly_level} -F genbank -r 80 --dry-run'), capture=True, error_ok=True) + + if results: + for line in results.split('\n'): + if line and not line.startswith('Considering'): + accession, name = line.split('\t', 1) + all_accessions[accession] = name + if name.startswith(species_key[request.lower()]): + species_accession.append(accession) + accessions.append(accession) + + if limit: + if len(accessions) > limit: + logging.info(f'Downloading {limit} genomes from a random subset of {len(accessions)} genomes.') + accessions = random.sample(accessions, limit) + contains_species = False + for accession in accessions: + if all_accessions[accession].startswith(species_key[request.lower()]): + contains_species = True + + if not contains_species: + if len(species_accession): + logging.info(f'Random subset, does not include {species_key[request.lower()]} genomes, adding 1 to random subset.') + accessions.append(random.sample(species_accession, 1)[0]) + else: + logging.info(f'There are less available genomes than the given limit ({limit}), downloading all.') + + if not len(species_accession): + logging.info(f'A completed genome does not exist for {species_key[request.lower()]}, skipping genome size statistics..') + skip_genome_size = True + + with open(accession_file, 'w') as accession_fh: + for accession in accessions: + accession_fh.write(f'{accession}\n') + else: + logging.error(f'No completed genomes 
found for "{genus}", skipping custom Prokka proteins') + continue + + execute((f'ncbi-genome-download bacteria -A {accession_file} ' + f'-l complete -o {prokka_dir}/genomes -F genbank -r 80 ' + f'-m {prokka_dir}/ncbi-metadata.txt')) + + # Extract information from Genbank files + genbank_files = execute( + 'find . -name "*.gbff.gz"', directory=prokka_dir, capture=True + ).split('\n') + count = 0 + passing_cds = f'{prokka_dir}/passing-cds.faa' + minmer = f'{minmer_dir}/minmer.ffn' + logging.info(f'Processing {len(genbank_files)-1} Genbank files') + with open(passing_cds, 'w') as cds_fh, open(minmer, 'w') as ffn_fh: + for genbank in genbank_files: + if genbank: + sizes = [] + genbank = genbank.replace('./', f'{prokka_dir}/') + seq_name = None + seqs = [] + gap = "N" * 102 + with gzip.open(genbank, 'rt') as genbank_fh: + for record in SeqIO.parse(genbank_fh, 'genbank'): + # Aggregate chromosome and plasmids + sizes.append(len(record.seq)) + for dbxref in record.dbxrefs: + if dbxref.startswith('Assembly'): + seq_name = dbxref.split(':')[1] + seqs.append(str(record.seq)) + seqs.append(gap) + + for feature in record.features: + if feature.type == 'CDS': + header, seq = process_cds( + feature.qualifiers + ) + + if header and seq: + count += 1 + cds_fh.write(f'{header}\n') + cds_fh.write(f'{seq}\n') + # Write sequence + ffn_fh.write(f'>{seq_name}\n') + gap = "N" * 102 + sequence = "".join(seqs) + ffn_fh.write(f'{sequence}\n') + + # Only add genome sizes for the species, incase the + # option '--inlude_genus' was used. + if not skip_genome_size: + if record.annotations["organism"].lower().startswith(request.lower()): + logging.debug( + f'Added {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + 'calculation.' + ) + genome_sizes.append(sum(sizes)) + else: + logging.debug( + f'Skip adding {record.annotations["organism"]} ' + f'({sum(sizes)}) to median genome size ' + f'calculation (not {request}).' + ) + + total_genome = len(genome_sizes) + if not skip_genome_size: + median_genome = int(median(genome_sizes)) + logging.info( + f'Median genome size: {median_genome} (n={total_genome})' + ) + cdhit_cds = f'{prokka_dir}/proteins.faa' + logging.info(f'Running CD-HIT on {count} proteins') + g = 0 if fast_cluster else 1 + execute((f'cd-hit -i {passing_cds} -o {cdhit_cds} -s {overlap} ' + f'-g {g} -c {identity} -T {cpus} -M {max_memory}')) + + # Make sketch/signatures + execute( + f'mash sketch -i -k 31 -s 10000 -o refseq-genomes minmer.ffn', + directory=minmer_dir + ) + + # Finish up + with open(f'{prokka_dir}/genome_size.json', 'w') as genome_size_fh: + gs_dict = { + 'min': 0, 'median': 0, 'mean':0, 'max': 0, 'total': 0, + 'description': 'No available completed genomes.' + } + if not skip_genome_size: + gs_dict = { + 'min': min(genome_sizes), + 'median': int(median(genome_sizes)), + 'mean': int(median(genome_sizes)), + 'max': max(genome_sizes), + 'total': total_genome, + 'description': ( + f'Genome size values are based on {total_genome} ' + 'completed genomes (RefSeq).' 
+ ) + } + json.dump(gs_dict, genome_size_fh, indent=4) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > proteins-updated.txt', + directory=prokka_dir) + execute(f'grep -H -c "^>" *.faa > cdhit-stats.txt', + directory=prokka_dir) + execute(f'sed -i "s=passing-cds.faa:=original\t=" cdhit-stats.txt', + directory=prokka_dir) + execute( + f'sed -i "s=proteins.faa:=after_cd-hit\t=" cdhit-stats.txt', + directory=prokka_dir + ) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + # Clean up + if not keep_files: + execute(f'rm -rf {minmer} {passing_cds} {genome_dir}/') + + else: + logging.info("No valid species to setup, skipping") + + +def setup_amr(outdir, force=False): + """Download the latest antimicrobial resistance datasets.""" + datasets = ['amrfinder'] + amr_dir = f'{outdir}/antimicrobial-resistance' + update_timestamp = False + execute(f'mkdir -p {amr_dir}') + + for dataset in datasets: + dataset_file = f'{amr_dir}/{dataset}.tar.gz' + if os.path.exists(dataset_file): + if force: + logging.info(f'--force, removing existing {dataset_file} setup') + execute(f'rm -f {dataset_file}') + update_timestamp = True + else: + logging.info(f'{dataset_file} exists, skipping') + continue + + if dataset == 'amrfinder': + logging.info(f'Setting up latest AMRFinder+ database') + prefix = 'amrfinderdb' + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'mkdir -p {prefix} {prefix}-temp', directory=amr_dir) + execute(f'amrfinder_update -d {prefix}-temp', directory=amr_dir) + latest_db = os.readlink(f'{amr_dir}/{prefix}-temp/latest') + execute(f'mv {latest_db}/* {prefix}/', directory=amr_dir) + execute(f'tar -czvf {prefix}.tar.gz {prefix}/', directory=amr_dir) + execute(f'rm -rf {prefix} {prefix}-temp', directory=amr_dir) + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > {prefix}-updated.txt', directory=amr_dir) + logging.info(f'AMRFinder+ database saved to {amr_dir}/{prefix}.tar.gz') + + +def setup_minmer(outdir, force=False): + """Download precomputed Refseq (Mash) and Genbank (Sourmash) datasets.""" + datasets = { + # Last updated: 2019-03-04 + 'genbank-k21.json.gz': 'https://osf.io/d7rv8/download', + 'genbank-k31.json.gz': 'https://osf.io/4f8n3/download', + 'genbank-k51.json.gz': 'https://osf.io/nemkw/download', + 'refseq-k21-s1000.msh': ( + 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + ) + } + + minmer_dir = f'{outdir}/minmer' + update_timestamp = False + if force: + logging.info(f'--force, removing existing {minmer_dir} setup') + execute(f'rm -rf {minmer_dir}') + + execute(f'mkdir -p {minmer_dir}') + for filename, url in datasets.items(): + filepath = f'{minmer_dir}/{filename}' + if os.path.exists(filepath): + if force: + logging.info(f'--force, removing existing {filepath} setup') + execute(f'rm -rf {filepath}') + update_timestamp = True + else: + logging.info(f'{filepath} exists, skipping') + continue + + execute(f'wget --quiet -O {filename} {url}', directory=minmer_dir) + + # Finish up + if update_timestamp or not os.path.exists(f'{minmer_dir}/minmer-updated.txt'): + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > minmer-updated.txt', + directory=minmer_dir) + + +def setup_plsdb(outdir, keep_files=False, force=False): + """Download precomputed PLSDB datasets.""" + url = 'https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/?zip' + plsdb_dir = f'{outdir}/plasmid' + if os.path.exists(plsdb_dir): + if force: + logging.info(f'--force, removing existing {plsdb_dir} setup') + execute(f'rm -rf {plsdb_dir}') + else: + 
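+            # An existing plasmid/ setup is reused as-is; call with force=True to re-download PLSDB.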
logging.info(f'{plsdb_dir} exists, skipping') + return None + + execute(f'mkdir -p {plsdb_dir}') + execute(f'wget --quiet -O plsdb.zip {url}', directory=plsdb_dir) + execute('unzip plsdb.zip', directory=plsdb_dir) + execute('ls > plsdb-orginal-names.txt', directory=plsdb_dir) + + # Rename files to generic prefix + mash_file = os.path.basename(glob.glob(f'{plsdb_dir}/*.msh')[0]) + prefix = mash_file.replace('.msh', '') + for plsdb_file in os.listdir(plsdb_dir): + if plsdb_file.startswith(prefix) and prefix != 'plsdb': + new_name = plsdb_file.replace(prefix, 'plsdb') + execute(f'mv {plsdb_file} {new_name}', directory=plsdb_dir) + + # Clean up + if not keep_files: + execute('rm plsdb.zip', directory=plsdb_dir) + + # Finish up + execute(f'date -u +"%Y-%m-%dT%H:%M:%SZ" > plsdb-updated.txt', + directory=plsdb_dir) + + +def create_summary(outdir, training_set=False): + """Create a summary of available datasets in JSON format.""" + from collections import OrderedDict + available_datasets = OrderedDict() + + available_datasets['antimicrobial-resistance'] = [] + available_datasets['ariba'] = [] + available_datasets['minmer'] = {'sketches': [], 'last_update': None} + available_datasets['plasmid'] = {'sketches': None, 'blastdb': None, 'last_update': None} + + # Antimicrobial Resistance + if os.path.exists(f'{outdir}/antimicrobial-resistance'): + for db in sorted(os.listdir(f'{outdir}/antimicrobial-resistance')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['antimicrobial-resistance'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/antimicrobial-resistance/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Ariba + if os.path.exists(f'{outdir}/ariba'): + for db in sorted(os.listdir(f'{outdir}/ariba')): + if db.endswith(".tar.gz"): + if db != 'EMPTY.tar.gz': + name = db.replace(".tar.gz", "") + available_datasets['ariba'].append({ + 'name': db, + 'last_update': execute( + f'head -n 1 {outdir}/ariba/{name}-updated.txt', capture=True + ).rstrip() + }) + + # Minmers + if os.path.exists(f'{outdir}/minmer/minmer-updated.txt'): + available_datasets['minmer'] = { + 'sketches': [], + 'last_update': execute( + f'head -n 1 {outdir}/minmer/minmer-updated.txt', capture=True + ).rstrip() + } + for sketch in sorted(os.listdir(f'{outdir}/minmer')): + if sketch != 'minmer-updated.txt': + available_datasets['minmer']['sketches'].append(sketch) + + # PLSDB (plasmids) + if os.path.exists(f'{outdir}/plasmid/plsdb-updated.txt'): + available_datasets['plasmid'] = { + 'sketches': 'plsdb.msh', + 'blastdb': 'plsdb.fna', + 'last_update': execute( + f'head -n 1 {outdir}/plasmid/plsdb-updated.txt', capture=True + ).rstrip() + } + + # Organism Specific + if os.path.exists(f'{outdir}/species-specific'): + available_datasets['species-specific'] = OrderedDict() + for species in sorted(os.listdir(f'{outdir}/species-specific')): + new_species = OrderedDict() + species_dir = f'{outdir}/species-specific/{species}' + + minmer = f'{species_dir}/minmer' + new_species['minmer'] = {'mash': None, 'last_updated': None} + if os.path.exists(f'{minmer}/refseq-genomes.msh'): + new_species['minmer'] = { + 'mash': f'species-specific/{species}/minmer/refseq-genomes.msh', + 'last_updated': execute( + f'head -n 1 {minmer}/minmer-updated.txt', + capture=True + ).rstrip() + } + + prokka = f'{species_dir}/annotation' + new_species['annotation'] = { 'proteins': None, 'training_set': None, 'last_updated': None} + if os.path.exists(f'{prokka}/proteins.faa'): + 
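+            # A proteins.faa file marks a completed annotation dataset; record its path and last-update timestamp below.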
new_species['annotation'] = { + 'proteins': f'species-specific/{species}/annotation/proteins.faa', + 'last_updated': execute( + f'head -n 1 {prokka}/proteins-updated.txt', + capture=True + ).rstrip() + } + + if training_set: + if not os.path.exists(prokka): + execute(f'mkdir -p {prokka}') + execute(f'cp {training_set} {prokka}/prodigal.tf') + new_species['annotation']['training_set'] = f'species-specific/{species}/annotation/prodigal.tf' + + new_species['genome_size'] = {'min': None, 'median': None, 'mean': None, 'max': None} + if os.path.exists(f'{prokka}/genome_size.json'): + with open(f'{prokka}/genome_size.json', 'r') as gs_fh: + json_data = json.load(gs_fh) + new_species['genome_size'] = json_data + + mlst = f'{species_dir}/mlst' + new_species['mlst'] = {} + if os.path.exists(f'{mlst}'): + for schema in sorted(os.listdir(f'{mlst}')): + if os.path.exists(f'{mlst}/{schema}/{schema}-ariba.tar.gz'): + new_species['mlst'][schema] = { + 'ariba': f'species-specific/{species}/mlst/{schema}/{schema}-ariba.tar.gz', + 'blast': f'species-specific/{species}/mlst/{schema}/{schema}-blastdb.tar.gz', + 'last_updated': execute( + f'head -n 1 {mlst}/{schema}/mlst-updated.txt', capture=True + ).rstrip() + } + + optionals = sorted([ + 'reference-genomes', 'mapping-sequences', 'blast' + ]) + new_species['optional'] = OrderedDict() + for optional in optionals: + # These are optional directories users can add data to + optional_dir = f'species-specific/{species}/optional/{optional}' + if not os.path.exists(optional_dir): + execute(f'mkdir -p {optional_dir}', directory=outdir) + if optional == 'blast': + new_species['optional'][optional] = [ + f'{optional_dir}/genes', + f'{optional_dir}/primers', + f'{optional_dir}/proteins', + ] + for blast_dir in new_species['optional'][optional]: + execute(f'mkdir -p {blast_dir}', directory=outdir) + else: + new_species['optional'][optional] = f'{optional_dir}' + + available_datasets['species-specific'][species] = new_species + + with open(f'{outdir}/summary.json', 'w') as json_handle: + logging.info(f'Writing summary of available datasets') + json.dump(available_datasets, json_handle, indent=4) + logging.debug(json.dumps(available_datasets, indent=4)) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, error_ok=False): + """A simple wrapper around executor.""" + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + except ExternalCommandFailed as e: + if "No downloads matched your filter" in e.error_message and error_ok: + return None + else: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + import textwrap + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Setup public datasets for Bactopia' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} + {PROGRAM} --ariba 'vfdb_core' + {PROGRAM} --species 
'Staphylococcus aureus' --include_genus + ''') + ) + + parser.add_argument( + 'pubmlst', metavar="PUBMLST", type=str, + help='Bactopia config file with PubMLST schema mappings for Ariba.' + ) + + parser.add_argument( + '--outdir', metavar="STR", type=str, default="./datasets", + help='Directory to write output. (Default ./datasets)' + ) + + group1 = parser.add_argument_group('Ariba Reference Datasets') + group1.add_argument( + '--skip_ariba', action='store_true', + help=('Skip setup of Ariba datasets') + ) + group1.add_argument( + '--ariba', metavar="STR", type=str, default='vfdb_core,card', + help=('Comma separated list of Ariba datasets to download and setup. ' + 'Available datasets include: argannot, card, ncbi, megares, ' + 'plasmidfinder, resfinder, srst2_argannot, vfdb_core, vfdb_full, ' + 'virulencefinder (Default: "vfdb_core,card") Use --available_datasets ' + 'to see the full list.') + ) + + group2 = parser.add_argument_group('Bacterial Species') + group2.add_argument( + '--species', metavar="STR", type=str, + help=('Download available MLST schemas and completed genomes for ' + 'a given species or a list of species in a text file.') + ) + group2.add_argument( + '--skip_mlst', action='store_true', + help=('Skip setup of MLST schemas for each species') + ) + + group3 = parser.add_argument_group('Custom Prokka Protein FASTA') + group3.add_argument( + '--skip_prokka', action='store_true', + help=('Skip creation of a Prokka formatted fasta for each species') + ) + group3.add_argument( + '--include_genus', action='store_true', + help=('Include all genus members in the Prokka proteins FASTA') + ) + group3.add_argument( + '--assembly_level', default='complete', type=str, + choices=['all', 'complete', 'chromosome', 'scaffold', 'contig'], + help=('Assembly levels of genomes to download (Default: complete).') + ) + group3.add_argument( + '--limit', metavar="INT", type=int, default=1000, + help=('If available completed genomes exceeds a given limit, a random ' + 'subsample will be taken. (Default 1000)') + ) + group3.add_argument( + '--accessions', metavar="STR", type=str, + help=('A list of RefSeq accessions to download.') + ) + group3.add_argument( + '--identity', metavar="FLOAT", type=float, default=0.9, + help=('CD-HIT (-c) sequence identity threshold. (Default: 0.9)') + ) + group3.add_argument( + '--overlap', metavar="FLOAT", type=float, default=0.8, + help=('CD-HIT (-s) length difference cutoff. (Default: 0.8)') + ) + group3.add_argument( + '--max_memory', metavar="INT", type=int, default=0, + help=('CD-HIT (-M) memory limit (in MB). (Default: unlimited') + ) + group3.add_argument( + '--fast_cluster', action='store_true', + help=("Use CD-HIT's (-g 0) fast clustering algorithm, instead of the " + "accurate but slow algorithm.") + ) + + + group4 = parser.add_argument_group('Minmer Datasets') + group4.add_argument( + '--skip_minmer', action='store_true', + help='Skip download of pre-computed minmer datasets (mash, sourmash)' + ) + + group5 = parser.add_argument_group('PLSDB (Plasmid) BLAST/Sketch') + group5.add_argument( + '--skip_plsdb', action='store_true', + help='Skip download of pre-computed PLSDB datbases (blast, mash)' + ) + + group6 = parser.add_argument_group('Antimicrobial Resistance Datasets') + group6.add_argument( + '--skip_amr', action='store_true', + help='Skip download of antimicrobial resistance databases (e.g. 
AMRFinder+)' + ) + + group7 = parser.add_argument_group('Optional User Provided Datasets') + group7.add_argument( + '--prodigal_tf', metavar="STR", type=str, + help=("A pre-built Prodigal training file to add to the species " + "annotation folder. Requires a single species (--species) and " + "will replace existing training files.") + ) + + group7.add_argument( + '--reference', metavar="STR", type=str, + help=("A reference genome (FASTA/GenBank (preferred)) file or directory " + "to be added to the optional folder for variant calling. Requires " + "a single species (--species).") + ) + group7.add_argument( + '--mapping', metavar="STR", type=str, + help=("A reference sequence (FASTA) file or directory to be added to the " + "optional folder for mapping. Requires a single species (--species).") + ) + group7.add_argument( + '--genes', metavar="STR", type=str, + help=("A gene sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--proteins', metavar="STR", type=str, + help=("A protein sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--primers', metavar="STR", type=str, + help=("A primer sequence (FASTA) file or directory to be added to the " + "optional folder for BLAST. Requires a single species (--species).") + ) + group7.add_argument( + '--force_optional', action='store_true', + help='Overwrite any existing files in the optional folders' + ) + + group8 = parser.add_argument_group('Custom Options') + group8.add_argument( + '--cpus', metavar="INT", type=int, default=1, + help=('Number of cpus to use. (Default: 1)') + ) + group8.add_argument('--clear_cache', action='store_true', + help='Remove any existing cache.') + + group8.add_argument('--force', action='store_true', + help='Forcibly overwrite existing datasets.') + group8.add_argument('--force_ariba', action='store_true', + help='Forcibly overwrite existing Ariba datasets.') + group8.add_argument('--force_mlst', action='store_true', + help='Forcibly overwrite existing MLST datasets.') + group8.add_argument('--force_prokka', action='store_true', + help='Forcibly overwrite existing Prokka datasets.') + group8.add_argument('--force_minmer', action='store_true', + help='Forcibly overwrite existing minmer datasets.') + group8.add_argument('--force_plsdb', action='store_true', + help='Forcibly overwrite existing PLSDB datasets.') + group8.add_argument('--force_amr', action='store_true', + help='Forcibly overwrite existing antimicrobial resistance datasets.') + group8.add_argument( + '--keep_files', action='store_true', + help=('Keep all downloaded and intermediate files.') + ) + group8.add_argument( + '--available_datasets', action='store_true', + help=('List Ariba reference datasets and MLST schemas ' + 'available for setup.') + ) + + group8.add_argument('--depends', action='store_true', + help='Verify dependencies are installed.') + + group9 = parser.add_argument_group('Adjust Verbosity') + group9.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + group9.add_argument('--verbose', action='store_true', + help='Print debug related text.') + group9.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' 
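+    # Log records use FORMAT above with second-resolution timestamps; set_log_level()
+    # maps --silent to ERROR and --verbose to DEBUG, otherwise INFO is used.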
+ logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + if args.depends: + validate_requirements() + sys.exit(0) + else: + validate_requirements() + + ARIBA, PUBMLST = get_available_datasets(args.pubmlst, args.clear_cache) + if args.available_datasets: + available_datasets(ARIBA, PUBMLST) + + species_key = None + num_species = 0 + if args.species: + species_key = validate_species(args.species) + num_species = len(species_key.keys()) + + if args.include_genus: + if not num_species: + logging.error(f'Species (--species) not given, ignoring --include_genus') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --include_genus') + sys.exit(1) + + if args.prodigal_tf: + if not os.path.exists(args.prodigal_tf): + logging.error(f'Unable to locate {args.prodigal_tf}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --prodigal_tf') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --prodigal_tf') + sys.exit(1) + + if args.accessions: + if not os.path.exists(args.accessions): + logging.error(f'Unable to locate {args.accessions}, please verify path') + sys.exit(1) + elif not num_species: + logging.error(f'A single species (--species) must be given to use --accessions') + sys.exit(1) + elif num_species > 1: + logging.error(f'Only a single species (given {num_species}) can be used with --accessions') + sys.exit(1) + + if not args.skip_ariba: + if args.ariba: + logging.info('Setting up Ariba datasets') + setup_ariba( + args.ariba, ARIBA, args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_ariba) + ) + else: + logging.info('No requests for an Ariba dataset, skipping') + else: + logging.info('Skipping Ariba dataset step') + + if not args.skip_minmer: + logging.info('Setting up pre-computed Genbank/Refseq minmer datasets') + setup_minmer(args.outdir, force=(args.force or args.force_minmer)) + else: + logging.info('Skipping minmer dataset step') + + if not args.skip_plsdb: + logging.info('Setting up pre-computed PLSDB (plasmids) datasets') + setup_plsdb(args.outdir, keep_files=args.keep_files, + force=(args.force or args.force_plsdb)) + else: + logging.info('Skipping PLSDB (plasmids) dataset step') + + if not args.skip_amr: + logging.info('Setting up antimicrobial resistance datasets') + setup_amr(args.outdir, force=(args.force or args.force_amr)) + else: + logging.info('Skipping antimicrobial resistance dataset step') + + # Organism datasets + if args.species: + species_dir = f'{args.outdir}/species-specific' + + if not args.skip_mlst: + logging.info('Setting up MLST datasets') + setup_mlst(args.species, PUBMLST, species_dir, + force=(args.force or args.force_mlst), species_key=species_key) + + if not args.skip_prokka: + logging.info('Setting up custom Prokka proteins') + setup_prokka( + args.species, PUBMLST, species_dir, cpus=args.cpus, + include_genus=args.include_genus, limit=args.limit, + user_accessions=args.accessions, identity=args.identity, + overlap=args.overlap, max_memory=args.max_memory, + fast_cluster=args.fast_cluster, keep_files=args.keep_files, + force=(args.force or args.force_prokka), species_key=species_key, + assembly_level=args.assembly_level + ) + else: + logging.info('Skipping custom Prokka dataset step') + else: + logging.info('No requests for an species, skipping') + + 
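+    # create_summary() (defined above) walks the dataset directories and writes
+    # {outdir}/summary.json; when --prodigal_tf is given, the training file is also
+    # copied into the species annotation folder as prodigal.tf. The JSON is roughly
+    # shaped like:
+    #   {
+    #     "antimicrobial-resistance": [{"name": ..., "last_update": ...}],
+    #     "ariba": [{"name": ..., "last_update": ...}],
+    #     "minmer": {"sketches": [...], "last_update": ...},
+    #     "plasmid": {"sketches": ..., "blastdb": ..., "last_update": ...},
+    #     "species-specific": {"<species>": {"minmer": ..., "annotation": ...,
+    #                                        "genome_size": ..., "mlst": ..., "optional": ...}}
+    #   }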
create_summary(args.outdir, training_set=args.prodigal_tf) diff --git a/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-prepare.py b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-prepare.py new file mode 100755 index 000000000..9efcc8fab --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-prepare.py @@ -0,0 +1,272 @@ +#! /usr/bin/env python3 +""" +usage: bactopia prepare [-h] [-f STR] [-a STR] [--fastq_seperator STR] + [--fastq_pattern STR] [--pe1_pattern STR] + [--pe2_pattern STR] [--assembly_pattern STR] [-r] + [--long_reads] [--merge] [--prefix STR] [--version] + STR + +bactopia prepare - Read a directory and prepare a FOFN of +FASTQs/FASTAs + +positional arguments: + STR Directory where FASTQ files are stored + +optional arguments: + -h, --help show this help message and exit + -f STR, --fastq_ext STR + Extension of the FASTQs. Default: .fastq.gz + -a STR, --assembly_ext STR + Extension of the FASTA assemblies. Default: .fna.gz + --fastq_seperator STR + Split FASTQ name on the last occurrence of the + separator. Default: _ + --fastq_pattern STR Glob pattern to match FASTQs. Default: *.fastq.gz + --pe1_pattern STR Designates difference first set of paired-end reads. + Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a) + --pe2_pattern STR Designates difference second set of paired-end reads. + Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b) + --assembly_pattern STR + Glob pattern to match assembly FASTAs. Default: + *.fna.gz + -r, --recursive Directories will be traversed recursively + --long_reads Single-end reads should be treated as long reads + --merge Flag samples with multiple read sets to be merged by + Bactopia + --prefix STR Replace the absolute path with a given string. + Default: Use absolute path + --version show program's version number and exit +""" +VERSION = "1.6.0" +PROGRAM = "bactopia prepare" + + +def search_path(path, pattern, recursive=False): + from pathlib import Path + if recursive: + return Path(path).rglob(pattern) + else: + return Path(path).glob(pattern) + + +def get_path(fastq, abspath, prefix): + fastq_path = str(fastq.absolute()) + if prefix: + return fastq_path.replace(abspath, prefix.rstrip("/")) + return fastq_path + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import glob + import os + import re + import sys + + parser = ap.ArgumentParser( + prog='bactopia prepare', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Read a directory and prepare a FOFN of FASTQs/FASTAs' + ) + ) + parser.add_argument('path', metavar="STR", type=str, + help='Directory where FASTQ files are stored') + parser.add_argument( + '-f', '--fastq_ext', metavar='STR', type=str, + default=".fastq.gz", + help='Extension of the FASTQs. Default: .fastq.gz' + ) + parser.add_argument( + '-a', '--assembly_ext', metavar='STR', type=str, + default=".fna.gz", + help='Extension of the FASTA assemblies. Default: .fna.gz' + ) + parser.add_argument( + '--fastq_seperator', metavar='STR', type=str, + default="_", + help='Split FASTQ name on the last occurrence of the separator. Default: _' + ) + + parser.add_argument( + '--fastq_pattern', metavar='STR', type=str, + default="*.fastq.gz", + help='Glob pattern to match FASTQs. Default: *.fastq.gz' + ) + + parser.add_argument( + '--pe1_pattern', metavar='STR', type=str, default="[Aa]|[Rr]1", + help='Designates difference first set of paired-end reads. 
Default: ([Aa]|[Rr]1) (R1, r1, 1, A, a)' + ) + + parser.add_argument( + '--pe2_pattern', metavar='STR', type=str, default="[Bb]|[Rr]2", + help='Designates difference second set of paired-end reads. Default: ([Bb]|[Rr]2) (R2, r2, 2, AB b)' + ) + + parser.add_argument( + '--assembly_pattern', metavar='STR', type=str, + default="*.fna.gz", + help='Glob pattern to match assembly FASTAs. Default: *.fna.gz' + ) + + parser.add_argument( + '-r', '--recursive', action='store_true', + help='Directories will be traversed recursively' + ) + + parser.add_argument( + '--long_reads', action='store_true', + help='Single-end reads should be treated as long reads' + ) + + parser.add_argument( + '--merge', action='store_true', + help='Flag samples with multiple read sets to be merged by Bactopia' + ) + + parser.add_argument( + '--prefix', metavar='STR', type=str, + help='Replace the absolute path with a given string. Default: Use absolute path' + ) + + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + abspath = os.path.abspath(args.path) + SAMPLES = {} + + # Match FASTQS + for fastq in search_path(abspath, args.fastq_pattern, recursive=args.recursive): + fastq_name = fastq.name.replace(args.fastq_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + split_vals = fastq_name.rsplit(args.fastq_seperator, 1) + sample_name = split_vals[0] + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': {'r1': [], 'r2': []}, 'se': [], 'assembly': []} + + if len(split_vals) == 1: + # single-end + SAMPLES[sample_name]['se'].append(get_path(fastq, abspath, args.prefix)) + else: + # paired-end + pe1 = re.compile(args.pe1_pattern) + pe2 = re.compile(args.pe2_pattern) + if pe1.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r1'].append(get_path(fastq, abspath, args.prefix)) + elif pe2.match(split_vals[1]): + SAMPLES[sample_name]['pe']['r2'].append(get_path(fastq, abspath, args.prefix)) + else: + print(f'ERROR: Could not determine read set for "{fastq_name}".', file=sys.stderr) + print(f'ERROR: Found {split_vals[1]} expected (R1: {args.pe1_pattern} or R2: {args.pe2_pattern})', file=sys.stderr) + print(f'ERROR: Please use --pe1_pattern and --pe2_pattern to correct and try again.', file=sys.stderr) + sys.exit(1) + + # Match assemblies + for assembly in glob.glob(f'{abspath}/**/*{args.assembly_pattern}', recursive=args.recursive): + sample_name = os.path.basename(assembly).replace(args.assembly_ext, "") + # Split the fastq file name on separator + # Example MY_FASTQ_R1.rsplit('_', 1) becomes ['MY_FASTQ', 'R1'] (PE) + # Example MY_FASTQ.rsplit('_', 1) becomes ['MY_FASTQ'] (SE) + if sample_name not in SAMPLES: + SAMPLES[sample_name] = {'pe': [], 'se': [], 'assembly': []} + SAMPLES[sample_name]['assembly'].append(get_path(assembly, abspath, args.prefix)) + + FOFN = [] + for sample, vals in sorted(SAMPLES.items()): + r1_reads = vals['pe']['r1'] + r2_reads = vals['pe']['r2'] + se_reads = vals['se'] + assembly = vals['assembly'] + errors = [] + is_single_end = False + multiple_read_sets = False + pe_count = len(r1_reads) + len(r2_reads) + + # Validate everything + if len(assembly) > 1: + # Can't have multiple assemblies for the same sample + errors.append(f'ERROR: "{sample}" cannot have more 
than two assembly FASTA, please check.') + elif len(assembly) == 1 and (pe_count or len(se_reads)): + # Can't have an assembly and reads for a sample + errors.append(f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.') + + if len(r1_reads) != len(r2_reads): + # PE reads must be a pair + errors.append(f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.') + elif pe_count > 2: + # PE reads must be a pair + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.') + + if args.long_reads: + if not pe_count and len(se_reads): + # Long reads must also have short PE reads + print(f'WARNING: "{sample}" does not have paired-end reads, treating as single-end short reads, please verify.', file=sys.stderr) + is_single_end = True + else: + if len(se_reads) > 1: + # Can't have multiple SE reads + if args.merge: + multiple_read_sets = True + else: + errors.append(f'ERROR: "{sample}" has more than two single-end FASTQs, please check.') + elif pe_count and len(se_reads): + # Can't have SE and PE reads unless long reads + errors.append(f'ERROR: "{sample}" has paired and single-end FASTQs, please check.') + + if errors: + print('\n'.join(errors), file=sys.stderr) + else: + runtype = '' + r1 = '' + r2 = '' + extra = '' + + if assembly: + runtype = 'assembly' + extra = assembly[0] + + if pe_count: + if multiple_read_sets: + if args.long_reads: + runtype = 'hybrid-merge-pe' + else: + runtype = 'merge-pe' + r1 = ','.join(sorted(r1_reads)) + r2 = ','.join(sorted(r2_reads)) + else: + runtype = 'paired-end' + r1 = r1_reads[0] + r2 = r2_reads[0] + + if se_reads: + if args.long_reads and not is_single_end: + runtype = 'hybrid' + extra = se_reads[0] + else: + if multiple_read_sets: + runtype = 'merge-se' + r1 = ','.join(se_reads) + else: + runtype = 'single-end' + r1 = se_reads[0] + + FOFN.append([sample, runtype, r1, r2, extra]) + + if FOFN: + print('sample\truntype\tr1\tr2\textra') + for line in FOFN: + print('\t'.join(line)) diff --git a/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-pull.py b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-pull.py new file mode 100755 index 000000000..7f62ebd5d --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-pull.py @@ -0,0 +1,223 @@ +#! /usr/bin/env python3 +""" +usage: bactopia pull [-h] [--envname STR] [--singularity_cache STR] + [--registry STR] [--max_retry INT] [--include_tools] + [--default] [--is_bactopia] [--force] [--verbose] + [--silent] [--version] + STR + +bactopia pull - Build Singularity images used by Bactopia + +positional arguments: + STR Directory containing Dockerfiles. + +optional arguments: + -h, --help show this help message and exit + --envname STR Build Singularity images with the given name + --singularity_cache STR + Directory where Singularity images will be stored. + --registry STR Docker registry to pull containers from + --max_retry INT Maximum times to attempt creating Conda environment. + (Default: 5) + --include_tools Singularity images for Bactopia Tools will also be + built. + --default Builds Singularity images to the default Bactopia + location. + --is_bactopia This is an automated call by bactopia not a user + --force Force overwrite of existing Conda environments. + --verbose Print debug related text. + --silent Only critical errors will be printed. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia pull" +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None, allow_fail=False): + """A simple wrapper around executor.""" + from executor import ExternalCommand, ExternalCommandFailed + try: + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + return True + except ExternalCommandFailed as e: + if allow_fail: + logging.log(STDERR, e) + sys.exit(e.returncode) + else: + return None + + +def get_docker_prefix(registry): + """Return the proper prefix based on registry.""" + if registry == "quay": + return 'quay.io' + elif registry == "github": + return 'ghcr.io' + else: + return '' + + +def check_needs_build(image, force=False, is_bactopia=False): + """Check if a new image needs to be built.""" + if os.path.exists(image) and not force: + if not is_bactopia: + logging.info(f'Existing image ({image}) found, skipping unless --force is used') + return False + return True + + +def build_singularity_image(image, docker, max_retry=5, force=False, is_bactopia=False): + """Build Conda env, with chance to retry.""" + force = '--force' if force else '' + if is_bactopia: + force = '--force' + retry = 0 + allow_fail = False + success = False + while not success: + result = execute(f'singularity build {force} {image} {docker}', allow_fail=allow_fail) + if not result: + if retry > max_retry: + allow_fail = True + retry += 1 + logging.log(STDERR, "Error creating image, retrying after short sleep.") + time.sleep(30 * retry) + else: + success = True + return success + + +if __name__ == '__main__': + import argparse as ap + import glob + import sys + import time + from pathlib import Path + + parser = ap.ArgumentParser( + prog='bactopia pull', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Build Singularity images used by Bactopia' + ) + ) + + parser.add_argument('docker', metavar="STR", type=str, + help='Directory containing Dockerfiles.') + parser.add_argument('--envname', metavar='STR', type=str, + help='Build Singularity images with the given name') + parser.add_argument('--singularity_cache', metavar='STR', type=str, default="~/.bactopia/singularity", + help='Directory where Singularity images will be stored.') + parser.add_argument('--registry', metavar='STR', type=str, default="dockerhub", + help='Docker registry to pull containers from') + parser.add_argument('--max_retry', metavar='INT', type=int, default=5, + help='Maximum times to attempt creating Conda environment. 
(Default: 5)') + parser.add_argument('--include_tools', action='store_true', + help='Singularity images for Bactopia Tools will also be built.') + parser.add_argument('--default', action='store_true', + help='Builds Singularity images to the default Bactopia location.') + parser.add_argument('--is_bactopia', action='store_true', + help='This is an automated call by bactopia not a user') + parser.add_argument('--force', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args, unknown = parser.parse_known_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + + env_path = os.path.abspath(os.path.expanduser(args.docker)) + install_path = os.path.abspath(os.path.expanduser(args.singularity_cache)) + finish_file = f'{install_path}/{args.registry}-images-built-{VERSION}.txt' + if os.path.exists(finish_file): + print(f'Found Singularity images in {install_path}, if a complete rebuild is needed please use --force') + + if not os.path.exists(install_path): + logging.info(f'Creating {install_path} to save images to') + execute(f'mkdir -p {install_path}') + + registry = get_docker_prefix(args.registry) + docker_prefix = f'docker://{registry}/bactopia' if registry else f'docker://bactopia' + env_files = sorted(glob.glob(f'{env_path}/linux/*.yml')) + if env_files: + for i, env_file in enumerate(env_files): + envname = os.path.basename(env_file).replace(".yml", "") + img_name = f"{install_path}/{registry}-bactopia-{envname}-{VERSION}.img" if registry else f"{install_path}/bactopia-{envname}-{VERSION}.img" + pull_name = f"{docker_prefix}/{envname}:{VERSION}" + build = True + if args.envname: + if not args.envname == envname: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {envname} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) + execute(f'touch {finish_file}') + else: + logging.error(f'Unable to find *.Dockerfiles in {env_path}, please verify') + sys.exit(1) + + if args.include_tools: + tool_path = os.path.abspath(args.conda_envs).replace('conda', 'tools') + tools = sorted(glob.glob(f'{tool_path}/*/')) + for i, tool in enumerate(tools): + tool = os.path.basename(os.path.dirname(tool)) + if not tool.startswith('.'): + img_name = f"{install_path}/{registry}-bactopia-tools-{tool}-{VERSION}.img" if registry else f"{install_path}/bactopia-tools-{tool}-{VERSION}.img" + pull_name = f"{docker_prefix}/tools-{tool}:{VERSION}" + build = True + if args.envname: + if not args.envname == tool: + build = False + + if build: + if check_needs_build(img_name, force=args.force, is_bactopia=args.is_bactopia): + logging.info(f'Found {tool} ({i+1} of {len(env_files)}), begin build to {img_name}') + + build_singularity_image(img_name, pull_name, max_retry=args.max_retry, force=args.force, + is_bactopia=args.is_bactopia) diff --git 
a/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-search.py b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-search.py new file mode 100755 index 000000000..e222c9908 --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-search.py @@ -0,0 +1,385 @@ +#! /usr/bin/env python3 +""" +Query Taxon ID or Study accession against ENA and return a list of WGS results. + +usage: bactopia search [-h] [--exact_taxon] [--outdir OUTPUT_DIRECTORY] + [--prefix PREFIX] [--limit INT] [--version] + STR + +bactopia search - Search ENA for associated WGS samples + +positional arguments: + STR Taxon ID or Study accession + +optional arguments: + -h, --help show this help message and exit + --exact_taxon Exclude Taxon ID descendents. + --outdir OUTPUT_DIRECTORY + Directory to write output. (Default: .) + --prefix PREFIX Prefix to use for output file names. (Default: ena) + --limit INT Maximum number of results to return. (Default: + 1000000) + --version show program's version number and exit + +example usage: + bactopia search PRJNA480016 --limit 20 + bactopia search 1280 --exact_taxon --limit 20' + bactopia search "staphylococcus aureus" --limit 20 + +""" +import os +import sys +VERSION = "1.6.0" +PROGRAM = "bactopia search" +ENA_URL = ('https://www.ebi.ac.uk/ena/portal/api/search') +FIELDS = [ + 'study_accession', 'secondary_study_accession', 'sample_accession', + 'secondary_sample_accession', 'experiment_accession', 'run_accession', + 'submission_accession', 'tax_id', 'scientific_name', + 'instrument_platform', 'instrument_model', 'library_name', + 'library_layout', 'nominal_length', 'library_strategy', + 'library_source', 'library_selection', 'read_count', + 'base_count', 'center_name', 'first_public', 'last_updated', + 'experiment_title', 'study_title', 'study_alias', 'experiment_alias', + 'run_alias', 'fastq_bytes', 'fastq_md5', 'fastq_ftp', 'fastq_aspera', + 'fastq_galaxy', 'submitted_bytes', 'submitted_md5', 'submitted_ftp', + 'submitted_aspera', 'submitted_galaxy', 'submitted_format', + 'sra_bytes', 'sra_md5', 'sra_ftp', 'sra_aspera', 'sra_galaxy', + 'cram_index_ftp', 'cram_index_aspera', 'cram_index_galaxy', + 'sample_alias', 'broker_name', 'sample_title', 'first_created' +] + + +def ena_search(query, is_accession, limit=1000000): + """USE ENA's API to retreieve the latest results.""" + import requests + import time + + # ENA browser info: http://www.ebi.ac.uk/ena/about/browser + query_original = query + data = { + 'dataPortal': 'ena', + 'dccDataOnly': 'false', + 'download': 'false', + 'result': 'read_run', + 'format': 'tsv', + 'limit': limit, + 'fields': ",".join(FIELDS) + } + if is_accession: + data['includeAccessions'] = query + else: + data['query'] = ( + f'"{query} AND library_source=GENOMIC AND ' + '(library_strategy=OTHER OR library_strategy=WGS OR ' + 'library_strategy=WGA) AND (library_selection=MNase OR ' + 'library_selection=RANDOM OR library_selection=unspecified OR ' + 'library_selection="size fractionation")"' + ) + + headers = {'accept': '*/*', 'Content-type': 'application/x-www-form-urlencoded'} + response = requests.post(ENA_URL, headers=headers, data=data) + time.sleep(1) + if not response.text: + print(f'WARNING: {query_original} did not return any results from ENA.', file=sys.stderr) + return [[], []] + else: + results = response.text.rstrip().split('\n') + return [results[0], results[1:]] + + +def parse_accessions(results, min_read_length=None, min_base_count=None): + """Parse Illumina experiment accessions from the ENA 
results.""" + accessions = [] + filtered = {'min_base_count': 0, 'min_read_length': 0, 'technical': 0, 'filtered': []} + for line in results: + if line.startswith(FIELDS[0]): + continue + else: + col_vals = line.split('\t') + if len(col_vals) == len(FIELDS): + c = dict(zip(FIELDS, col_vals)) + if c['instrument_platform'] == "ILLUMINA": + passes = True + reason = [] + if not c['fastq_bytes']: + passes = False + reason.append(f'Missing FASTQs') + filtered['technical'] += 1 + else: + if min_read_length: + total_fastqs = len(c['fastq_bytes'].rstrip(';').split(';')) + read_length = int(float(c['base_count']) / (float(c['read_count']) * total_fastqs)) + if read_length < min_read_length: + passes = False + reason.append(f'Failed mean read length ({read_length} bp) filter, expected > {min_read_length} bp') + filtered['min_read_length'] += 1 + + if min_base_count: + if float(c['base_count']) < min_base_count: + passes = False + reason.append(f'Failed base count ({c["base_count"]} bp) filter, expected > {min_base_count} bp') + filtered['min_base_count'] += 1 + + if passes: + accessions.append(c['experiment_accession']) + else: + filtered['filtered'].append({ + 'accession': c['experiment_accession'], + 'reason': ';'.join(reason) + }) + + return [list(set(accessions)), filtered] + + +def is_biosample(accession): + """Check if input accession is a BioSample.""" + import re + if re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', accession): + return True + return False + + +def chunks(l, n): + """ + Yield successive n-sized chunks from l. + https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top + """ + for i in range(0, len(l), n): + yield l[i:i + n] + + +def parse_query(q, accession_limit, exact_taxon=False): + """Return the query based on if Taxon ID or BioProject/Study accession.""" + import re + queries = [] + if os.path.exists(q): + with open(q, 'r') as handle: + for line in handle: + line = line.rstrip() + if line: + queries.append(line) + elif "," in q: + queries = q.split(',') + else: + queries.append(q) + + results = [] + accessions = [] + + for query in queries: + try: + taxon_id = int(query) + if exact_taxon: + results.append(['taxon', f'tax_eq({taxon_id})']) + else: + results.append(['taxon', f'tax_tree({taxon_id})']) + except ValueError: + # It is a accession or scientific name + # Test Accession + # Thanks! 
https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers + if re.match(r'PRJ[E|D|N][A-Z][0-9]+|[E|D|S]RP[0-9]{6,}', query): + accessions.append(query) + elif re.match(r'SAM(E|D|N)[A-Z]?[0-9]+|(E|D|S)RS[0-9]{6,}', query): + results.append(['biosample', f'(sample_accession={query} OR secondary_sample_accession={query})']) + elif re.match(r'(E|D|S)RR[0-9]{6,}', query): + accessions.append(query) + else: + # Assuming it is a scientific name + results.append(['taxon', f'tax_name("{query}")']) + + # Split the accessions into set number + for chunk in chunks(accessions, accession_limit): + results.append(['accession', ','.join(chunk)]) + + return results + + +if __name__ == '__main__': + import argparse as ap + import datetime + import random + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia search', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Search ENA for associated WGS samples' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(f''' + example usage: + {PROGRAM} PRJNA480016 --limit 20 + {PROGRAM} 1280 --exact_taxon --limit 20' + {PROGRAM} "staphylococcus aureus" --limit 20 + {PROGRAM} SAMN01737350 + {PROGRAM} SRR578340 + {PROGRAM} SAMN01737350,SRR578340 + {PROGRAM} accessions.txt + ''') + ) + parser.add_argument('query', metavar="STR", type=str, + help=('Taxon ID or Study, BioSample, or Run accession (can also be comma ' + 'separated or a file of accessions)') + ) + parser.add_argument( + '--exact_taxon', action='store_true', help='Exclude Taxon ID descendents.' + ) + parser.add_argument( + '--outdir', metavar="OUTPUT_DIRECTORY", type=str, default=".", + help='Directory to write output. (Default: .)' + ) + parser.add_argument( + '--prefix', metavar="PREFIX", type=str, default="ena", + help='Prefix to use for output file names. (Default: ena)' + ) + parser.add_argument( + '--limit', metavar="INT", type=int, default=1000000, + help='Maximum number of results (per query) to return. (Default: 1000000)' + ) + parser.add_argument( + '--accession_limit', metavar="INT", type=int, default=5000, + help='Maximum number of accessions to query at once. (Default: 5000)' + ) + + parser.add_argument( + '--biosample_subset', metavar="INT", type=int, default=0, + help='If a BioSample has multiple Experiments, pick a random subset. (Default: Return All)' + ) + + parser.add_argument( + '--min_read_length', metavar="INT", type=int, + help='Filters samples based on minimum mean read length. (Default: No filter)' + ) + parser.add_argument( + '--min_base_count', metavar="INT", type=int, + help='Filters samples based on minimum basepair count. (Default: No filter)' + ) + parser.add_argument( + '--min_coverage', metavar="INT", type=int, + help='Filter samples based on minimum coverage (requires --genome_size)' + ) + parser.add_argument( + '--genome_size', metavar="INT", type=int, + help='Genome size to estimate coverage (requires --coverage)' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + min_read_length = args.min_read_length + min_base_count = args.min_base_count + if not os.path.exists(args.outdir): + os.makedirs(args.outdir, exist_ok=True) + + if args.min_coverage and args.genome_size: + if args.min_base_count: + print("--min_base_count cannot be used with --coverage/--genome_size. 
Exiting...", + file=sys.stderr) + sys.exit(1) + else: + min_base_count = args.min_coverage * args.genome_size + elif args.min_coverage or args.genome_size: + print("--coverage and --genome_size must be used together. Exiting...", + file=sys.stderr) + sys.exit(1) + + if args.biosample_subset > 0: + if not is_biosample(args.query): + print("--biosample_subset requires a single BioSample. Input query: {args.query} is not a BioSample. Exiting...", + file=sys.stderr) + sys.exit(1) + + today = datetime.datetime.now().replace(microsecond=0).isoformat() + results = [] + result_header = None + accessions = [] + filtered = {'min_base_count':0, 'min_read_length':0, 'technical':0, 'filtered': {}} + summary = [] + queries = parse_query(args.query, args.accession_limit, exact_taxon=args.exact_taxon) + i = 1 + results_file = f'{args.outdir}/{args.prefix}-results.txt' + accessions_file = f'{args.outdir}/{args.prefix}-accessions.txt' + filtered_file = f'{args.outdir}/{args.prefix}-filtered.txt' + for query_type, query in queries: + is_accession = True if query_type == 'accession' else False + query_header, query_results = ena_search(query, is_accession, limit=args.limit) + results = list(set(results + query_results)) + if not result_header: + result_header = query_header + query_accessions, query_filtered = parse_accessions(query_results, min_read_length=min_read_length, + min_base_count=min_base_count) + if len(query_accessions): + WARNING_MESSAGE = None + if query_type == 'biosample' and args.biosample_subset > 0: + if len(query_accessions) > args.biosample_subset: + WARNING_MESSAGE = f'WARNING: Selected {args.biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}' + query_accessions = random.sample(query_accessions, args.biosample_subset) + accessions = list(set(accessions + query_accessions)) + filtered['min_base_count'] += query_filtered['min_base_count'] + filtered['min_read_length'] += query_filtered['min_read_length'] + filtered['technical'] += query_filtered['technical'] + for filtered_sample in query_filtered['filtered']: + filtered['filtered'][filtered_sample['accession']] = filtered_sample['reason'] + else: + if query_results: + WARNING_MESSAGE = f'WARNING: {query} did not return any Illumina results from ENA.' + else: + WARNING_MESSAGE = f'WARNING: {query} did not return any results from ENA.' 
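+        # Tallies are cumulative across queries: results and accessions are de-duplicated
+        # with set(), while filtered counts and per-accession reasons keep accumulating.
+        # The per-query summary lines built below are written to {prefix}-summary.txt once
+        # all queries have been processed.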
+ + # Create Summary + query_string = query + if query_type == 'accession': + total_accessions = len(query.split(',')) + if total_accessions > 5: + query_string = f"{total_accessions} accessions were queried" + else: + query_string = query + if len(queries) > 1: + summary.append(f'QUERY ({i} of {len(queries)}): {query_string}') + i += 1 + else: + summary.append(f'QUERY: {query_string}') + summary.append(f'DATE: {today}') + summary.append(f'LIMIT: {args.limit}') + summary.append(f'RESULTS: {len(query_results)} ({results_file})') + summary.append(f'ILLUMINA ACCESSIONS: {len(query_accessions)} ({accessions_file})') + + if WARNING_MESSAGE: + summary.append(f'\t{WARNING_MESSAGE}') + + if min_read_length or min_base_count: + summary.append(f'FILTERED ACCESSIONS: {len(filtered["filtered"])}') + if min_read_length: + summary.append(f'\tFAILED MIN READ LENGTH ({min_read_length} bp): {query_filtered["min_read_length"]}') + if min_base_count: + summary.append(f'\tFAILED MIN BASE COUNT ({min_base_count} bp): {query_filtered["min_base_count"]}') + else: + summary.append(f'FILTERED ACCESSIONS: no filters applied') + + summary.append(f'\tMISSING FASTQS: {filtered["technical"]}') + summary.append("") + + # Output the results + with open(results_file, 'w') as output_fh: + output_fh.write(f'{result_header}\n') + for result in results: + if result: + output_fh.write(f'{result}\n') + + with open(accessions_file, 'w') as output_fh: + for accession in accessions: + output_fh.write(f'{accession}\n') + + with open(filtered_file, 'w') as output_fh: + output_fh.write(f'accession\treason\n') + for accession, reason in filtered['filtered'].items(): + output_fh.write(f'{accession}\t{reason}\n') + + with open(f'{args.outdir}/{args.prefix}-summary.txt', 'w') as output_fh: + output_fh.write('\n'.join(summary)) diff --git a/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-summary.py b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-summary.py new file mode 100755 index 000000000..fabf925c2 --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-summary.py @@ -0,0 +1,63 @@ +#! 
/usr/bin/env python3 +""" + + +""" +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia summary" +DESCRIPTION = 'Create a summary report for samples' + +def get_output_files(): + """Return a dictionary opf output files to include in the summary.""" + """ + ${SAMPLE_NAME}/ + ├── annotation + ├── antimicrobial_resistance + ├── ariba + ├── assembly + ├── blast + ├── kmers + ├── logs + ├── mapping + ├── minmers + ├── mlst + ├── quality-control + ├── variants + └── ${SAMPLE_NAME}-genome-size.txt + """ + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + citations = validate_args(args.bactopia) + + for ref_type, entries in sorted(citations.items()): + print(f'# {ref_type} potentially used by Bactopia') + print('# ----------') + for entry in entries: + print(f'## {entry["name"]}') + print(textwrap.fill(entry['citation'], width=100)) + print() diff --git a/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-tools.py b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-tools.py new file mode 100755 index 000000000..4f3bdcfe0 --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-tools.py @@ -0,0 +1,202 @@ +#! /usr/bin/env python3 +""" +usage: bactopia tools [-h] [--bactopia STR] [--version] STR + +bactopia tools - A suite of comparative analyses for Bactopia outputs + +positional arguments: + STR Name of the Bactopia tool to execute. + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" +import logging +import os +import sys + +STDOUT = 11 +STDERR = 12 +logging.addLevelName(STDOUT, "STDOUT") +logging.addLevelName(STDERR, "STDERR") + +VERSION = "1.6.0" +PROGRAM = "bactopia tools" +DESCRIPTION = 'A suite of comparative analyses for Bactopia outputs' +AVAILABLE_TOOLS = { + 'eggnog': {'info': 'Functional annotation using orthologous groups', 'mac': True}, + 'fastani': {'info': 'Pairwise average nucleotide identity', 'mac': True}, + 'gtdb': {'info': 'Identify marker genes and assign taxonomic classifications', 'mac': False}, + 'ismapper': {'info': 'Identify positions of insertion sites', 'mac': True}, + 'mashtree': {'info': 'Trees based on Mash distances', 'mac': True}, + 'pirate': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'phyloflash': {'info': '16s assembly, alignment and tree', 'mac': True}, + 'roary': {'info': 'Pan-genome with optional core-genome tree', 'mac': True}, + 'summary': {'info': 'A report summarizing Bactopia project', 'mac': True}, +} + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def print_available_tools(): + """Print the available Bactopia Tools.""" + print(f"{PROGRAM} (v{VERSION}) - {DESCRIPTION}") + print("") + print(available_tools()) + + +def available_tools(): + """Return a string of available tools.""" + usage = ['Available Tools:'] + for k,v in sorted(AVAILABLE_TOOLS.items()): + usage.append(f' {k: <12}{v["info"]}') + return '\n'.join(usage) + + +def set_log_level(error, debug): + """Set the output log level.""" + return logging.ERROR if error else logging.DEBUG if debug else logging.INFO + + +def check_md5sum(expected_md5, current_md5): + """Compare the two md5 files to see if a rebuild is needed.""" + expected = None + current = None + with open(expected_md5, 'r') as f: + expected = f.readline().rstrip() + + with open(current_md5, 'r') as f: + current = f.readline().rstrip() + + return expected == current + + +def get_log_level(): + """Return logging level name.""" + return logging.getLevelName(logging.getLogger().getEffectiveLevel()) + + +def execute(cmd, directory=os.getcwd(), capture=False, stdout_file=None, + stderr_file=None): + """A simple wrapper around executor.""" + from executor import ExternalCommand + command = ExternalCommand( + cmd, directory=directory, capture=True, capture_stderr=True, + stdout_file=stdout_file, stderr_file=stderr_file + ) + + command.start() + if get_log_level() == 'DEBUG': + logging.log(STDOUT, command.decoded_stdout) + logging.log(STDERR, command.decoded_stderr) + + if capture: + return command.decoded_stdout + + +def validate_args(tool, bactopia_repo, skip_conda=False, force_rebuild=False): + import os + platform = get_platform() + + if tool not in AVAILABLE_TOOLS: + print(f'"{tool}" is not available.\n', file=sys.stderr) + print(available_tools(), file=sys.stderr) + sys.exit(1) + elif platform == 'mac' and not AVAILABLE_TOOLS[tool]['mac']: + print(f'"{tool}" is not available on Mac OSX.\n', file=sys.stderr) + sys.exit() + tool_nf = f'{bactopia_repo}/tools/{tool}/main.nf' + if not os.path.exists(tool_nf): + print(f"cannot access '{tool_nf}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + + conda_prefix = 
f'{bactopia_repo}/tools/{tool}/environment-linux' + if platform == 'mac': + conda_prefix = f'{bactopia_repo}/tools/{tool}/environment-osx' + + if skip_conda: + return f"{tool_nf}" + else: + # Check if conda env exists + major, minor, patch = VERSION.split('.') + CONTAINER_VERSION = f'{major}.{minor}.x' + needs_build = False + condadir = f'{bactopia_repo}/conda/envs/tools-{tool}-{CONTAINER_VERSION}' + envbuilt_file = f'{condadir}/env-built.txt' + if os.path.exists(envbuilt_file) and not force_rebuild: + build_is_current = check_md5sum(f'{conda_prefix}.md5', envbuilt_file) + if build_is_current: + logging.info(f'Existing env ({condadir}) found, skipping unless --force_rebuild is used') + else: + needs_build = True + force_rebuild = True + logging.info(f'Existing env ({condadir}) is out of sync, it will be updated') + else: + needs_build = True + + if needs_build: + logging.info(f'Found {conda_prefix}.yml, begin build to {condadir}') + force = '--force' if force_rebuild else '' + execute(f'conda env create -f {conda_prefix}.yml --prefix {condadir} {force}') + execute(f'cp {conda_prefix}.md5 {envbuilt_file}') + + return f"{tool_nf} --condadir {condadir}" + + +if __name__ == '__main__': + import argparse as ap + import textwrap + + parser = ap.ArgumentParser( + prog='bactopia tools', + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter, + epilog=available_tools() + ) + parser.add_argument('tool', metavar="STR", type=str, + help='Name of the Bactopia tool to execute.') + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--force_rebuild', action='store_true', + help='Force overwrite of existing Conda environments.') + parser.add_argument('--skip_conda', action='store_true', + help='Skip all things conda related.') + parser.add_argument('--verbose', action='store_true', + help='Print debug related text.') + parser.add_argument('--silent', action='store_true', + help='Only critical errors will be printed.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + print_available_tools() + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + # Setup logs + FORMAT = '%(asctime)s:%(name)s:%(levelname)s - %(message)s' + logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S',) + logging.getLogger().setLevel(set_log_level(args.silent, args.verbose)) + print(validate_args( + args.tool, args.bactopia, + skip_conda=args.skip_conda, + force_rebuild=args.force_rebuild + )) diff --git a/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-versions.py b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-versions.py new file mode 100755 index 000000000..edc899880 --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/helpers/bactopia-versions.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python3 +""" +usage: bactopia versions [-h] [--bactopia STR] [--version] STR + +bactopia versions - Prints the version of tools used by Bactopia + +optional arguments: + -h, --help show this help message and exit + --bactopia STR Directory where Bactopia repository is stored. 
+ --version show program's version number and exit +""" + +import os +import sys + +VERSION = "1.6.0" +PROGRAM = "bactopia versions" +DESCRIPTION = 'Prints the version of tools used by Bactopia' + + +def get_platform(): + from sys import platform + if platform == "darwin": + return 'mac' + elif platform == "win32": + # Windows is not supported + print("Windows is not supported.", file=sys.stderr) + sys.exit(1) + return 'linux' + + +def validate_args(bactopia_repo): + import json + + bactopia_json = f'{bactopia_repo}/conda/bactopia-programs.json' + if not os.path.exists(bactopia_json): + print(f"cannot access '{bactopia_json}': No such file or directory\n", + file=sys.stderr) + print("Please make sure the correct path to Bactopia's repo is given.", + file=sys.stderr) + sys.exit(1) + else: + with open(bactopia_json, 'rt') as json_fh: + return json.load(json_fh) + + +def read_yaml(yaml): + versions = {} + with open(yaml, 'rt') as yaml_fh: + for line in yaml_fh: + line = line.strip() + if '=' in line: + program, version = line.replace('- ', '').split('=')[0:2] + versions[program] = version + return versions + + +if __name__ == '__main__': + import argparse as ap + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - {DESCRIPTION}' + ), + formatter_class=ap.RawDescriptionHelpFormatter + ) + parser.add_argument('--bactopia', metavar="STR", type=str, + help='Directory where Bactopia repository is stored.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + ostype = get_platform() + tools = validate_args(args.bactopia) + + conda_dir = f'{args.bactopia}/conda/{ostype}' + yamls = [f'{f.name}' for f in os.scandir(conda_dir) if f.name.endswith('.yml')] + versions = {} + for yaml in yamls: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions = {} + for tool, info in sorted(tools.items()): + yaml = info['conda']['yaml'] + if yaml not in versions: + if yaml.startswith("tools"): + versions[yaml] = read_yaml(f'{args.bactopia}/{yaml}') + else: + versions[yaml] = read_yaml(f'{conda_dir}/{yaml}') + + final_versions[tool.lower()] = { + 'name': tool, + 'version': versions[yaml][info['conda']['name']], + 'description': info['description'], + 'link': info['link'] + } + + print(f'name\tversion\tdescription\tlink') + for tool, cols in sorted(final_versions.items()): + print(f'{cols["name"]}\t{cols["version"]}\t{cols["description"]}\t{cols["link"]}') diff --git a/modules/variant_calling/call_variants_auto/bin/mask-consensus.py b/modules/variant_calling/call_variants_auto/bin/mask-consensus.py new file mode 100755 index 000000000..96658ee9a --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. 
+    --version     show program's version number and exit
+"""
+PROGRAM = "mask-consensus"
+VERSION = "1.6.0"
+import sys
+
+
+def read_coverage(coverage):
+    """Read the per-base coverage input."""
+    import re
+    accession = None
+    length = None
+    first_line = True
+    coverages = {}
+    with open(coverage, 'rt') as coverage_fh:
+        for line in coverage_fh:
+            line = line.rstrip()
+            if line.startswith('##'):
+                # ##contig=<ID=accession,length=length>
+                contig = re.search(r'contig=<ID=(.*),length=([0-9]+)>', line)
+                if contig:
+                    accession = contig.group(1)
+                    length = contig.group(2)
+                    coverages[accession] = {'length': int(length), 'positions': []}
+                else:
+                    print(f'{line} has an unexpected format.', file=sys.stderr)
+                    sys.exit(1)
+            else:
+                if line:
+                    coverages[accession]['positions'].append(int(line))
+
+    for accession, vals in coverages.items():
+        if len(vals['positions']) != vals['length']:
+            print(f'Observed bases ({len(vals["positions"])}) in {accession} is not the expected length ({vals["length"]}).', file=sys.stderr)
+            sys.exit(1)
+
+    return coverages
+
+
+def read_vcf(vcf):
+    """Get positions with a substitution."""
+    subs = {}
+    with open(vcf, 'rt') as vcf_fh:
+        for line in vcf_fh:
+            if not line.startswith("#"):
+                line = line.split('\t')
+                # 0 = accession, 1 = position
+                if line[0] not in subs:
+                    subs[line[0]] = {}
+                subs[line[0]][line[1]] = True
+    return subs
+
+
+def read_fasta(fasta):
+    """Parse the input FASTA file."""
+    from Bio import SeqIO
+    seqs = {}
+    with open(fasta, 'r') as fasta_fh:
+        for record in SeqIO.parse(fasta_fh, 'fasta'):
+            seqs[record.name] = str(record.seq)
+    return seqs
+
+
+def mask_sequence(sequence, coverages, subs, mincov):
+    """Mask positions with low or no coverage in the input FASTA."""
+    masked_seqs = {}
+
+    for accession, vals in coverages.items():
+        bases = []
+        coverage = vals['positions']
+        for i, cov in enumerate(coverage):
+            if cov >= mincov:
+                # Passes
+                if accession in subs:
+                    if str(i+1) in subs[accession]:
+                        # Substitution
+                        bases.append(sequence[accession][i].lower())
+                    else:
+                        # Same as reference
+                        bases.append(sequence[accession][i])
+                else:
+                    # No SNPs, Same as reference
+                    bases.append(sequence[accession][i])
+            elif cov:
+                # Low coverage
+                bases.append("N")
+            else:
+                # 0 coverage
+                bases.append('n')
+
+        if len(bases) != len(sequence[accession]):
+            print(f'Masked sequence ({len(bases)}) for {accession} is not the expected length ({len(sequence[accession])}).',
+                  file=sys.stderr)
+            sys.exit(1)
+        else:
+            masked_seqs[accession] = bases
+
+    return masked_seqs
+
+
+def format_header(sample, reference, accession, length):
+    """Return a newly formatted header."""
+    title = 'Pseudo-seq with called substitutions and low coverage masked'
+    return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]'
+
+
+def chunks(s, n):
+    """
+    Produce `n`-character chunks from `s`.
+    https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters
+    """
+    for start in range(0, len(s), n):
+        yield s[start:start+n]
+
+
+if __name__ == '__main__':
+    import argparse as ap
+    import sys
+
+    parser = ap.ArgumentParser(
+        prog=PROGRAM,
+        conflict_handler='resolve',
+        description=(
+            f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.'
+ ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk)) diff --git a/modules/variant_calling/call_variants_auto/bin/merge-blast-json.py b/modules/variant_calling/call_variants_auto/bin/merge-blast-json.py new file mode 100755 index 000000000..eab3f861e --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/merge-blast-json.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python3 +""" +""" +import json + +PROGRAM = "merge-blast-json" +VERSION = "1.5.5" + +def read_json(json_file): + json_data = None + with open(json_file, 'rt') as json_fh: + json_data = json.load(json_fh) + return json_data + +if __name__ == '__main__': + import argparse as ap + import os + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Merge the BLAST results of multi-FASTA runs' + ) + ) + + parser.add_argument( + 'blast', metavar="FILE", type=str, + help='Directory containing JSON files' + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + jsons = [f'{args.blast}/{f.name}' for f in os.scandir(args.blast) if f.name.endswith('.json')] + merged_json = None + for json_file in jsons: + json_data = read_json(json_file) + if merged_json: + # Bactopia uses parallel so only one fasta entry will ever be queried hence [0] + merged_json['BlastOutput2'].append(json_data['BlastOutput2'][0]) + else: + merged_json = json_data + + print(json.dumps(merged_json, indent=4)) diff --git a/modules/variant_calling/call_variants_auto/bin/mlst-blast.py b/modules/variant_calling/call_variants_auto/bin/mlst-blast.py new file mode 100755 index 000000000..4ee1984ef --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/mlst-blast.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python3 +""" +usage: mlst-blast.py [-h] [--cpu INT] [--quiet] [--compressed] + FASTA BLAST_DIR OUTPUT + +Determine MLST via BLAST + +positional arguments: + FASTA Input FASTA file to determine MLST + BLAST_DIR Directory where BLAST databases are stored + OUTPUT File to output results to + +optional arguments: + -h, --help show this help message and exit + --cpu INT Number of processors to use. + --quiet Do not output each command. + --compressed Input FASTA is Gzipped. 
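+
+example (file names below are illustrative only, not part of the original usage text):
+    mlst-blast.py assembly.fna.gz /path/to/mlst/blastdb mlst-blast.json --cpu 4 --compressed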
+""" +PROGRAM = "mlst-blast" +VERSION = "1.6.0" + +def pipe_command(cmd_1, cmd_2, stdout=False, stderr=False, verbose=True, + shell=False): + """ + Execute a single command and return STDOUT and STDERR. + + If stdout or stderr are given, output will be written to given file name. + """ + import subprocess + if verbose: + print('{0} | {1}'.format(' '.join(cmd_1), ' '.join(cmd_2))) + stdout = open(stdout, 'w') if stdout else subprocess.PIPE + stderr = open(stderr, 'w') if stderr else subprocess.PIPE + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=stdout, stderr=stderr) + p1.stdout.close() + return p2.communicate() + + +def blast_alleles(input_file, blast, blastn_results, num_cpu, + verbose=True, compressed=False): + """Blast assembled contigs against MLST blast database.""" + from collections import OrderedDict + import glob + import json + from os.path import basename, splitext + + outfmt = "6 sseqid bitscore slen length nident mismatch pident evalue" + results = {} + + profile = {} + with open(f'{blast}/profile.txt', 'r') as profile_fh: + for line in profile_fh: + cols = line.rstrip().split('\t') + if line.startswith('ST'): + col_names = cols + else: + ST = None + alleles = [] + for i, name in enumerate(col_names): + if name == 'ST': + st = cols[i] + elif name != 'clonal_complex': + alleles.append(f'{name}.{cols[i]}') + profile[';'.join(sorted(alleles))] = st + + perfect_matches = [] + total_loci = 0 + for tfa in sorted(glob.glob(f'{blast}/*.tfa')): + total_loci += 1 + blastdb = splitext(tfa)[0] + allele = basename(blastdb) + print(allele) + blastn = pipe_command( + ['zcat' if compressed else 'cat', input_file], + ['blastn', '-db', blastdb, '-query', '-', '-outfmt', outfmt, + '-max_target_seqs', '10000', '-num_threads', num_cpu, + '-evalue', '10000', '-ungapped', '-dust', 'no', + '-word_size', '28'], verbose=verbose + ) + print("finished") + max_bitscore = 0 + top_hits = [] + not_first = False + for hit in blastn[0].decode("utf-8").split('\n'): + if hit: + cols = hit.split('\t') + if len(cols) > 1: + if float(cols[1]) > max_bitscore and not_first: + max_bitscore = float(cols[1]) + + if cols[2] == cols[3] and cols[2] == cols[4]: + # perfect match + cols.append('perfect_match') + top_hits.append(cols) + break + else: + if float(cols[1]) == max_bitscore: + cols.append( + 'has_snps' if cols[2] == cols[3] else 'partial' + ) + top_hits.append(cols) + else: + break + + top_hit = [] + if not top_hits: + # Did not return a hit + top_hit = ['0'] * 10 + top_hit[0] = '{0}.0'.format(allele) + elif len(top_hits) == 1: + # Had only a single top hit + top_hit = top_hits[0] + top_hit.append(1) + else: + min_allele = 1000000 + for hit in top_hits: + allele_number = int(hit[0].split('.')[1]) + if allele_number < min_allele: + # Give priority to the earliest allele on record + min_allele = allele_number + top_hit = hit + top_hit.append(len(top_hits)) + + results[allele] = OrderedDict(( + ('sseqid', top_hit[0]), + ('bitscore', top_hit[1]), + ('slen', top_hit[2]), + ('length', top_hit[3]), + ('nident', top_hit[4]), + ('mismatch', top_hit[5]), + ('pident', top_hit[6]), + ('evalue', top_hit[7]), + ('match_type', top_hit[8]), + ('shared_bitscore', top_hit[9]) + )) + if top_hit[8] == 'perfect_match': + perfect_matches.append(top_hit[0]) + + results['ST'] = OrderedDict(( + ('st', 'ND'), ('perfect_matches', len(perfect_matches)) + )) + if len(perfect_matches) == total_loci: + pattern = ';'.join(sorted(perfect_matches)) + if pattern in profile: + 
results['ST']['st'] = profile[pattern] + else: + results['ST']['st'] = 'Novel' + + with open(blastn_results, 'w') as json_fh: + json.dump(results, json_fh, indent=4, separators=(',', ': ')) + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog='mlst-blast.py', + conflict_handler='resolve', + description=f'{PROGRAM} (v{VERSION}) - Determine MLST via BLAST' + ) + parser.add_argument('fasta', metavar="FASTA", type=str, + help='Input FASTA file to determine MLST') + parser.add_argument('blast', metavar="BLAST_DIR", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('output', metavar="OUTPUT", type=str, + help='File to output results to') + parser.add_argument('--cpu', metavar='INT', type=int, default=1, + help='Number of processors to use.') + parser.add_argument('--quiet', action='store_true', + help='Do not output each command.') + parser.add_argument('--compressed', action='store_true', + help='Input FASTA is Gzipped.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + blast_alleles(args.fasta, args.blast, args.output, str(args.cpu), + verbose=not args.quiet, compressed=args.compressed) diff --git a/modules/variant_calling/call_variants_auto/bin/select-references.py b/modules/variant_calling/call_variants_auto/bin/select-references.py new file mode 100755 index 000000000..68af0d22b --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/select-references.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python3 +""" +""" +PROGRAM = "select-references" +VERSION = "1.6.0" + + +def use_eutils(accession): + from Bio import Entrez + import time + import json + Entrez.email = "robert.petit@emory.edu" + Entrez.tool = "BactopiaSelectReferences" + accession = accession.split('.')[0] + handle = Entrez.esearch(db="assembly", term=accession, retmax="500") + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"])) + record = Entrez.read(handle, validate=False) + time.sleep(1) # Be kind to NCBI + + records = [] + excluded = set() + for assembly in record['DocumentSummarySet']["DocumentSummary"]: + if assembly["ExclFromRefSeq"]: + # PGAP can cause some Assemblies to eventually become excluded from RefSeq + # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/ + for reason in assembly["ExclFromRefSeq"]: + excluded.add(reason) + else: + records.append(assembly["AssemblyAccession"]) + + if excluded: + return [','.join(list(excluded)), True] + else: + return [sorted(records, reverse=True)[0], False] + + +def use_http(accession): + """ + Use urllib to get a link. 
+ Example GCF_001548295: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/548/295/ + + Need to extract "GCF_001548295.1_ASM154829v1/" + """ + import re + import requests + accession, version = accession.split('.') + db, digits = accession.split("_") + digits_split = '/'.join(re.findall('.{1,3}', digits)) + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{digits_split}' + + r = requests.get(url) + current_accession = [] + if r.status_code == 200: + # Success + links = re.findall("href=[\"\'](.*?)[\"\']", r.text) + for link in links: + if link.startswith(accession): + t_db, t_version, t_extra = link.split("_", 2) + current_accession.append(f"{t_db}_{t_version}") + + if len(current_accession) == 1: + return [current_accession[0], False, None, None] + else: + if not len(current_accession): + return [current_accession, False, True, "Unable to parse and accession"] + else: + return [sorted(current_accession, reverse=True)[0], False, None, None] + + else: + return [accession, True, False, f"Accession does not exist at {url}, status code {r.status_code}"] + + +def check_assembly_version(accession): + try: + return use_eutils(accession) + except Exception as e: + if e.msg == "Bad Gateway": + print("NCBI servers are down, trying fallback.", file=sys.stderr) + current_accession, excluded, has_error, reason = use_http(accession) + if has_error: + print(f"Fallback failed. Reason: {reason}", file=sys.stderr) + sys.exit(42) + else: + return [current_accession, excluded] + else: + sys.exit(1) + + +if __name__ == '__main__': + import argparse as ap + from collections import defaultdict + import random + import sys + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Select references based on Mash distance' + ) + ) + + parser.add_argument( + 'mash', metavar="FILE", type=str, + help='Text file of Mash distances.' + ) + parser.add_argument( + 'total', metavar="INT", type=int, + help='Total number of references to download.' + ) + parser.add_argument( + '--random_tie_break', action='store_true', + help=( + 'Select random random genome on matching Mash distances. ' + '(Default: Earliest accession' + ) + ) + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + mash_distances = defaultdict(list) + with open(args.mash, 'rt') as mash_fh: + for line in mash_fh: + reference, distance = line.rstrip().split('\t') + mash_distances[distance].append(reference) + + remaining = args.total + for distance, references in sorted(mash_distances.items()): + if args.random_tie_break: + random.shuffle(references) + else: + references = sorted(references) + + for reference in references: + if reference: + print(use_http(reference)) + current_accession, excluded = check_assembly_version(reference) + if excluded: + print( + f'Skipping {reference}, it no longer in RefSeq. Reason: {current_accession}', + file=sys.stderr + ) + else: + difference = False if reference == current_accession else True + print(f'{reference}\t{distance}\t{current_accession}\t{difference}') + remaining -= 1 + if not remaining: + break + + if not remaining: + break diff --git a/modules/variant_calling/call_variants_auto/bin/split-coverages.py b/modules/variant_calling/call_variants_auto/bin/split-coverages.py new file mode 100755 index 000000000..a06a065f0 --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/split-coverages.py @@ -0,0 +1,69 @@ +#! 
/usr/bin/env python3
+"""
+Split a genomeCoverageBed per-base coverage file into one file per original FASTA input.
+"""
+PROGRAM = "split-coverages"
+VERSION = "1.6.0"
+
+if __name__ == '__main__':
+    import argparse as ap
+    import os
+    import sys
+    parser = ap.ArgumentParser(
+        prog=PROGRAM,
+        conflict_handler='resolve',
+        description=(
+            f'{PROGRAM} (v{VERSION}) - Split a genomeCoverageBed output into separate files based on FASTA entry'
+        )
+    )
+
+    parser.add_argument(
+        'mapping', metavar="FILE", type=str,
+        help='Tab-delimited file used to map entry names to original fasta file.'
+    )
+    parser.add_argument(
+        'coverage', metavar="FILE", type=str,
+        help='genomeCoverageBed output file'
+    )
+    parser.add_argument(
+        '--outdir', metavar="STR", type=str, default='coverages',
+        help='Directory to output split coverages into. (Default: coverages)'
+    )
+    parser.add_argument('--version', action='version',
+                        version=f'{PROGRAM} {VERSION}')
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(0)
+
+    args = parser.parse_args()
+
+    mappings = {}
+    with open(args.mapping, 'rt') as mapping_fh:
+        for line in mapping_fh:
+            fasta, entry = line.rstrip().split('\t')
+            mappings[entry] = fasta
+
+    coverages = {}
+    with open(args.coverage, 'rt') as coverage_fh:
+        for line in coverage_fh:
+            entry, position, depth = line.rstrip().split('\t')
+            if mappings[entry] not in coverages:
+                coverages[mappings[entry]] = {}
+
+            if entry not in coverages[mappings[entry]]:
+                coverages[mappings[entry]][entry] = []
+
+            coverages[mappings[entry]][entry].append(depth)
+
+    if not os.path.exists(args.outdir):
+        os.makedirs(args.outdir)
+
+    for fasta in coverages:
+        with open(f'{args.outdir}/{fasta}-coverage.txt', 'wt') as coverage_out:
+            total_entries = len(coverages[fasta])
+            coverage_out.write(f'##total={total_entries}\n')
+            for entry, depths in coverages[fasta].items():
+                coverage_out.write(f'##contig=<ID={entry},length={len(depths)}>\n')
+                for depth in depths:
+                    coverage_out.write(f'{depth}\n')
+
\ No newline at end of file
diff --git a/modules/variant_calling/call_variants_auto/bin/update-conda.sh b/modules/variant_calling/call_variants_auto/bin/update-conda.sh
new file mode 100755
index 000000000..5ef7f31c4
--- /dev/null
+++ b/modules/variant_calling/call_variants_auto/bin/update-conda.sh
@@ -0,0 +1,67 @@
+#! /bin/bash
+# Updates the conda environment yamls to bump to latest software versions.
+set -x
+set -e
+if [[ $# == 0 ]]; then
+    echo ""
+    echo "update-conda.sh BACTOPIA_DIRECTORY VERSION IS_MAC"
+    echo ""
+    echo "Example Command"
+    echo "update-conda.sh /home/bactopia/bactopia 1.0.0"
+    echo ""
+    exit
+fi
+
+
+CONDA_DIR=$1/conda
+DOCKER_DIR=$1/containers
+VERSION=$2
+IS_MAC=0
+if [ "$3" == "1" ]; then
+    echo "Creating Mac OS X yamls"
+    CONDA_DIR="${CONDA_DIR}/mac"
+    IS_MAC=1
+else
+    echo "Creating Linux yamls"
+    CONDA_DIR="${CONDA_DIR}/linux"
+fi
+
+function update_environment {
+    # 1: template, 2: programs, 3: conda dir, 4: docker dir, 5: version, 6: is_mac
+    echo "Working on ${1}"
+
+    if [ "$6" == 1 ]; then
+        # Mac OS
+        # Have to replace Mac versions of some programs (date, sed, etc...)
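+        # (Both branches export the solved env to <name>.yml and record its md5; on Linux
+        #  the md5 is also stamped into the matching Dockerfile's "LABEL conda.md5" below,
+        #  presumably so stale container builds can be detected.)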
+ conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5 -r ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} > ${3}/${1}.yml + md5sum ${3}/${1}.yml | cut -d " " -f 1 > ${3}/${1}.md5 + head -n 1 ${3}/${1}.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${4}/${1}.Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +update_environment "annotate_genome" "prokka pigz tbl2asn-forever" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "antimicrobial_resistance" "ncbi-amrfinderplus" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "ariba_analysis" "ariba bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assemble_genome" "shovill-se assembly-scan unicycler pigz bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "assembly_qc" "checkm-genome quast pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +if [ "${IS_MAC}" == "1" ]; then + update_environment "call_variants" "snippy vcf-annotator pigz vt" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +else + update_environment "call_variants" "snippy vcf-annotator pigz vt=2015.11.10=he941832_3" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +fi +update_environment "count_31mers" "mccortex" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "download_references" "ncbi-genome-download mash biopython python>3.6 rename" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "gather_fastqs" "art rename ncbi-genome-download fastq-dl biopython" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "minmers" "mash sourmash" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "qc_reads" "bbmap fastqc fastq-scan lighter pigz" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} +update_environment "sequence_type" "ariba blast bowtie2=2.3.5.1" ${CONDA_DIR} ${DOCKER_DIR} ${VERSION} ${IS_MAC} + +echo "Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/variant_calling/call_variants_auto/bin/update-docker.sh b/modules/variant_calling/call_variants_auto/bin/update-docker.sh new file mode 100755 index 000000000..2695ce5f7 --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/update-docker.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# update-docker +# +# Automate the building of Bactopia related Docker containers +set -e +BACTOPIA_DIR=${1:-"./"} +REPOSITORY=${2:-""} +PRUNE=${3:-"0"} +VERSION=1.6.0 +CONTAINER_VERSION="${VERSION%.*}.x" + +function docker_build { + recipe=$1 + image=$2 + latest=${3:-0} + + echo "Working on ${image}" + docker build --rm -t ${image} -f ${recipe} . 
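+    # (Assumptions, not stated in this script: the build context "." implies it is run
+    #  from the repository root, and the "docker push" calls below require a prior
+    #  "docker login" for each target registry.)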
+ + # Push to DockerHub + echo "Pushing ${image}" + docker push ${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${latest}" + docker tag ${image} ${latest} + docker push ${latest} + fi + + # Push to optional repos + for repo in ${REPOSITORY}; do + echo "Pushing ${repo}/${image}" + docker tag ${image} ${repo}/${image} + docker push ${repo}/${image} + + if [[ "${latest}" != "0" ]]; then + echo "Pushing ${repo}/${latest}" + docker tag ${image} ${repo}/${latest} + docker push ${repo}/${latest} + fi + done + + if [[ "${PRUNE}" == "1" ]]; then + echo "Pruning Docker Cache" + docker image prune -a -f + df -h + fi +} + +# Build Bactopia Container +docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest + +# Build Process Containers +for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do + recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}" + recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//') + recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}" + conda_yaml="${BACTOPIA_DIR}/conda/linux/${recipe}.md5" + docker_build ${recipe_path} ${recipe_image} +done + +# Build Bactopia Tools containers +for tool in $(ls "${BACTOPIA_DIR}/tools"); do + recipe_path="${BACTOPIA_DIR}/tools/${tool}" + if [ -f "${BACTOPIA_DIR}/tools/${tool}/environment-linux.yml" ]; then + docker_file="${recipe_path}/Dockerfile" + docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}" + docker_build ${docker_file} ${docker_image} + fi +done diff --git a/modules/variant_calling/call_variants_auto/bin/update-tools.sh b/modules/variant_calling/call_variants_auto/bin/update-tools.sh new file mode 100755 index 000000000..75bec7fa2 --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/update-tools.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# Updates the conda environment yamls for Bactopia Tools to bump to latest software versions. + +if [[ $# == 0 ]]; then + echo "" + echo "update-tools.sh BACTOPIA_DIRECTORY VERSION IS_MAC" + echo "" + echo "Example Command" + echo "update-tools.sh /home/bactopia/bactopia 1.0.0" + echo "" + exit +fi +CONDA_DIR="${1}/tools" +VERSION=$2 +IS_MAC=0 +if [ "$3" == "1" ]; then + echo "Creating Mac OS X yamls" + IS_MAC=1 +fi + +function update_environment { + # 1: template, 2: programs, 3: conda dir, 4: version, 5: is_mac + echo "Working on ${1}" + + YAML="${3}/${1}/environment" + if [ "$5" == 1 ]; then + # Mac OS + # Have to replace Mac versions of some programs (date, sed, etc...) 
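+        # ("${6}" below looks like a slot for optional extra "conda create" arguments;
+        #  none of the update_environment calls in this script pass a sixth argument,
+        #  so it normally expands to nothing.)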
+ conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} coreutils sed + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -E 's=channels:=version: '"${4}"'\'$'\nchannels:=' > ${YAML}-osx.yml + md5 -r ${YAML}-osx.yml | cut -d " " -f 1 > ${YAML}-osx.md5 + else + # Linux + conda create --quiet -y -n bactopia-${1} ${6} -c conda-forge -c bioconda ${2} + conda env export --no-builds -n bactopia-${1} | \ + grep -v "^prefix:" | \ + sed -r 's=channels:=version: '"${4}"'\nchannels:=' > ${YAML}-linux.yml + md5sum ${YAML}-linux.yml | cut -d " " -f 1 > ${YAML}-linux.md5 + head -n 1 ${YAML}-linux.md5 | xargs -I {} sed -i -E 's/(LABEL conda.md5=")(.*)(")/\1{}\3/' ${3}/${1}/Dockerfile + fi + + conda env remove -n bactopia-${1} +} + +# Bactopia environments +update_environment "eggnog" "eggnog-mapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "fastani" "fastani ncbi-genome-download rename sed" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "gtdb" "gtdbtk" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "ismapper" "ismapper" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "mashtree" "mashtree ncbi-genome-download rename" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "phyloflash" "phyloflash mafft iqtree pigz" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "pirate" "bioconductor-ggtree clonalframeml iqtree maskrc-svg ncbi-genome-download pigz pirate prokka r-dplyr r-ggplot2 r-gridextra r-phangorn rename snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "roary" "clonalframeml iqtree maskrc-svg ncbi-genome-download pigz prokka r-ggplot2 rename roary snp-dists tbl2asn-forever" ${CONDA_DIR} ${VERSION} ${IS_MAC} +update_environment "summary" "executor jinja2" ${CONDA_DIR} ${VERSION} ${IS_MAC} + +echo "Conda Last updated: " `date` > ${CONDA_DIR}/README.md diff --git a/modules/variant_calling/call_variants_auto/bin/update-version.sh b/modules/variant_calling/call_variants_auto/bin/update-version.sh new file mode 100755 index 000000000..7571755a1 --- /dev/null +++ b/modules/variant_calling/call_variants_auto/bin/update-version.sh @@ -0,0 +1,89 @@ +#! /bin/bash +# Updates the version numbers across the Bactopia project. +# If no user input, print usage + +function generic_update { + ${1} -r 's/'"${2}"'/'"${3}"'/' ${4} +} + +function python_update { + ${1} -r 's/VERSION = "'"${2}"'"/VERSION = "'"${3}"'"/' ${4} +} + +function conda_update { + ${1} -r 's=version: '"${2}"'$=version: '"${3}"'=' ${4} +} + +function shell_update { + ${1} 's/VERSION='"${2}"'/VERSION='"${3}"'/' ${4} +} + +if [[ $# == 0 ]]; then + echo "" + echo "update-version.sh BACTOPIA_DIRECTORY OLD_VERSION NEW_VERSION" + echo "" + echo "Example Command" + echo "update-version.sh /home/bactopia/bactopia 1.0.0 1.0.1" + echo "" + exit +fi + + +DIRECTORY=$1 +OLD_VERSION=$2 +NEW_VERSION=$3 +OLD_CONTAINER="${OLD_VERSION%.*}.x" +NEW_CONTAINER="${NEW_VERSION%.*}.x" + +if [ -z ${DIRECTORY} ] || [ -z ${OLD_VERSION} ] || [ -z ${NEW_VERSION} ]; then + echo "Got ${#} arguement" + echo "Must give a directory, old version and new version" + exit 1 +fi + +SED_CMD="echo sed -i" +if [ "$4" == "1" ]; then + echo "In-Place edits ENABLED" + SED_CMD="sed -i" +else + echo "In-Place edits DISABLED (e.g. no changes will be made)" +fi + +# Test $DIRECTORY points to bactopia repo +/bin/bash ${DIRECTORY}/bactopia 1> /dev/null 2> /dev/null + +if [ $? 
-eq 0 ]; then + IGNORE=${DIRECTORY}/data/version-ignore.txt + EXCLUDE=${DIRECTORY}/data/version-excludes.txt + for file in $(find -type f | grep -v -f ${IGNORE} | xargs -I {} grep -i -H "version" {} | grep -v -f ${EXCLUDE} | cut -d ":" -f 1 | sort | uniq); do + if [[ "${file}" == *"bactopia" ]]; then + # bactopia + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".version" ]]; then + # Conda + conda_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Dockerfile" ]]; then + # Docker + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *"nextflow.config" ]]; then + # Nextflow Config + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + generic_update "${SED_CMD}" ${OLD_CONTAINER} ${NEW_CONTAINER} ${file} + elif [[ "${file}" == *"Singularity" ]]; then + # Singularity + generic_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".py" ]]; then + # Python + python_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + elif [[ "${file}" == *".sh" ]]; then + # Shell + shell_update "${SED_CMD}" ${OLD_VERSION} ${NEW_VERSION} ${file} + else + echo "Unknown: ${file}" + fi + done +else + echo "Unable to execute '${DIRECTORY}/bactopia" + echo "Please verify '${DIRECTORY}' points to the bactopia repo." + exit 1 +fi diff --git a/modules/variant_calling/call_variants_auto/call_variants_auto.nf b/modules/variant_calling/call_variants_auto/call_variants_auto.nf new file mode 100644 index 000000000..a775bf5bc --- /dev/null +++ b/modules/variant_calling/call_variants_auto/call_variants_auto.nf @@ -0,0 +1,52 @@ +nextflow.enable.dsl = 2 + +process CALL_VARIANTS_AUTO { + /* + Identify variants (SNPs/InDels) against one or more reference genomes selected based + on their Mash distance from the input. + */ + tag "${sample} - ${reference_name}" + + publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*" + publishDir "${outdir}/${sample}/variants/auto", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${reference_name}/*" + + input: + tuple val(sample), val(single_end), path(fq), path(reference) + + output: + path "${reference_name}/*" + path "${task.process}/*" optional true + + shell: + snippy_ram = task.memory.toString().split(' ')[0] + reference_name = reference.getSimpleName().split("${sample}-")[1].split(/\./)[0] + fastq = single_end ? "--se ${fq[0]}" : "--R1 ${fq[0]} --R2 ${fq[1]}" + bwaopt = params.bwaopt ? "--bwaopt 'params.bwaopt'" : "" + fbopt = params.fbopt ? "--fbopt 'params.fbopt'" : "" + template "call_variants_auto.sh" + + stub: + reference_name = "ref_name" + """ + echo True + mkdir ${reference_name} + mkdir ${task.process} + touch ${reference_name}/* + touch ${task.process}/* + """ +} + + +//############### +//Module testing +//############### + +workflow test { + TEST_PARAMS_CH = Channel.of([ + params.sample, + params.single_end, + path(params.fq), + path(params.reference) + ]) + call_variants_auto(TEST_PARAMS_CH) +} diff --git a/modules/variant_calling/call_variants_auto/nextflow.config b/modules/variant_calling/call_variants_auto/nextflow.config new file mode 100644 index 000000000..a27358adf --- /dev/null +++ b/modules/variant_calling/call_variants_auto/nextflow.config @@ -0,0 +1,49 @@ +manifest { + author = 'Robert A. 
Petit III' + name = 'bactopia' + homePage = 'https://github.com/bactopia/bactopia' + description = 'An extensive workflow for processing Illumina sequencing of bacterial genomes.' + mainScript = 'main.nf' + version = '1.6.0' + nextflowVersion = '>=19' +} + + +profiles { + + conda { + process { + withName: call_variants_auto { + conda = "${baseDir}/../../../conda/envs/call_variants-1.7.x"} + } + } + + docker { + process { + withName: call_variants_auto { + container = "ghcr.io/bactopia/call_variants:1.6.0"} + + } + } + + test { + + process { + withName: call_variants_auto { + cpus = 2 + memory = "10 GB" + queue = 'long' + } + + } + env { + VERSION = "1.6.0" + outdir = "test_output" + sample = "SRR2838702" + final_sample_type = "paired-end" + single_end = false + run_type = "fastqs" + } + + } +} diff --git a/modules/variant_calling/call_variants_auto/templates/call_variants_auto.sh b/modules/variant_calling/call_variants_auto/templates/call_variants_auto.sh new file mode 100644 index 000000000..3309e5302 --- /dev/null +++ b/modules/variant_calling/call_variants_auto/templates/call_variants_auto.sh @@ -0,0 +1,77 @@ +#!/bin/bash +set -e +set -u +LOG_DIR="!{task.process}" +mkdir -p ${LOG_DIR} +echo "# Timestamp" > ${LOG_DIR}/!{task.process}.versions +date --iso-8601=seconds >> ${LOG_DIR}/!{task.process}.versions +echo "# Snippy Version" >> ${LOG_DIR}/!{task.process}.versions +snippy --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 + +# Print captured STDERR incase of exit +function print_stderr { + cat .command.err 1>&2 + ls ${LOG_DIR}/ | grep ".err" | xargs -I {} cat ${LOG_DIR}/{} 1>&2 +} +trap print_stderr EXIT + +# Verify AWS files were staged +if [[ ! -L "!{fq[0]}" ]]; then + if [ "!{single_end}" == "true" ]; then + check-staging.py --fq1 !{fq[0]} --extra !{reference} --is_single + else + check-staging.py --fq1 !{fq[0]} --fq2 !{fq[1]} --extra !{reference} + fi +fi + +snippy !{fastq} \ + --ref !{reference} \ + --cpus !{task.cpus} \ + --ram !{snippy_ram} \ + --outdir !{reference_name} \ + --prefix !{sample} \ + --mapqual !{params.mapqual} \ + --basequal !{params.basequal} \ + --mincov !{params.mincov} \ + --minfrac !{params.minfrac} \ + --minqual !{params.minqual} \ + --maxsoft !{params.maxsoft} !{bwaopt} !{fbopt} > ${LOG_DIR}/snippy.out 2> ${LOG_DIR}/snippy.err + +# Add GenBank annotations +echo "# vcf-annotator Version" >> ${LOG_DIR}/!{task.process}.versions +vcf-annotator --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 +vcf-annotator !{reference_name}/!{sample}.vcf !{reference} > !{reference_name}/!{sample}.annotated.vcf 2> ${LOG_DIR}/vcf-annotator.err + +# Get per-base coverage +echo "# bedtools Version" >> ${LOG_DIR}/!{task.process}.versions +bedtools --version >> ${LOG_DIR}/!{task.process}.versions 2>&1 +grep "^##contig" !{reference_name}/!{sample}.vcf > !{reference_name}/!{sample}.full-coverage.txt +genomeCoverageBed -ibam !{reference_name}/!{sample}.bam -d >> !{reference_name}/!{sample}.full-coverage.txt 2> ${LOG_DIR}/genomeCoverageBed.err +cleanup-coverage.py !{reference_name}/!{sample}.full-coverage.txt > !{reference_name}/!{sample}.coverage.txt +rm !{reference_name}/!{sample}.full-coverage.txt + +echo "here 6" +# Mask low coverage regions +mask-consensus.py !{sample} !{reference_name} \ + !{reference_name}/!{sample}.consensus.subs.fa \ + !{reference_name}/!{sample}.subs.vcf \ + !{reference_name}/!{sample}.coverage.txt \ + --mincov !{params.mincov} +echo "here 7" +# Clean Up +rm -rf !{reference_name}/reference !{reference_name}/ref.fa* !{reference_name}/!{sample}.vcf.gz* 
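+# (The line above drops snippy's working copy of the reference, its indexes and the
+#  bgzipped VCF, presumably to keep the published directory small; whatever remains
+#  under !{reference_name}/ is what the process publishes.)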
+echo "here 8" +if [[ !{params.compress} == "true" ]]; then + find !{reference_name}/ -type f -not -name "*.bam*" -and -not -name "*.log*" -and -not -name "*.txt*" | \ + xargs -I {} pigz -n --best -p !{task.cpus} {} + pigz -n --best -p !{task.cpus} !{reference_name}/!{sample}.coverage.txt +fi + +if [ "!{params.skip_logs}" == "false" ]; then + cp .command.err ${LOG_DIR}/!{task.process}.err + cp .command.out ${LOG_DIR}/!{task.process}.out + cp .command.sh ${LOG_DIR}/!{task.process}.sh || : + cp .command.trace ${LOG_DIR}/!{task.process}.trace || : +else + rm -rf ${LOG_DIR}/ +fi diff --git a/modules/variant_calling/call_variants_auto/test_params.yaml b/modules/variant_calling/call_variants_auto/test_params.yaml new file mode 100644 index 000000000..0ff3e080f --- /dev/null +++ b/modules/variant_calling/call_variants_auto/test_params.yaml @@ -0,0 +1,56 @@ +outdir: + "test_output" + +sample: + "SRR2838702" + +single_end: + false + +fq: + "test_data/SRR2838702_R{1,2}.fastq.gz" + +reference: + "test_data/SRR2838702-GCF_000009005.gbk" + +publish_mode: + "copy" + +run_type: + "fastqs" + +version: + "1.6.0" + +overwrite: + false + +mapqual: + 60 + +basequal: + 13 + +mincov: + 10 + +minfrac: + 0 + +minqual: + 100 + +maxsoft: + 10 + +bwaopt: + null + +fbopt: + null + +compress: + false + +skip_logs: + false diff --git a/nextflow.config b/nextflow.config index d7c030f72..052f976c4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -78,7 +78,8 @@ def check_max(obj, max, type) { if (obj == 'request') { return max } else { - return Math.min(obj, max) + //return Math.min(obj, max) <- Error found + return 2 } } catch (all) { println "ERROR - Max cpus '${Math.min(obj, max)}' is not valid! Using default value: ${max}"
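+        // Note: Math.min() needs numeric arguments, so the original call can throw if
+        // obj ever arrives as a quoted string from a config profile -- an assumption
+        // about the "Error found" remark above, not something stated in this patch.
+        // A defensive sketch would coerce before comparing, e.g.:
+        //     return Math.min(obj as int, max as int)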