diff --git a/CITATIONS.md b/CITATIONS.md index 4f03aaa..5c5643e 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,13 +10,51 @@ ## Pipeline tools -- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +- [OMA](https://omabrowser.org) - > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. +> Adrian M Altenhoff, Clément-Marie Train, Kimberly J Gilbert, Ishita Mediratta, Tarcisio Mendes de Farias, David Moi, Yannis Nevers, Hale-Seda Radoykova, Victor Rossier, Alex Warwick Vesztrocy, Natasha M Glover, Christophe Dessimoz, OMA orthology in 2021: website overhaul, conserved isoforms, ancestral gene order and more, Nucleic Acids Research, Volume 49, Issue D1, 8 January 2021, Pages D373–D379, https://doi.org/10.1093/nar/gkaa1007 -- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) +- [PANTHER](https://pantherdb.org) - > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. +> Thomas PD, Ebert D, Muruganujan A, Mushayahama T, Albou L-P, Mi H. PANTHER: Making genome-scale phylogenetics accessible to all. Protein Science. 2022; 31: 8–22. https://doi.org/10.1002/pro.4218 + +- [OrthoInspector](https://lbgi.fr/orthoinspector) + +> Yannis Nevers, Arnaud Kress, Audrey Defosset, Raymond Ripp, Benjamin Linard, Julie D Thompson, Olivier Poch, Odile Lecompte, OrthoInspector 3.0: open portal for comparative genomics, Nucleic Acids Research, Volume 47, Issue D1, 08 January 2019, Pages D411–D418, https://doi.org/10.1093/nar/gky1068 + +- [EggNOG](https://eggnog5.embl.de) + +> Jaime Huerta-Cepas, Damian Szklarczyk, Davide Heller, Ana Hernández-Plaza, Sofia K Forslund, Helen Cook, Daniel R Mende, Ivica Letunic, Thomas Rattei, Lars J Jensen, Christian von Mering, Peer Bork, eggNOG 5.0: a hierarchical, functionally and phylogenetically annotated orthology resource based on 5090 organisms and 2502 viruses, Nucleic Acids Research, Volume 47, Issue D1, 08 January 2019, Pages D309–D314, https://doi.org/10.1093/nar/gky1085 + +- [UniProt](https://uniprot.org) + +> The UniProt Consortium, UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, https://doi.org/10.1093/nar/gkac1052 + +- [UniProt ID Mapping](https://uniprot.org/id-mapping) + +> Huang H, McGarvey PB, Suzek BE, Mazumder R, Zhang J, Chen Y, Wu CH. A comprehensive protein-centric ID mapping service for molecular data integration. Bioinformatics. 2011 Apr 15;27(8):1190-1. doi: 10.1093/bioinformatics/btr101. PMID: 21478197; PMCID: PMC3072559. + +- [AlphaFold](https://deepmind.google/technologies/alphafold) + +> Jumper, J., Evans, R., Pritzel, A. et al. Highly accurate protein structure prediction with AlphaFold. Nature 596, 583–589 (2021).
https://doi.org/10.1038/s41586-021-03819-2 + +- [AlphaFold Database](https://alphafold.ebi.ac.uk) + +> Mihaly Varadi, Stephen Anyango, Mandar Deshpande, Sreenath Nair, Cindy Natassia, Galabina Yordanova, David Yuan, Oana Stroe, Gemma Wood, Agata Laydon, Augustin Žídek, Tim Green, Kathryn Tunyasuvunakool, Stig Petersen, John Jumper, Ellen Clancy, Richard Green, Ankur Vora, Mira Lutfi, Michael Figurnov, Andrew Cowie, Nicole Hobbs, Pushmeet Kohli, Gerard Kleywegt, Ewan Birney, Demis Hassabis, Sameer Velankar, AlphaFold Protein Structure Database: massively expanding the structural coverage of protein-sequence space with high-accuracy models, Nucleic Acids Research, Volume 50, Issue D1, 7 January 2022, Pages D439–D444, https://doi.org/10.1093/nar/gkab1061 + +- [T-COFFEE](https://tcoffee.org) + +> Notredame C, Higgins DG, Heringa J. T-Coffee: A novel method for fast and accurate multiple sequence alignment. J Mol Biol. 2000 Sep 8;302(1):205-17. doi: 10.1006/jmbi.2000.4042. PMID: 10964570. + +- [IQTREE](https://iqtree.org) + +> B.Q. Minh, H.A. Schmidt, O. Chernomor, D. Schrempf, M.D. Woodhams, A. von Haeseler, R. Lanfear (2020) IQ-TREE 2: New models and efficient methods for phylogenetic inference in the genomic era. Mol. Biol. Evol., 37:1530-1534. https://doi.org/10.1093/molbev/msaa015 + +> D.T. Hoang, O. Chernomor, A. von Haeseler, B.Q. Minh, L.S. Vinh (2018) UFBoot2: Improving the ultrafast bootstrap approximation. Mol. Biol. Evol., 35:518–522. https://doi.org/10.1093/molbev/msx281 + +- [FastME](https://atgc-montpellier.fr/fastme/) + +> Vincent Lefort, Richard Desper, Olivier Gascuel, FastME 2.0: A Comprehensive, Accurate, and Fast Distance-Based Phylogeny Inference Program, Molecular Biology and Evolution, Volume 32, Issue 10, October 2015, Pages 2798–2800, https://doi.org/10.1093/molbev/msv150 ## Software packaging/containerisation tools diff --git a/README.md b/README.md index e7607ea..0186d5b 100644 --- a/README.md +++ b/README.md @@ -27,8 +27,6 @@ ![nf-core-reportho tube map](docs/images/reportho_tube_map.svg?raw=true "nf-core-reportho tube map") - - 1. **Obtain Query Information**: (depends on provided input) identification of Uniprot ID and taxon ID for the query or its closest homolog. 2. **Fetch Orthologs**: fetching of ortholog predictions from public databases, either through API or from local snapshot. 3. **Compare and Assemble**: calculation of agreement statistics, creation of ortholog lists, selection of the consensus list. @@ -66,8 +64,6 @@ If using the latter format, you must set `--uniprot_query` to true. Now, you can run the pipeline using: - - ```bash nextflow run nf-core/reportho \ -profile \ @@ -89,15 +85,13 @@ For more details about the output files and reports, please refer to the ## Credits -nf-core/reportho was originally written by itrujnara. +nf-core/reportho was originally written by Igor Trujnara (@itrujnara). We thank the following people for their extensive assistance in the development of this pipeline: -@lsantus - -@avignoli - -@JoseEspinosa +- Luisa Santus (@lsantus) +- Alessio Vignoli (@avignoli) +- Jose Espinosa-Carrasco (@JoseEspinosa) ## Contributions and Support @@ -110,8 +104,6 @@ For further information or help, don't hesitate to get in touch on the [Slack `# - - An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. 
You can cite the `nf-core` publication as follows: diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab..2b40ea6 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,2 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +id,query +BicD2,Q8TD16 diff --git a/bin/clustal2fasta.py b/bin/clustal2fasta.py index 8f3de57..2ccad47 100755 --- a/bin/clustal2fasta.py +++ b/bin/clustal2fasta.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys from Bio import SeqIO diff --git a/bin/clustal2phylip.py b/bin/clustal2phylip.py index 186fcd0..246b11a 100755 --- a/bin/clustal2phylip.py +++ b/bin/clustal2phylip.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys from Bio import SeqIO diff --git a/bin/csv_adorn.py b/bin/csv_adorn.py index b7801ba..2052082 100755 --- a/bin/csv_adorn.py +++ b/bin/csv_adorn.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys diff --git a/bin/ensembl2uniprot.py b/bin/ensembl2uniprot.py index 9097c82..2483dca 100644 --- a/bin/ensembl2uniprot.py +++ b/bin/ensembl2uniprot.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys import requests diff --git a/bin/fetch_afdb_structures.py b/bin/fetch_afdb_structures.py index e57d1b3..c13a6a6 100755 --- a/bin/fetch_afdb_structures.py +++ b/bin/fetch_afdb_structures.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys import requests diff --git a/bin/fetch_inspector_group.py b/bin/fetch_inspector_group.py index e462413..211c08a 100755 --- a/bin/fetch_inspector_group.py +++ b/bin/fetch_inspector_group.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys import requests diff --git a/bin/fetch_oma_by_sequence.py b/bin/fetch_oma_by_sequence.py index 636e6fc..eeab2ba 100755 --- a/bin/fetch_oma_by_sequence.py +++ b/bin/fetch_oma_by_sequence.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys from warnings import warn diff --git a/bin/fetch_oma_group.py b/bin/fetch_oma_group.py index 168924f..11e5cd2 100755 --- a/bin/fetch_oma_group.py +++ b/bin/fetch_oma_group.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys import requests diff --git a/bin/fetch_oma_groupid.py b/bin/fetch_oma_groupid.py index 7beafbd..b61898f 100755 --- a/bin/fetch_oma_groupid.py +++ b/bin/fetch_oma_groupid.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys from utils import fetch_seq diff --git a/bin/fetch_oma_taxid_by_id.py 
b/bin/fetch_oma_taxid_by_id.py index 83ef185..18f3286 100755 --- a/bin/fetch_oma_taxid_by_id.py +++ b/bin/fetch_oma_taxid_by_id.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys from utils import fetch_seq diff --git a/bin/fetch_panther_group.py b/bin/fetch_panther_group.py index c07034a..4d81b2e 100755 --- a/bin/fetch_panther_group.py +++ b/bin/fetch_panther_group.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys import requests diff --git a/bin/fetch_sequences.py b/bin/fetch_sequences.py index e392024..8f5a11c 100755 --- a/bin/fetch_sequences.py +++ b/bin/fetch_sequences.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys import requests diff --git a/bin/filter_fasta.py b/bin/filter_fasta.py index 6840885..b6348ca 100755 --- a/bin/filter_fasta.py +++ b/bin/filter_fasta.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys from Bio import SeqIO diff --git a/bin/get_oma_version.py b/bin/get_oma_version.py index d0d70f8..d75619b 100755 --- a/bin/get_oma_version.py +++ b/bin/get_oma_version.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import requests diff --git a/bin/make_score_table.py b/bin/make_score_table.py index 68efe87..ccea2df 100755 --- a/bin/make_score_table.py +++ b/bin/make_score_table.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import csv import re import sys diff --git a/bin/make_stats.py b/bin/make_stats.py index 8a51181..7a0bf26 100755 --- a/bin/make_stats.py +++ b/bin/make_stats.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import csv import sys diff --git a/bin/map_uniprot.py b/bin/map_uniprot.py index d556f73..dd74a16 100644 --- a/bin/map_uniprot.py +++ b/bin/map_uniprot.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys from ensembl2uniprot import ensembl2uniprot diff --git a/bin/oma2uniprot_local.py b/bin/oma2uniprot_local.py index 95b2213..19c605b 100755 --- a/bin/oma2uniprot_local.py +++ b/bin/oma2uniprot_local.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import gzip import sys diff --git a/bin/plot_orthologs.R b/bin/plot_orthologs.R index c533d75..34c7219 100755 --- a/bin/plot_orthologs.R +++ b/bin/plot_orthologs.R @@ -1,5 +1,8 @@ #!/usr/bin/env Rscript +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + suppressMessages(library(ggplot2)) suppressMessages(library(reshape2)) suppressMessages(library(dplyr)) @@ -15,6 +18,7 @@ if (length(args) < 2) { # Styles text_color <- "#DDDDDD" bg_color <- "transparent" +font_size <- 16 # Load the data data <- read.csv(args[1], header = TRUE, 
stringsAsFactors = FALSE) @@ -38,9 +42,9 @@ p <- ggplot(melted_crosstable, aes(x = method, y = count, fill = score)) + labs(title = "Support for predictions", x = "Database", y = "Number of orthologs", fill = "Support") + scale_fill_manual(values = c("#59B4C3", "#74E291", "#8F7AC2", "#EFF396", "#FF9A8D")) + theme(legend.position = "right", - text = element_text(size = 12, color = text_color), - axis.text.x = element_text(color = text_color), - axis.text.y = element_text(color = text_color), + text = element_text(size = font_size, color = text_color), + axis.text.x = element_text(size = font_size, color = text_color), + axis.text.y = element_text(size = font_size, color = text_color), plot.background = element_rect(color = bg_color, fill = bg_color), panel.background = element_rect(color = bg_color, fill = bg_color)) @@ -54,7 +58,7 @@ for (i in colnames(data)[4:ncol(data)-1]) { } venn.plot <- ggVennDiagram(venn.data, set_color = text_color) + theme(legend.position = "none", - text = element_text(size = 12, color = text_color), + text = element_text(size = font_size, color = text_color), plot.background = element_rect(color = bg_color, fill = bg_color), panel.background = element_rect(color = bg_color, fill = bg_color)) ggsave(paste0(args[2], "_venn.png"), plot = venn.plot, width = 6, height = 6, dpi = 300) @@ -81,9 +85,9 @@ p <- ggplot(jaccard, aes(x = method1, y = method2, fill = jaccard)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(title = "Jaccard Index", x = "", y = "", fill = "Jaccard Index") + theme(legend.position = "right", - text = element_text(size = 12, color = text_color), - axis.text.x = element_text(color = text_color), - axis.text.y = element_text(color = text_color), + text = element_text(size = font_size, color = text_color), + axis.text.x = element_text(size = font_size, color = text_color), + axis.text.y = element_text(size = font_size, color = text_color), plot.background = element_rect(color = bg_color, fill = bg_color), panel.background = element_rect(color = bg_color, fill = bg_color)) diff --git a/bin/plot_tree.R b/bin/plot_tree.R index 945ff90..dc92ab6 100755 --- a/bin/plot_tree.R +++ b/bin/plot_tree.R @@ -1,5 +1,8 @@ #!/usr/bin/env Rscript +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + library(treeio) library(ggtree) library(ggplot2) diff --git a/bin/refseq2uniprot.py b/bin/refseq2uniprot.py index fa62edd..fe3ef0d 100644 --- a/bin/refseq2uniprot.py +++ b/bin/refseq2uniprot.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys import requests diff --git a/bin/score_hits.py b/bin/score_hits.py index aa4ccee..7ad39cc 100755 --- a/bin/score_hits.py +++ b/bin/score_hits.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import csv import sys diff --git a/bin/uniprot2oma_local.py b/bin/uniprot2oma_local.py index f816bb0..ee97ca3 100755 --- a/bin/uniprot2oma_local.py +++ b/bin/uniprot2oma_local.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import gzip import sys diff --git a/bin/uniprot2uniprot.py b/bin/uniprot2uniprot.py index a7c0e01..dbe3242 100644 --- a/bin/uniprot2uniprot.py +++ b/bin/uniprot2uniprot.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# 
Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys import requests diff --git a/bin/uniprotize_oma_local.py b/bin/uniprotize_oma_local.py index 16317d4..3e12da9 100755 --- a/bin/uniprotize_oma_local.py +++ b/bin/uniprotize_oma_local.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import gzip import sys diff --git a/bin/uniprotize_oma_online.py b/bin/uniprotize_oma_online.py index 9b9a6df..91f26e2 100755 --- a/bin/uniprotize_oma_online.py +++ b/bin/uniprotize_oma_online.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + import sys from map_uniprot import map_uniprot diff --git a/bin/utils.py b/bin/utils.py index cebe0e7..3bfc95a 100644 --- a/bin/utils.py +++ b/bin/utils.py @@ -1,3 +1,7 @@ +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details +# Includes code written by UniProt contributors published under CC-BY 4.0 license + import time from typing import Any diff --git a/bin/yml2csv.py b/bin/yml2csv.py new file mode 100755 index 0000000..27842b8 --- /dev/null +++ b/bin/yml2csv.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +# Written by Igor Trujnara, released under the MIT license +# See https://opensource.org/license/mit for details + +import sys + +import yaml + + +def main() -> None: + if len(sys.argv) < 4: + print("Usage: yml2csv.py <sample_id> <input_file> <output_file>") + sys.exit(1) + + sample_id = sys.argv[1] + input_file = sys.argv[2] + output_file = sys.argv[3] + + with open(input_file) as f: + data = yaml.safe_load(f) + + with open(output_file, "w") as f: + print("id,percent_max,percent_privates,goodness", file=f) + print(f"{sample_id},{data['percent_max']},{data['percent_privates']},{data['goodness']}", file=f) + +if __name__ == "__main__": + main() diff --git a/conf/modules.config b/conf/modules.config index 367a3a3..a81954b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -103,6 +103,24 @@ process { + withName: 'STATS2CSV' { + publishDir = [ + path: { "${params.outdir}/orthologs/stats" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'MERGE_STATS' { + ext.args = "-u NA" + ext.prefix = "aggregated_stats" + publishDir = [ + path: { "${params.outdir}/orthologs/stats" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ?
null : filename } + ] + } + // ---------------------- // Sequence alignment // ---------------------- diff --git a/conf/test.config b/conf/test.config index 2cf94b1..fc9ded3 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,10 +20,12 @@ params { max_time = '6.h' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet.csv' - // Genome references - genome = 'R64-1-1' + // Other parameters + uniprot_query = true + skip_eggnog = true + min_score = 3 + skip_iqtree = true + fastme_bootstrap = 0 } diff --git a/conf/test_full.config b/conf/test_full.config index 87e7fee..2f59347 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -14,11 +14,13 @@ params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' - // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet.csv' - // Genome references - genome = 'R64-1-1' + // Other parameters + uniprot_query = true + eggnog_path = 'http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/1/1_members.tsv.gz' + eggnog_idmap_path = "http://eggnog5.embl.de/download/eggnog_5.0/id_mappings/uniprot/latest.Eukaryota.tsv.gz" + min_score = 3 + use_structures = true } diff --git a/docs/output.md b/docs/output.md index ab52940..e9e6ece 100644 --- a/docs/output.md +++ b/docs/output.md @@ -2,58 +2,186 @@ ## Introduction -This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. +This document describes the output produced by the pipeline. Most of the plots are taken from the report, which summarizes results at the end of the pipeline. The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. 
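+As a rough orientation, the results directory may look like the following (an illustrative, partial sketch based on the sections below; the exact contents depend on the options used):
+
+```text
+results/
+├── orthologs/      # per-database hits, merged table, score table, filtered lists, plots, stats
+├── sequences/      # ortholog sequences (FASTA) and, optionally, AlphaFold structures
+├── alignment/      # multiple sequence alignment
+├── trees/          # IQTREE and/or FastME phylogenies and plots
+├── <id>_dist/      # self-contained report application
+└── pipeline_info/  # execution reports and logs
+```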
- - ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +- [Query identification](#query-identification) - obtaining basic information on the query +- [Ortholog fetching](#ortholog-fetching) - obtaining ortholog predictions from public databases +- [Ortholog scoring](#ortholog-scoring) - creation of a score table +- [Ortholog filtering](#ortholog-filtering) - selection of the final ortholog list +- [Ortholog plotting](#ortholog-plotting) - creation of plots describing the predictions +- [Ortholog statistics](#ortholog-statistics) - calculation of several statistics about the predictions +- [Sequence fetching](#sequence-fetching) - obtaining ortholog sequences from public databases +- [Structure fetching](#structure-fetching) - obtaining ortholog structures from AlphaFoldDB +- [MSA](#msa) - alignment of ortholog sequences +- [Tree reconstruction](#tree-reconstruction) - creation of phylogenies with maximum likelihood (ML) or minimum evolution (ME) +- [Report generation](#report-generation) - creation of a human-readable report +- [Pipeline information](#pipeline-information) - basic information about the pipeline run + +### Query identification + +
+Output files -### FastQC +- `seqinfo/` + - `*_id.txt`: File containing the Uniprot identifier of the query or its closest BLAST hit. + - `*_taxid.txt`: File containing the NCBI taxon ID of the query/closest hit. + - `*_exact.txt`: File indicating whether the query itself was found in the database (`true`) or the result is the top BLAST hit (`false`). +
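+For instance, with the example samplesheet entry `BicD2,Q8TD16`, these files might contain (illustrative values):
+
+```text
+BicD2_id.txt     ->  Q8TD16
+BicD2_taxid.txt  ->  9606
+BicD2_exact.txt  ->  true
+```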
+ +Query information necessary for further steps is obtained here. If a sequence was passed, it is identified using [OMA](https://omabrowser.org). A Uniprot identifier is obtained, along with an indication of whether it is an exact match or the closest hit. For either query type, an NCBI taxon ID is then obtained using the OMA API. + +### Ortholog fetching
Output files -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +- `orthologs/` + - `[dbname]/` + - `*_[dbname]_group.csv`: A CSV file with the hits from the database. It has an additional column necessary for later merging. +
- +Ortholog predictions are fetched from the databases. Each database can be used either online or from a local snapshot, where the respective access mode is available. The databases currently supported are: -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +- OMA (online and local) +- PANTHER (online and local) +- OrthoInspector (online) +- EggNOG (local) -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +### Ortholog scoring -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +
+Output files -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +- `orthologs/` + - `merge_csv/` + - `*.csv`: A merged CSV file with predictions from all the databases. + - `score_table/` + - `*_score_table.csv`: A merged CSV with a score column added. The score is the number of databases supporting the prediction. +
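+Conceptually, the score table pairs each candidate ortholog with one support flag per source and the resulting score. A hypothetical row layout (the real header is written by `bin/make_score_table.py` and may differ):
+
+```csv
+id,OMA,PANTHER,OrthoInspector,score
+A0A087WTH1,1,1,0,2
+```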
-:::note -The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. -::: +At this step, the predictions are combined into a single table. They are also assigned a score which is used for later filtering. The score is the number of supporting sources. -### MultiQC +### Ortholog filtering
Output files -- `multiqc/` - - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `multiqc_plots/`: directory containing static images from the report in various formats. +- `orthologs/` + - `filter_hits/` + - `*_minscore_*.txt`: Lists of predictions passing different score thresholds, from 1 to the number of sources. For example, `BicD2_minscore_2.txt` would include orthologs of BicD2 supported by at least 2 sources. + - `*_centroid.txt`: A list of predictions from the source with the highest agreement with other sources. + - `*_filtered_hits.txt`: The final list of orthologs, chosen based on user-defined criteria. +
- +In this step, the predictions are split into lists, one per minimum score threshold, each reflecting a level of support. Additionally, the source with the highest total agreement with the other sources is identified. + +The final list of orthologs is determined in one of two ways. If `--use_centroid` is set, the predictions of the highest-agreement source are used. Otherwise, orthologs with a score of at least `--min_score` are used. + +### Ortholog plotting + +
+Output files + +- `orthologs/` + - `plots/` + - `*_supports.png`: A bar plot representing the number of predictions from each source and the support of the predictions. + - `*_venn.png`: A Venn diagram representing the intersections between databases. + - `*_jaccard.png`: A tile plot representing the Jaccard index (pairwise agreement) between databases. +
+ +Plots summarizing the predictions (per-source support, set overlaps and pairwise agreement) are generated using `ggplot2`. + +### Ortholog statistics + +
+Output files + +- `orthologs/` + - `stats/` + - `*_stats.yml`: A YAML file containing ortholog statistics. +
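+A minimal sketch of one such `*_stats.yml` (the values are hypothetical; the keys are the ones consumed by `bin/yml2csv.py`):
+
+```yaml
+percent_max: 0.62
+percent_privates: 0.11
+goodness: 0.78
+```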
+ +The following statistics of the predictions are calculated: + +- percentage of consensus - the fraction of predictions which are supported by all the sources +- percentage of privates - the fraction of predictions which are supported by only one source +- goodness - the ratio of the actual sum of scores to the theoretical maximum (i.e. the number of databases times the number of predictions). + +### Sequence fetching + +
+Output files + +- `sequences/` + - `*_orthologs.fa`: A FASTA file containing all ortholog sequences that could be found. + - `*_seq_hits.txt`: The list of all orthologs whose sequence was found. + - `*_seq_misses.txt`: The list of all orthologs whose sequence was not found. +
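+The fallback logic described below can be pictured with a simplified sketch (this is not the pipeline's `bin/fetch_sequences.py`; the OMA and Uniprot REST endpoints shown are their public APIs):
+
+```python
+import requests
+
+
+def fetch_fasta(acc: str) -> str | None:
+    """Try the OMA API first, then fall back to Uniprot; return FASTA text or None."""
+    oma = requests.get(f"https://omabrowser.org/api/protein/{acc}/", timeout=30)
+    if oma.ok:
+        seq = oma.json().get("sequence")
+        if seq:
+            return f">{acc}\n{seq}\n"
+    # Not in OMA: ask Uniprot for the FASTA record directly
+    uni = requests.get(f"https://rest.uniprot.org/uniprotkb/{acc}.fasta", timeout=30)
+    return uni.text if uni.ok else None  # None counts as a miss
+```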
+ +If downstream analysis is performed, the protein sequences of all orthologs are fetched in FASTA format. The primary source of sequences is [OMA](http://omabrowser.org) due to its fast API. IDs not found in OMA are sent to [Uniprot](http://uniprot.org). Anything not found in Uniprot is considered a miss. + +### Structure fetching + +
+Output files + +- `sequences/` + - `*.pdb`: PDB files with structures of the orthologs, obtained from AlphaFoldDB. + - `*_af_versions.txt`: Versions of the AlphaFold structures. + - `*_str_hits.txt`: The list of all orthologs whose structure was found. + - `*_str_misses.txt`: The list of all orthologs whose structure was not found. +
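+For orientation, AlphaFoldDB models can be retrieved by Uniprot accession; a minimal sketch (not the pipeline's `bin/fetch_afdb_structures.py`; the `model_v4` file version is an assumption):
+
+```python
+import requests
+
+acc = "Q8TD16"  # example accession
+url = f"https://alphafold.ebi.ac.uk/files/AF-{acc}-F1-model_v4.pdb"
+resp = requests.get(url, timeout=60)
+if resp.ok:
+    with open(f"{acc}.pdb", "w") as handle:
+        handle.write(resp.text)  # PDB models are plain text
+```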
+ +If `--use_structures` is set, structures of the orthologs used in the alignment are obtained from AlphaFoldDB. For a discussion of the suitability of AlphaFold structures for MSA, see [Baltzis et al. 2022](http://doi.org/10.1093/bioinformatics/btac625). + +### MSA + +
+Output files + +- `alignment/` + - `*.aln`: A multiple sequence alignment of the orthologs in Clustal format. +
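+As a point of reference, the sequence-only mode is conceptually similar to a default T-COFFEE run (a sketch; the pipeline calls T-COFFEE through its nf-core module with additional arguments):
+
+```bash
+# A default run writes a Clustal-format .aln next to the input FASTA
+t_coffee BicD2_orthologs.fa
+```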
+ +Multiple sequence alignment is performed using [T-COFFEE](https://tcoffee.org). 3D-COFFEE mode is used if `--use_structures` is set; otherwise, the default mode is used. + +### Tree reconstruction + +
+Output files + +- `trees/` + - `iqtree/` + - `*.treefile`: The IQTREE phylogeny in Newick format. + - `*.ufboot`: Bootstrap trees, if generated. + - `fastme/` + - `*.nwk`: The FastME phylogeny in Newick format. + - `*.bootstrap`: The bootstrap trees, if generated. + - `plots/` + - `*_iqtree_tree.png`: The IQTREE phylogeny as an image. + - `*_fastme_tree.png`: The FastME phylogeny as an image. +
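+For orientation, the maximum-likelihood step corresponds conceptually to an IQ-TREE 2 run like the following (a sketch; the pipeline uses the nf-core IQTREE module, and `-B` is IQ-TREE 2's ultrafast bootstrap option, mirrored by `--iqtree_bootstrap`):
+
+```bash
+iqtree2 -s BicD2_orthologs.aln -B 1000
+```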
+ +The phylogeny can be constructed using maximum likelihood ([IQTREE](http://www.iqtree.org/)) or minimum evolution ([FastME](http://www.atgc-montpellier.fr/fastme/)). + +### Report generation + +
+Output files -[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +- `*_dist/` + - `*.html`: The report in HTML format. + - `run.sh`: A script to correctly open the report. + - Other files necessary for the report. +
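+For example, for a sample named `BicD2` the report can be served locally like this (per `modules/local/make_report.nf`, `run.sh` simply starts a Python HTTP server):
+
+```bash
+cd BicD2_dist
+bash run.sh   # runs: python3 -m http.server 0 (port 0 picks a free port)
+# then open http://localhost:<printed port> in a browser
+```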
-Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +The report is generated in the form of a React application. It must be served on localhost to work correctly; this can be done manually or with the provided run script. ### Pipeline information diff --git a/docs/usage.md b/docs/usage.md index f673563..fbf1216 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -10,45 +10,34 @@ ## Samplesheet input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 2 columns, and a header row as shown in the examples below. ```bash --input '[path to samplesheet file]' ``` -### Multiple runs of the same sample +### Full samplesheet + +The samplesheet can have as many columns as you desire; however, there is a strict requirement for the first 2 columns to match those defined in the table below. -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: +A final samplesheet file may look something like the one below, with `--uniprot_query` enabled: ```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz +id,query +BicD2,Q8TD16 ``` -### Full samplesheet - -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. - -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. +or the one below, otherwise: ```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +id,query +BicD2,/home/myuser/data/bicd2.fa ``` -| Column | Description | -| --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. 
This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| Column | Description | +| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `id` | User-defined identifier. It is used to identify output files for the protein. Can be anything descriptive, as long as it does not contain spaces. | +| `query` | The query of the user-specified type. If `--uniprot_query` is `true`, it should be a valid Uniprot accession. Otherwise, it should be a valid path to a FASTA file. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. @@ -57,7 +46,7 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/reportho --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run nf-core/reportho --input ./samplesheet.csv --outdir ./results -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -90,7 +79,6 @@ with `params.yaml` containing: ```yaml input: './samplesheet.csv' outdir: './results/' -genome: 'GRCh37' <...> ``` @@ -112,7 +100,7 @@ First, go to the [nf-core/reportho releases page](https://github.com/nf-core/rep This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports. -To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. +To further assist in reproducibility, you can share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. :::tip If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. diff --git a/modules.json b/modules.json index a309b53..1e87009 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "csvtk/concat": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "csvtk/join": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", diff --git a/modules/local/convert_fasta.nf b/modules/local/convert_fasta.nf index 79cfe51..7b32950 100644 --- a/modules/local/convert_fasta.nf +++ b/modules/local/convert_fasta.nf @@ -1,6 +1,6 @@ process CONVERT_FASTA { tag "$input_file" - label "process_single" + label 'process_single' conda "conda-forge::python=3.11.0 conda-forge::biopython=1.83.0 conda-forge::requests=2.31.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
diff --git a/modules/local/convert_phylip.nf b/modules/local/convert_phylip.nf index 1591ac6..11dab37 100644 --- a/modules/local/convert_phylip.nf +++ b/modules/local/convert_phylip.nf @@ -1,6 +1,6 @@ process CONVERT_PHYLIP { tag "$input_file" - label "process_single" + label 'process_single' conda "conda-forge::python=3.11.0 conda-forge::biopython=1.83.0 conda-forge::requests=2.31.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/create_tcoffeetemplate.nf b/modules/local/create_tcoffeetemplate.nf index 3d845fb..071c3bf 100644 --- a/modules/local/create_tcoffeetemplate.nf +++ b/modules/local/create_tcoffeetemplate.nf @@ -2,6 +2,10 @@ process CREATE_TCOFFEETEMPLATE { tag "$meta.id" label 'process_low' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + input: tuple val(meta), path(accessory_informations) diff --git a/modules/local/dump_params.nf b/modules/local/dump_params.nf index 0406a15..f354fe2 100644 --- a/modules/local/dump_params.nf +++ b/modules/local/dump_params.nf @@ -1,6 +1,10 @@ process DUMP_PARAMS { tag "$meta.id" - label "process_single" + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" input: tuple val(meta), path(exact) diff --git a/modules/local/fetch_afdb_structures.nf b/modules/local/fetch_afdb_structures.nf index 9f3d04b..d560887 100644 --- a/modules/local/fetch_afdb_structures.nf +++ b/modules/local/fetch_afdb_structures.nf @@ -1,6 +1,6 @@ process FETCH_AFDB_STRUCTURES { tag "$meta.id" - label "process_single" + label 'process_single' conda "conda-forge::python=3.11.0 conda-forge::biopython=1.83.0 conda-forge::requests=2.31.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/fetch_eggnog_group_local.nf b/modules/local/fetch_eggnog_group_local.nf index a227132..32df3da 100644 --- a/modules/local/fetch_eggnog_group_local.nf +++ b/modules/local/fetch_eggnog_group_local.nf @@ -1,6 +1,11 @@ process FETCH_EGGNOG_GROUP_LOCAL { tag "$meta.id" - label "process_short" + label 'process_single' + + conda "conda-forge::python=3.11.0 conda-forge::biopython=1.83.0 conda-forge::requests=2.31.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-bc54124b36864a4af42a9db48b90a404b5869e7e:5258b8e5ba20587b7cbf3e942e973af5045a1e59-0' : + 'biocontainers/mulled-v2-bc54124b36864a4af42a9db48b90a404b5869e7e:5258b8e5ba20587b7cbf3e942e973af5045a1e59-0' }" input: tuple val(meta), path(uniprot_id), path(taxid), path(exact) diff --git a/modules/local/fetch_oma_group_local.nf b/modules/local/fetch_oma_group_local.nf index 4c3d231..db08c55 100644 --- a/modules/local/fetch_oma_group_local.nf +++ b/modules/local/fetch_oma_group_local.nf @@ -1,6 +1,11 @@ process FETCH_OMA_GROUP_LOCAL { tag "$meta.id" - label "process_short" + label 'process_single' + + conda "conda-forge::python=3.11.0 conda-forge::biopython=1.83.0 conda-forge::requests=2.31.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-bc54124b36864a4af42a9db48b90a404b5869e7e:5258b8e5ba20587b7cbf3e942e973af5045a1e59-0' : + 'biocontainers/mulled-v2-bc54124b36864a4af42a9db48b90a404b5869e7e:5258b8e5ba20587b7cbf3e942e973af5045a1e59-0' }" input: tuple val(meta), path(uniprot_id), path(taxid), path(exact) diff --git a/modules/local/fetch_panther_group_local.nf b/modules/local/fetch_panther_group_local.nf index 42948e5..aa178dd 100644 --- a/modules/local/fetch_panther_group_local.nf +++ b/modules/local/fetch_panther_group_local.nf @@ -1,6 +1,11 @@ process FETCH_PANTHER_GROUP_LOCAL { tag "$meta.id" - label "process_short" + label 'process_single' + + conda "conda-forge::python=3.10.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.10' : + 'biocontainers/python:3.10' }" input: tuple val(meta), path(uniprot_id), path(taxid), path(exact) diff --git a/modules/local/fetch_sequences_online.nf b/modules/local/fetch_sequences_online.nf index 2026c66..5242abe 100644 --- a/modules/local/fetch_sequences_online.nf +++ b/modules/local/fetch_sequences_online.nf @@ -1,6 +1,6 @@ process FETCH_SEQUENCES_ONLINE { tag "${meta.id}" - label "process_single" + label 'process_single' conda "conda-forge::python=3.11.0 conda-forge::biopython=1.83.0 conda-forge::requests=2.31.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/filter_fasta.nf b/modules/local/filter_fasta.nf index fa69e30..ecfd20e 100644 --- a/modules/local/filter_fasta.nf +++ b/modules/local/filter_fasta.nf @@ -1,6 +1,11 @@ process FILTER_FASTA { tag "$meta.id" - label "process_single" + label 'process_single' + + conda "conda-forge::python=3.11.0 conda-forge::biopython=1.83.0 conda-forge::requests=2.31.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-bc54124b36864a4af42a9db48b90a404b5869e7e:5258b8e5ba20587b7cbf3e942e973af5045a1e59-0' : + 'biocontainers/mulled-v2-bc54124b36864a4af42a9db48b90a404b5869e7e:5258b8e5ba20587b7cbf3e942e973af5045a1e59-0' }" input: tuple val(meta), path(fasta), path(structures) diff --git a/modules/local/make_report.nf b/modules/local/make_report.nf index 67a8d57..4d5aacd 100644 --- a/modules/local/make_report.nf +++ b/modules/local/make_report.nf @@ -1,6 +1,6 @@ process MAKE_REPORT { tag "$meta.id" - label "process_single" + label 'process_single' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'docker://itrujnara/orthologs-report:1.0.0' : @@ -48,7 +48,7 @@ process MAKE_REPORT { $iqtree_cmd $fastme_cmd yarn run build - echo "python3 -m http.server 0" > dist/${prefix}_run.sh + echo "python3 -m http.server 0" > dist/run.sh mv dist ${prefix}_dist cat <<- END_VERSIONS > versions.yml diff --git a/modules/local/make_stats.nf b/modules/local/make_stats.nf index a62e9f1..f1e7b3d 100644 --- a/modules/local/make_stats.nf +++ b/modules/local/make_stats.nf @@ -1,6 +1,6 @@ process MAKE_STATS { tag "$meta.id" - label "process_single" + label 'process_single' conda "conda-forge::python=3.11.0 conda-forge::biopython=1.83.0 conda-forge::requests=2.31.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
diff --git a/modules/local/plot_tree.nf b/modules/local/plot_tree.nf index 509bd59..238df56 100644 --- a/modules/local/plot_tree.nf +++ b/modules/local/plot_tree.nf @@ -1,6 +1,6 @@ process PLOT_TREE { tag "$meta.id" - label "process_single" + label 'process_single' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'docker://itrujnara/plot-tree:1.0.0' : diff --git a/modules/local/stats2csv.nf b/modules/local/stats2csv.nf new file mode 100644 index 0000000..362ff42 --- /dev/null +++ b/modules/local/stats2csv.nf @@ -0,0 +1,31 @@ +process STATS2CSV { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::python=3.11.0 conda-forge::pyyaml=5.4.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-deac90960ddeb4d14fb31faf92c0652d613b3327:10b46d090d02e9e22e206db80d14e994267520c3-0' : + 'biocontainers/mulled-v2-deac90960ddeb4d14fb31faf92c0652d613b3327:10b46d090d02e9e22e206db80d14e994267520c3-0' }" + + input: + tuple val(meta), path(stats) + + output: + tuple val(meta), path("*_stats.csv"), emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: meta.id + """ + yml2csv.py ${meta.id} $stats ${prefix}_stats.csv + + cat <<- END_VERSIONS > versions.yml + "${task.process}": + Python: \$(python --version | cut -d ' ' -f 2) + PyYAML: \$(pip show pyyaml | grep Version | cut -d ' ' -f 2) + END_VERSIONS + """ +} diff --git a/modules/nf-core/csvtk/concat/environment.yml b/modules/nf-core/csvtk/concat/environment.yml new file mode 100644 index 0000000..ed1ba26 --- /dev/null +++ b/modules/nf-core/csvtk/concat/environment.yml @@ -0,0 +1,7 @@ +name: csvtk_concat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::csvtk=0.23.0 diff --git a/modules/nf-core/csvtk/concat/main.nf b/modules/nf-core/csvtk/concat/main.nf new file mode 100644 index 0000000..16e59f6 --- /dev/null +++ b/modules/nf-core/csvtk/concat/main.nf @@ -0,0 +1,43 @@ +process CSVTK_CONCAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/csvtk:0.23.0--h9ee0642_0' : + 'biocontainers/csvtk:0.23.0--h9ee0642_0' }" + + input: + tuple val(meta), path(csv) + val in_format + val out_format + + output: + tuple val(meta), path("${prefix}.${out_extension}"), emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def delimiter = in_format == "tsv" ? "\t" : (in_format == "csv" ? "," : in_format) + def out_delimiter = out_format == "tsv" ? "\t" : (out_format == "csv" ? "," : out_format) + out_extension = out_format == "tsv" ? 
'tsv' : 'csv' + """ + csvtk \\ + concat \\ + $args \\ + --num-cpus $task.cpus \\ + --delimiter "${delimiter}" \\ + --out-delimiter "${out_delimiter}" \\ + --out-file ${prefix}.${out_extension} \\ + $csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/csvtk/concat/meta.yml b/modules/nf-core/csvtk/concat/meta.yml new file mode 100644 index 0000000..5f53229 --- /dev/null +++ b/modules/nf-core/csvtk/concat/meta.yml @@ -0,0 +1,49 @@ +name: csvtk_concat +description: Concatenate two or more CSV (or TSV) tables into a single table +keywords: + - concatenate + - tsv + - csv +tools: + - csvtk: + description: A cross-platform, efficient, practical CSV/TSV toolkit + homepage: http://bioinf.shenwei.me/csvtk + documentation: http://bioinf.shenwei.me/csvtk + tool_dev_url: https://github.com/shenwei356/csvtk + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - csv: + type: file + description: CSV/TSV formatted files + pattern: "*.{csv,tsv}" + - in_format: + type: string + description: Input format (csv, tab, or a delimiting character) + pattern: "*" + - out_format: + type: string + description: Output format (csv, tab, or a delimiting character) + pattern: "*" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "version.yml" + - csv: + type: file + description: Concatenated CSV/TSV file + pattern: "*.{csv,tsv}" +authors: + - "@rpetit3" +maintainers: + - "@rpetit3" diff --git a/nextflow.config b/nextflow.config index 11eef6e..6c195f4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,32 +13,34 @@ params { uniprot_query = false // Ortholog options - use_all = true + use_all = false local_databases = false - use_oma = true + skip_oma = false oma_path = null oma_uniprot_path = null oma_ensembl_path = null oma_refseq_path = null - use_panther = true + skip_panther = false panther_path = null - use_inspector = true - inspector_path = null - inspector_version = 'Eukaryota2023' - use_eggnog = true + skip_orthoinspector = false + orthoinspector_path = null + orthoinspector_version = 'Eukaryota2023' + skip_eggnog = false eggnog_path = null eggnog_idmap_path = null use_centroid = false min_score = 2 + skip_orthoplots = false // Downstream analysis options skip_downstream = false skip_report = false use_structures = false - use_iqtree = true - use_fastme = false - iqtree_bootstrap = 100 + skip_iqtree = false + skip_fastme = false + iqtree_bootstrap = 1000 fastme_bootstrap = 100 + skip_treeplots = false // Boilerplate options outdir = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 4f183c9..2fb4c2f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -52,7 +52,7 @@ "properties": { "use_all": { "type": "boolean", - "default": "true", + "default": "false", "description": "Use all ortholog search methods. Will mix online and local methods if needed. 
Overrides all individual database flags.", "help_text": "If set to `true`, the pipeline will use all ortholog search methods.", "fa_icon": "fas fa-database" }, @@ -64,11 +64,11 @@ "help_text": "If set to `true`, the pipeline will use local databases for the analysis.", "fa_icon": "fas fa-database" }, - "use_oma": { + "skip_oma": { "type": "boolean", - "default": "true", - "description": "Use OMA for the ortholog search.", - "help_text": "If set to `true`, the pipeline will use OMA for the ortholog search.", + "default": "false", + "description": "Skip using OMA for the ortholog search.", + "help_text": "If set to `true`, the pipeline will not use OMA for the ortholog search.", "fa_icon": "fas fa-database" }, "oma_path": { @@ -99,11 +99,11 @@ "help_text": "If `local_databases` is set to `true`, the pipeline will use this path to the OMA-RefSeq ID map.", "fa_icon": "fas fa-database" }, - "use_panther": { + "skip_panther": { "type": "boolean", - "default": "true", - "description": "Use PANTHER for the ortholog search.", - "help_text": "If set to `true`, the pipeline will use PANTHER for the ortholog search.", + "default": "false", + "description": "Skip using PANTHER for the ortholog search.", + "help_text": "If set to `true`, the pipeline will not use PANTHER for the ortholog search.", "fa_icon": "fas fa-database" }, "panther_path": { @@ -113,32 +113,32 @@ "help_text": "If `local_databases` is set to `true`, the pipeline will use this path to the PANTHER database.", "fa_icon": "fas fa-database" }, - "use_inspector": { + "skip_orthoinspector": { "type": "boolean", - "default": "true", - "description": "Use OrthoInspector for the ortholog search.", - "help_text": "If set to `true`, the pipeline will use OrthoInspector for the ortholog search.", + "default": "false", + "description": "Skip using OrthoInspector for the ortholog search.", + "help_text": "If set to `true`, the pipeline will not use OrthoInspector for the ortholog search.", "fa_icon": "fas fa-database" }, - "inspector_version": { + "orthoinspector_version": { "type": "string", "description": "The version of the OrthoInspector database to use.", "help_text": "This SHOULD be left as the default if working with eukaryotes. Only change if working with bacteria, or an old version is required for reproducibility.", "default": "Eukaryota2023", "fa_icon": "fas fa-database" }, - "inspector_path": { + "orthoinspector_path": { "type": "string", "format": "path", "description": "Path to the OrthoInspector database.", "help_text": "If `local_databases` is set to `true`, the pipeline will use this path to the OrthoInspector database.", "fa_icon": "fas fa-database" }, - "use_eggnog": { + "skip_eggnog": { "type": "boolean", - "default": "true", + "default": "false", - "description": "Use EggNOG for the ortholog search.", + "description": "Skip using EggNOG for the ortholog search.", - "help_text": "If set to `true`, the pipeline will use EggNOG for the ortholog search.", + "help_text": "If set to `true`, the pipeline will not use EggNOG for the ortholog search.", "fa_icon": "fas fa-database" }, "eggnog_path": { @@ -168,6 +168,13 @@ "description": "Minimum score for the ortholog search.", "help_text": "The minimum score for the ortholog search. 
If `use_centroid` is set to `true`, this parameter will be ignored.", "fa_icon": "fas fa-database" + }, + "skip_orthoplots": { + "type": "boolean", + "default": "false", + "description": "Skip the ortholog plots.", + "help_text": "If set to `true`, the pipeline will skip the ortholog plots.", + "fa_icon": "fas fa-database" } } }, @@ -198,23 +205,23 @@ "help_text": "If set to `true`, the pipeline will use AlphaFold structures for the analysis.", "fa_icon": "fas fa-dna" }, - "use_iqtree": { + "skip_iqtree": { "type": "boolean", - "default": "true", - "description": "Use IQ-TREE for the phylogenetic analysis.", - "help_text": "If set to `true`, the pipeline will use IQ-TREE for the phylogenetic analysis.", + "default": "false", + "description": "Skip using IQ-TREE for the phylogenetic analysis.", + "help_text": "If set to `true`, the pipeline will not use IQ-TREE for the phylogenetic analysis.", "fa_icon": "fas fa-tree" }, - "use_fastme": { + "skip_fastme": { "type": "boolean", "default": "false", - "description": "Use FastME for the phylogenetic analysis.", - "help_text": "If set to `true`, the pipeline will use FastME for the phylogenetic analysis.", + "description": "Skip using FastME for the phylogenetic analysis.", + "help_text": "If set to `true`, the pipeline will not use FastME for the phylogenetic analysis.", "fa_icon": "fas fa-tree" }, "iqtree_bootstrap": { "type": "integer", - "default": 100, + "default": 1000, "description": "Number of bootstrap replicates for IQ-TREE.", "help_text": "If set to `0`, bootstrap will not be performed.", "fa_icon": "fas fa-rotate" @@ -225,6 +232,13 @@ "description": "Number of bootstrap replicates for FastME.", "help_text": "If set to `0`, bootstrap will not be performed.", "fa_icon": "fas fa-rotate" + }, + "skip_treeplots": { + "type": "boolean", + "default": "false", + "description": "Skip the tree plots.", + "help_text": "If set to `true`, the pipeline will skip the tree plots.", + "fa_icon": "fas fa-tree" } } }, diff --git a/subworkflows/local/get_orthologs.nf b/subworkflows/local/get_orthologs.nf index 183f75a..4db3d00 100644 --- a/subworkflows/local/get_orthologs.nf +++ b/subworkflows/local/get_orthologs.nf @@ -14,6 +14,8 @@ include { MAKE_SCORE_TABLE } from "../../modules/local/make_score_ta include { FILTER_HITS } from "../../modules/local/filter_hits" include { PLOT_ORTHOLOGS } from "../../modules/local/plot_orthologs" include { MAKE_STATS } from "../../modules/local/make_stats" +include { STATS2CSV } from "../../modules/local/stats2csv" +include { CSVTK_CONCAT as MERGE_STATS } from "../../modules/nf-core/csvtk/concat/main" workflow GET_ORTHOLOGS { take: @@ -119,7 +121,7 @@ workflow GET_ORTHOLOGS { // OrthoInspector FETCH_INSPECTOR_GROUP_ONLINE ( ch_query, - params.inspector_version + params.orthoinspector_version ) ch_orthogroups @@ -147,7 +149,7 @@ workflow GET_ORTHOLOGS { } else { // online/local separation is used // local only if (params.local_databases) { - if (params.use_oma) { + if (!params.skip_oma) { FETCH_OMA_GROUP_LOCAL ( ch_query, params.oma_path, @@ -165,7 +167,7 @@ workflow GET_ORTHOLOGS { .set { ch_versions } } - if (params.use_panther) { + if (!params.skip_panther) { FETCH_PANTHER_GROUP_LOCAL ( ch_query, params.panther_path @@ -180,7 +182,7 @@ workflow GET_ORTHOLOGS { .set { ch_versions } } - if(params.use_eggnog) { + if(!params.skip_eggnog) { FETCH_EGGNOG_GROUP_LOCAL ( ch_query, params.eggnog_path, @@ -198,7 +200,7 @@ workflow GET_ORTHOLOGS { } } else { // online only - if (params.use_oma) { + if (!params.skip_oma) { 
FETCH_OMA_GROUP_ONLINE ( ch_query ) @@ -212,7 +214,7 @@ workflow GET_ORTHOLOGS { .set { ch_versions } } - if (params.use_panther) { + if (!params.skip_panther) { FETCH_PANTHER_GROUP_ONLINE ( ch_query ) @@ -225,10 +227,10 @@ workflow GET_ORTHOLOGS { .mix(FETCH_PANTHER_GROUP_ONLINE.out.versions) .set { ch_versions } } - if (params.use_inspector) { + if (!params.skip_orthoinspector) { FETCH_INSPECTOR_GROUP_ONLINE ( ch_query, - params.inspector_version + params.orthoinspector_version ) ch_orthogroups @@ -242,6 +244,8 @@ workflow GET_ORTHOLOGS { } } + // Result merging + MERGE_CSV ( ch_orthogroups.groupTuple() ) @@ -250,6 +254,8 @@ workflow GET_ORTHOLOGS { .mix(MERGE_CSV.out.versions) .set { ch_versions } + // Scoring and filtering + MAKE_SCORE_TABLE ( MERGE_CSV.out.csv ) @@ -272,20 +278,56 @@ workflow GET_ORTHOLOGS { .mix(FILTER_HITS.out.versions) .set { ch_versions } - PLOT_ORTHOLOGS ( + // Plotting + + ch_supportsplot = ch_query.map { [it[0], []]} + ch_vennplot = ch_query.map { [it[0], []]} + ch_jaccardplot = ch_query.map { [it[0], []]} + + if(!params.skip_orthoplots) { + PLOT_ORTHOLOGS ( + MAKE_SCORE_TABLE.out.score_table + ) + + ch_supportsplot = PLOT_ORTHOLOGS.out.supports + ch_vennplot = PLOT_ORTHOLOGS.out.venn + ch_jaccardplot = PLOT_ORTHOLOGS.out.jaccard + + ch_versions + .mix(PLOT_ORTHOLOGS.out.versions) + .set { ch_versions } + } + + // Stats + + MAKE_STATS( MAKE_SCORE_TABLE.out.score_table ) ch_versions - .mix(PLOT_ORTHOLOGS.out.versions) + .mix(MAKE_STATS.out.versions) .set { ch_versions } - MAKE_STATS( - MAKE_SCORE_TABLE.out.score_table + STATS2CSV( + MAKE_STATS.out.stats ) ch_versions - .mix(MAKE_STATS.out.versions) + .mix(STATS2CSV.out.versions) + .set { ch_versions } + + ch_stats = STATS2CSV.out.csv + .collect { it[1] } + .map { [[id: "all"], it] } + + MERGE_STATS( + ch_stats, + "csv", + "csv" + ) + + ch_versions + .mix(MERGE_STATS.out.versions) .set { ch_versions } ch_versions @@ -293,17 +335,18 @@ workflow GET_ORTHOLOGS { .set { ch_merged_versions } emit: - seqinfo = ch_query - id = ch_query.map { it[1] } - taxid = ch_query.map { it[2] } - exact = ch_query.map { it[3] } - orthogroups = ch_orthogroups - score_table = MAKE_SCORE_TABLE.out.score_table - orthologs = FILTER_HITS.out.filtered_hits - supports_plot = PLOT_ORTHOLOGS.out.supports - venn_plot = PLOT_ORTHOLOGS.out.venn - jaccard_plot = PLOT_ORTHOLOGS.out.jaccard - stats = MAKE_STATS.out.stats - versions = ch_merged_versions + seqinfo = ch_query + id = ch_query.map { it[1] } + taxid = ch_query.map { it[2] } + exact = ch_query.map { it[3] } + orthogroups = ch_orthogroups + score_table = MAKE_SCORE_TABLE.out.score_table + orthologs = FILTER_HITS.out.filtered_hits + supports_plot = ch_supportsplot + venn_plot = ch_vennplot + jaccard_plot = ch_jaccardplot + stats = MAKE_STATS.out.stats + aggregated_stats = MERGE_STATS.out.csv + versions = ch_merged_versions } diff --git a/subworkflows/local/make_trees.nf b/subworkflows/local/make_trees.nf index 34b75c4..b4743a0 100644 --- a/subworkflows/local/make_trees.nf +++ b/subworkflows/local/make_trees.nf @@ -16,7 +16,7 @@ workflow MAKE_TREES { ch_mlplot = Channel.empty() ch_meplot = Channel.empty() - if (params.use_iqtree) { + if (!params.skip_iqtree) { IQTREE ( ch_alignment, [] @@ -28,19 +28,23 @@ workflow MAKE_TREES { .mix(IQTREE.out.versions) .set { ch_versions } - PLOT_IQTREE ( - IQTREE.out.phylogeny, - "iqtree" - ) + ch_mlplot = ch_alignment.map { [it[0], []] } - ch_mlplot = PLOT_IQTREE.out.plot + if(!params.skip_treeplots) { + PLOT_IQTREE ( + IQTREE.out.phylogeny, + "iqtree" + ) 
- ch_versions - .mix(PLOT_IQTREE.out.versions) - .set { ch_versions } + ch_mlplot = PLOT_IQTREE.out.plot + + ch_versions + .mix(PLOT_IQTREE.out.versions) + .set { ch_versions } + } } - if (params.use_fastme) { + if (!params.skip_fastme) { CONVERT_PHYLIP ( ch_alignment @@ -60,16 +64,20 @@ workflow MAKE_TREES { .mix(FASTME.out.versions) .set { ch_versions } - PLOT_FASTME ( - FASTME.out.nwk, - "fastme" - ) + ch_meplot = ch_alignment.map { [it[0], []] } - ch_meplot = PLOT_FASTME.out.plot + if(!params.skip_treeplots) { + PLOT_FASTME ( + FASTME.out.nwk, + "fastme" + ) - ch_versions - .mix(PLOT_FASTME.out.versions) - .set { ch_versions } + ch_meplot = PLOT_FASTME.out.plot + + ch_versions + .mix(PLOT_FASTME.out.versions) + .set { ch_versions } + } } emit: diff --git a/subworkflows/local/report.nf b/subworkflows/local/report.nf index 9dadae5..a1ea745 100644 --- a/subworkflows/local/report.nf +++ b/subworkflows/local/report.nf @@ -20,17 +20,41 @@ workflow REPORT { ch_fastme main: - ch_versions = Channel.empty() + ch_versions = Channel.empty() + ch_fasta = ch_seqinfo.map { [it[0], []] } + + if(params.skip_downstream) { + ch_seqhits = ch_seqinfo.map { [it[0], []] } + ch_seqmisses = ch_seqinfo.map { [it[0], []] } + ch_strhits = ch_seqinfo.map { [it[0], []] } + ch_strmisses = ch_seqinfo.map { [it[0], []] } + ch_alignment = ch_seqinfo.map { [it[0], []] } + } + else if(!params.use_structures) { + ch_strhits = ch_seqinfo.map { [it[0], []] } + ch_strmisses = ch_seqinfo.map { [it[0], []] } + } + + if (params.skip_iqtree) { + ch_iqtree = ch_seqinfo.map { [it[0], []] } + } + if (params.skip_fastme) { + ch_fastme = ch_seqinfo.map { [it[0], []] } + } DUMP_PARAMS( ch_seqinfo.map { [it[0], it[3]] } ) - CONVERT_FASTA(ch_alignment) + if(!params.skip_downstream) { + CONVERT_FASTA(ch_alignment) - ch_versions - .mix(CONVERT_FASTA.out.versions) - .set { ch_versions } + ch_fasta = CONVERT_FASTA.out.fasta + + ch_versions + .mix(CONVERT_FASTA.out.versions) + .set { ch_versions } + } ch_forreport = ch_seqinfo .join(ch_scoretable, by:0) @@ -43,7 +67,7 @@ workflow REPORT { .join(ch_seqmisses, by:0) .join(ch_strhits, by:0) .join(ch_strmisses, by:0) - .join(CONVERT_FASTA.out.fasta, by:0) + .join(ch_fasta, by:0) .join(ch_iqtree, by:0) .join(ch_fastme, by:0) .join(DUMP_PARAMS.out.params, by:0)