From 3ec4a534f11d8445dfd7ab703f6b7022e9dea7ad Mon Sep 17 00:00:00 2001 From: jcw349 Date: Fri, 18 Jul 2025 17:45:55 -0700 Subject: [PATCH 1/2] feat(get_json_entry): empty clade names will inherit from parent node feat(write_json_from_mat): extension: nextclade added to "meta" can now export multiple levels of annotations --- src/matUtils/convert.cpp | 109 +++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 50 deletions(-) diff --git a/src/matUtils/convert.cpp b/src/matUtils/convert.cpp index e4df3ba7d..1ba1f9a4f 100644 --- a/src/matUtils/convert.cpp +++ b/src/matUtils/convert.cpp @@ -507,7 +507,7 @@ MAT::Tree load_mat_from_json(std::string json_filename) { return T; } -json get_json_entry(MAT::Node* n, std::vector>>* catmeta, size_t div = 0, bool use_clade_zero = false, bool use_clade_one = false) { +json get_json_entry(MAT::Node* n, std::vector>>* catmeta, size_t div = 0, std::vector use_clade_n = {false}, std::vector> parent_cav = {} ) { //each node has 3 constituent attributes //node_attrs, branch_attrs, and children. If its a leaf, //it also has a simple name attribute. @@ -532,35 +532,36 @@ json get_json_entry(MAT::Node* n, std::vectorclade_annotations.size() >= 1) { - c1av = n->clade_annotations[0]; - } - if (n->clade_annotations.size() >= 2) { - //json output supports two simultaneous clade annotations - //being nextstrain and pangolin, since this is very specific - //to sars-cov-2 phylogenomics. Additional fields are ignored - //at this point. - c1av = n->clade_annotations[1]; - } - std::unordered_map c1a {{"value",c1av}}; - std::unordered_map c2a {{"value",c2av}}; + + int annot_size = n->clade_annotations.size(); + std::vector> cav(annot_size); + for ( int c=0; c < annot_size; c++) { + + if (use_clade_n[c]) { + + if (n->clade_annotations[c] == ""){ + cav[c] = parent_cav[c]; + } else { + cav[c] = {{"value",n->clade_annotations[c]}}; + } + } + } + std::string country = n->identifier.substr(0, n->identifier.find("/")); std::string date = n->identifier.substr( n->identifier.find_last_of("|")+1, n->identifier.size() ); std::unordered_map com {{"value",country}}; std::unordered_map dam {{"value",date}}; if ((n->is_leaf()) && (country.length() != n->identifier.size()) && (date.length() != n->identifier.size()) ) { - sj["node_attrs"] = { {"country",com}, {"date",dam},{"div", div}, {"MAT_Clade_0", c1a}, {"MAT_Clade_1", c2a} }; + sj["node_attrs"] = { {"country",com}, {"date",dam},{"div", div}}; } else { sj["node_attrs"]["div"] = div; - if (use_clade_zero) { - sj["node_attrs"]["MAT_Clade_0"] = c1a; - } - if (use_clade_one) { - sj["node_attrs"]["MAT_Clade_1"] = c2a; - } } + + for (int c=0; c < annot_size; c++){ + if (use_clade_n[c]) + sj["node_attrs"]["MAT_Clade_"+std::to_string(c)] = cav[c]; + } + for (const auto& cmet: *catmeta) { for (const auto& cmi: cmet) { if (cmi.second.find(n->identifier) != cmi.second.end()) { @@ -573,7 +574,7 @@ json get_json_entry(MAT::Node* n, std::vectoridentifier; std::vector child_json; for (auto cn: n->children) { - json cj = get_json_entry(cn, catmeta, div, use_clade_zero, use_clade_one); + json cj = get_json_entry(cn, catmeta, div, use_clade_n, cav ); child_json.push_back(cj); sj["children"] = child_json; } @@ -585,6 +586,26 @@ void write_json_from_mat(MAT::Tree* T, std::string output_filename, std::vector< json nj; std::string desc = "JSON generated by matUtils. If you have metadata you wish to display, you can now drag on a CSV/TSV file and it will be added into this view, [see here](https://docs.nextstrain.org/projects/auspice/en/latest/advanced-functionality/drag-drop-csv-tsv.html) for more info."; std::unordered_map lm = {{"branch_label", "none"}}; + int annot_size = T->root->clade_annotations.size(); + + // check clade annotations that should be written + std::vector use_clades(annot_size,{false}); + + for (auto n: T->depth_first_expansion()) { + int idx = 0; + for (const auto& c: n->clade_annotations) { + ++idx; + + if (!c.empty()) { + use_clades[idx] = true; + } + } + if (std::all_of(use_clades.begin(), use_clades.end(), [](bool b) { return b;})) { + break; + } + } + + // set Nextclade extension configurations nj = { {"version","v2"}, { @@ -593,8 +614,8 @@ void write_json_from_mat(MAT::Tree* T, std::string output_filename, std::vector< {"filters",json::array({"country","userOrOld"})}, {"panels",json::array({"tree"})}, {"colorings",{ {{"key","country"},{"title","Country"},{"type","categorical"}} }}, - {"display_defaults",lm}, - {"description",desc} + {"display_defaults", lm}, + {"description", desc} } }, { @@ -604,6 +625,7 @@ void write_json_from_mat(MAT::Tree* T, std::string output_filename, std::vector< } } }; + //add metadata to the header colorings if any exist if (catmeta->size()>0) { for (const auto& cmet: *catmeta) { @@ -619,33 +641,19 @@ void write_json_from_mat(MAT::Tree* T, std::string output_filename, std::vector< } } } - //check whether each of the mat clade annotation fields are used by any sample. - bool uses_clade_0 = false; - bool uses_clade_1 = false; - for (auto n: T->depth_first_expansion()) { - if (n->clade_annotations.size() >= 1) { - if (n->clade_annotations[0] != "") { - uses_clade_0 = true; - } - if (n->clade_annotations.size() >= 2) { - if (n->clade_annotations[1] != "") { - uses_clade_1 = true; - } - } + + int idx = 0; + for (const bool& c: use_clades) { + if (c) { + nj["meta"]["extensions"]["nextclade"]["clade_node_attrs"].push_back({{"name","MAT_Clade_" + std::to_string(idx)},{"displayName","MAT_Clade_"+ std::to_string(idx+1)},{"description","MAT_Clade_" + std::to_string(idx+1) + "as inferred or proposed by UShER, matUtils, or Autolin."},{"hideInWeb",false},{"skipAsReference",true}} ); + + std::unordered_map cmap {{"key","MAT_Clade_" + std::to_string(idx)},{"title","MAT_Clade_" + std::to_string(idx+1)},{"type","categorical"}}; + nj["meta"]["colorings"].push_back(cmap); } - if ((uses_clade_0) && (uses_clade_1)) { - break; - } - } - if (uses_clade_0) { - std::unordered_map c1map {{"key","MAT_Clade_0"},{"title","MAT_Clade_1"},{"type","categorical"}}; - nj["meta"]["colorings"].push_back(c1map); + ++idx; } - if (uses_clade_1) { - std::unordered_map c2map {{"key","MAT_Clade_1"},{"title","MAT_Clade_2"},{"type","categorical"}}; - nj["meta"]["colorings"].push_back(c2map); - } - auto treestuff = get_json_entry(T->root, catmeta, 0, uses_clade_0, uses_clade_1); + + auto treestuff = get_json_entry(T->root, catmeta, 0, use_clades); nj["tree"]["children"] = json::array({treestuff}); std::ofstream out(output_filename); // out << std::setw(4) << nj << std::endl; @@ -653,6 +661,7 @@ void write_json_from_mat(MAT::Tree* T, std::string output_filename, std::vector< out.close(); } + void get_minimum_subtrees(MAT::Tree* T, std::vector samples, size_t nearest_subtree_size, std::string output_dir, std::vector>>* catmeta, std::string json_n, std::string newick_n, bool retain_original_branch_len) { //get the minimum set of subtrees of the indicated size which cover all input samples //and write them to the indicated output directory, with the indicated prefix, along with a tsv indicating which trees contain the relevant samples. From 6d39dff5c4eb0e9d95ea2bf018bea7a7584a3fa4 Mon Sep 17 00:00:00 2001 From: jcw349 Date: Fri, 18 Jul 2025 17:52:48 -0700 Subject: [PATCH 2/2] style(write_json_from_mat): removed white spaces --- src/matUtils/convert.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/matUtils/convert.cpp b/src/matUtils/convert.cpp index 1ba1f9a4f..440f1d7b4 100644 --- a/src/matUtils/convert.cpp +++ b/src/matUtils/convert.cpp @@ -614,8 +614,8 @@ void write_json_from_mat(MAT::Tree* T, std::string output_filename, std::vector< {"filters",json::array({"country","userOrOld"})}, {"panels",json::array({"tree"})}, {"colorings",{ {{"key","country"},{"title","Country"},{"type","categorical"}} }}, - {"display_defaults", lm}, - {"description", desc} + {"display_defaults",lm}, + {"description",desc} } }, { @@ -661,7 +661,6 @@ void write_json_from_mat(MAT::Tree* T, std::string output_filename, std::vector< out.close(); } - void get_minimum_subtrees(MAT::Tree* T, std::vector samples, size_t nearest_subtree_size, std::string output_dir, std::vector>>* catmeta, std::string json_n, std::string newick_n, bool retain_original_branch_len) { //get the minimum set of subtrees of the indicated size which cover all input samples //and write them to the indicated output directory, with the indicated prefix, along with a tsv indicating which trees contain the relevant samples.