gyorilab · bgyori · Apr 20, 2022 · Apr 20, 2022 · Apr 20, 2022 · Apr 20, 2022
diff --git a/benchmarks/fplx_evaluation.py b/benchmarks/fplx_evaluation.py
@@ -48,6 +48,8 @@
                       'integrin alpha': {'FPLX': 'ITGA'},
                       'DC': {'MESH': 'D003713'},
-                      'BMD': {'MESH': 'D015519'}}
+                      'BMD': {'MESH': 'D015519'},
+                      'PTPMeg2': {'HGNC': '9661'},
+                      'alpha4': {'HGNC': '5461'}}
 
 
 incorrect_assertions = {'IGF': {'HGNC': '5464'},

diff --git a/gilda/__init__.py b/gilda/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '0.11.1'
+__version__ = '0.12.0'
 
 import logging
 

diff --git a/gilda/generate_terms.py b/gilda/generate_terms.py
@@ -172,7 +172,7 @@ def generate_chebi_terms():
                         row['COMPOUND_ID'])
             continue
         db = 'CHEBI'
-        name = str(row['NAME'])
+        name = str(row['NAME']).strip()
         chebi_name = \
             chebi_client.get_chebi_name_from_id(chebi_id, offline=True)
         if chebi_name is None:
@@ -593,6 +593,8 @@ def terms_from_obo_json_entry(entry, prefix, ignore_mappings=False,
         if doid_name:
             db, db_id, name = 'DOID', doid, doid_name
 
+    name = name.strip()
+
     # Add a term for the name first
     name_term = Term(
         norm_text=normalize(name),
@@ -624,6 +626,8 @@ def terms_from_obo_json_entry(entry, prefix, ignore_mappings=False,
         if match:
             synonym = match.groups()[0]
 
+        synonym = synonym.strip()
+
         synonym_term = Term(
             norm_text=normalize(synonym),
             text=synonym,
@@ -653,6 +657,31 @@ def _generate_obo_terms(prefix, ignore_mappings=False, map_to_ns=None):
     return terms
 
 
+def generate_entrez_terms():
+    """Return HGNC-grounded synonym Terms from NCBI's human gene_info file."""
+    import pandas as pd
+    df = pd.read_csv('https://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/'
+                     'Homo_sapiens.gene_info.gz', sep='\t',
+                     keep_default_na=False, na_values=['_'])
+    terms = []
+    for _, row in df.iterrows():
+        entrez_id = str(row['GeneID'])
+        hgnc_id = hgnc_client.get_hgnc_from_entrez(entrez_id)
+        if not hgnc_id:  # genes without an HGNC mapping yield no terms
+            continue
+        hgnc_symbol = hgnc_client.get_hgnc_name(hgnc_id)
+        for field in (row['Synonyms'], row['Other_designations']):
+            if not isinstance(field, str) or field == '-':
+                continue  # '-' placeholder or NaN (see na_values): no names
+            for syn in field.split('|'):
+                # Term rejects empty text; parenthesized entries are dropped
+                if syn and not (syn.startswith('(') and syn.endswith(')')):
+                    terms.append(Term(
+                        normalize(syn), syn, 'HGNC', hgnc_id, hgnc_symbol,
+                        'synonym', 'entrez', '9606', 'EGID', entrez_id))
+    return terms
+
+
 def _make_mesh_mappings():
     # Load MeSH ID/label mappings
     from .resources import MESH_MAPPINGS_PATH
@@ -689,6 +718,7 @@ def get_all_terms():
         generate_uniprot_terms(),
         generate_famplex_terms(),
         generate_hgnc_terms(),
+        generate_entrez_terms(),
         generate_chebi_terms(),
         generate_go_terms(),
         generate_mesh_terms(),

diff --git a/gilda/term.py b/gilda/term.py
@@ -47,7 +47,7 @@ class Term(object):
 
     def __init__(self, norm_text, text, db, id, entry_name, status, source,
                  organism=None, source_db=None, source_id=None):
-        if not text:
+        if not text or not isinstance(text, str):
             raise ValueError('Text for Term cannot be empty')
         self.norm_text = norm_text
         self.text = text

diff --git a/gilda/tests/test_api.py b/gilda/tests/test_api.py
@@ -41,7 +41,7 @@ def test_organisms():
     assert matches3[0].term.id == 'P63163'
     # Here we use SMN again but prioritize human and get three bad groundings
     matches4 = ground('SMN', organisms=['9606', '10090'])
-    assert len(matches4) == 2, matches4
+    assert len(matches4) == 3, matches4
     assert all(m.term.organism == '9606' for m in matches4)
     # Finally we try grounding SMN1 with mouse prioritized, don't find a match
     # and end up with the human gene grounding

diff --git a/gilda/tests/test_grounder.py b/gilda/tests/test_grounder.py
@@ -15,13 +15,13 @@ def test_grounder():
             assert entry.id == '6407', entry
 
     scores = gr.ground('kras')
-    assert len(scores) == 1, scores
+    assert len(scores) == 2, scores
     assert appreq(scores[0].score, 0.9845), scores
     scores = gr.ground('k-ras')
-    assert len(scores) == 1, scores
+    assert len(scores) == 2, scores
     assert appreq(scores[0].score, 0.9936), scores
     scores = gr.ground('KRAS')
-    assert len(scores) == 1, scores
+    assert len(scores) == 2, scores
     assert appreq(scores[0].score, 1.0), scores
     scores = gr.ground('bRaf')
     assert len(scores) == 1, scores
@@ -38,7 +38,7 @@ def test_grounder_num_entries():
     entries = gr.lookup('NPM1')
     assert len(entries) == 4, entries
     entries = gr.lookup('H4')
-    assert len(entries) == 7, entries
+    assert len(entries) == 9, entries
 
 
 def test_grounder_depluralize():