Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 19f0d27

Browse files
committed
fix errors in deser, vis, and RDF extract
1 parent 7f93c69 commit 19f0d27

8 files changed

Lines changed: 371 additions & 322 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
*~
33
chromedriver
44
lemma.json
5+
lemma.rdf
56
examples/tmp.*.html
7+
paper.md
68
vis.html
79
gor.html
810
txg.tgz

NOTES.md

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,30 @@
11
TODO:
22

33
* load RDF, to bootstrap/iterate analysis
4+
- use `skos:broader` for structural representation of synonyms
45

56
* download ZIP: KuzuDB node-link
6-
https://neo4j.com/docs/getting-started/data-import/csv-import/
7+
- https://neo4j.com/docs/getting-started/data-import/csv-import/
78

8-
* extend GOR to replicate NodePiece/ULTRA ?
9+
* link entities for lemmas, noun chunks using MediaWiki lookups?
10+
- apply default semantics: `skos:related`
911

1012

11-
* reify GOR, then use FastRP to generate embeddings?
12-
https://github.com/Knorreman/fastRP
13+
* extend GOR to replicate NodePiece/ULTRA ?
1314

15+
* reify GOR, then use FastRP to generate embeddings?
16+
- https://github.com/Knorreman/fastRP
1417

15-
* link entities for lemmas, noun chunks using MediaWiki lookups?
16-
* default semantics: `skos:related`
1718

1819
* also eval community detection to condense nodes using k-medoids?
19-
https://medium.com/neo4j/clustering-graph-data-with-k-medoids-3b6a67ea0873
20+
- https://medium.com/neo4j/clustering-graph-data-with-k-medoids-3b6a67ea0873
2021

2122
* add conda packaging
22-
https://conda.github.io/grayskull/
23+
- https://conda.github.io/grayskull/
2324

2425

2526
* SPARQL the DBPedia/Wikidata equivs
2627

27-
* are multiple relations missing in the lemma graph?
28-
2928
* check out https://github.com/wikipedia2vec/wikipedia2vec
3029

3130
* link `sense2vec` synonyms; make affordances for UI to annotate synonyms

examples/ex0_0.ipynb

Lines changed: 267 additions & 257 deletions
Large diffs are not rendered by default.

tests/test_load.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121

2222

2323
def test_load_minimal (
24+
*,
25+
debug: bool = False,
2426
) -> None:
2527
"""
2628
Construct a _lemma graph_ from a minimal example, then compare
@@ -49,14 +51,20 @@ def test_load_minimal (
4951

5052
obs_graph: dict = json.loads(tg.dump_lemma_graph())
5153

54+
if debug:
55+
print(obs_graph)
56+
5257
# compare
5358
diff: deepdiff.diff.DeepDiff = deepdiff.DeepDiff(exp_graph, obs_graph)
5459

60+
if debug:
61+
print(diff)
62+
5563
if len(diff) > 0:
5664
print(json.dumps(json.loads(diff.to_json()), indent = 2))
5765

5866
assert len(diff) == 0
5967

6068

6169
if __name__ == "__main__":
62-
test_load_minimal()
70+
test_load_minimal(debug = True)

textgraphs/doc.py

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -261,20 +261,20 @@ def _make_class_link (
261261

262262
else:
263263
# find class IRI metadata
264-
class_meta: typing.List[ str ] = [
265-
meta["definition"]
264+
class_meta: typing.List[typing.Dict[ str, str ]] = [
265+
meta
266266
for meta in pipe.kg.NER_MAP.values()
267267
if meta["iri"] == node.label
268268
]
269269

270270
dst = Node(
271271
len(self.nodes),
272272
node.label, # type: ignore
273-
class_meta[0],
273+
class_meta[0]["definition"],
274274
str(rdflib.RDF.type),
275275
NodeEnum.IRI,
276-
label = node.label,
277-
length = node.length,
276+
label = class_meta[0]["label"],
277+
length = len(class_meta[0]["label"].split(" ")),
278278
count = 1,
279279
)
280280

@@ -1001,37 +1001,42 @@ def extract_rdf ( # pylint: disable=R0914
10011001
# extract entities as RDF
10021002
for node_id, node in enumerate(self.nodes.values()):
10031003
if node.kind in [ NodeEnum.ENT, NodeEnum.LEM ]:
1004-
iri: str = f"{self.iri_base}entity/{node.key.lower().replace(' ', '_').replace('.', '_')}" # pylint: disable=C0301
1005-
subj: rdflib.URIRef = rdflib.URIRef(iri)
1006-
ref_dict[node_id] = subj
1007-
1008-
rdf_graph.add((
1009-
subj,
1010-
rdflib.SKOS.prefLabel,
1011-
rdflib.Literal(node.text, lang = lang),
1012-
))
1013-
1014-
if node.kind == NodeEnum.ENT and node.annotated:
1015-
cls_obj: rdflib.URIRef = rdflib.URIRef(node.label)
1016-
cls_id: int = node_keys.index(node.label) # type: ignore
1017-
ref_dict[cls_id] = cls_obj
1004+
if node.pos not in [ "VERB" ]:
1005+
iri: str = f"{self.iri_base}entity/{node.key.lower().replace(' ', '_').replace('.', '_')}" # pylint: disable=C0301
1006+
subj: rdflib.URIRef = rdflib.URIRef(iri)
1007+
ref_dict[node_id] = subj
10181008

10191009
rdf_graph.add((
10201010
subj,
1021-
rdflib.RDF.type,
1022-
cls_obj,
1011+
rdflib.SKOS.prefLabel,
1012+
rdflib.Literal(node.text, lang = lang),
10231013
))
10241014

1015+
if node.kind == NodeEnum.ENT and node.annotated:
1016+
cls_obj: rdflib.URIRef = rdflib.URIRef(node.label)
1017+
cls_id: int = node_keys.index(node.label) # type: ignore
1018+
ref_dict[cls_id] = cls_obj
1019+
1020+
rdf_graph.add((
1021+
subj,
1022+
rdflib.RDF.type,
1023+
cls_obj,
1024+
))
1025+
10251026
elif node.kind == NodeEnum.IRI:
10261027
subj = rdflib.URIRef(node.key)
10271028
ref_dict[node_id] = subj
10281029

1029-
desc = rdflib.Literal(node.text, lang = "en")
1030-
10311030
rdf_graph.add((
10321031
subj,
10331032
rdflib.SKOS.prefLabel,
1034-
desc,
1033+
rdflib.Literal(node.label, lang = lang),
1034+
))
1035+
1036+
rdf_graph.add((
1037+
subj,
1038+
rdflib.SKOS.definition,
1039+
rdflib.Literal(node.text, lang = lang),
10351040
))
10361041

10371042
# extract relations as RDF

textgraphs/graph.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -119,21 +119,21 @@ def make_node ( # pylint: disable=R0913,R0914
119119

120120
if not linked:
121121
# construct a placeholder node (stopwords)
122+
# NB: omit locations
122123
self.nodes[key] = Node(
123124
len(self.nodes),
124125
key,
125126
span.text,
126127
span.pos_,
127128
kind,
128129
span = span,
129-
loc = [ location ],
130130
length = length,
131131
)
132132

133133
elif key in self.nodes:
134134
# link to previously constructed entity node
135-
self.nodes[key].loc.append(location)
136135
self.nodes[key].count += 1
136+
self.nodes[key].loc.append(location)
137137

138138
# construct a new node for entity or lemma
139139
else:
@@ -247,14 +247,21 @@ def dump_lemma_graph (
247247
nx_node = self.lemma_graph.nodes[node.node_id]
248248
nx_node["name"] = node.text
249249
nx_node["kind"] = str(node.kind)
250-
nx_node["iri"] = node.label
251250
nx_node["subobj"] = node.sub_obj
252251
nx_node["pos"] = node.pos
253252
nx_node["loc"] = str(node.loc)
254253
nx_node["length"] = node.length
255254
nx_node["hood"] = node.neighbors
256255
nx_node["anno"] = node.annotated
257256

257+
# juggle the serialized IRIs
258+
if node.kind in [ NodeEnum.IRI ]:
259+
nx_node["iri"] = node.key
260+
elif node.label is not None and node.label.startswith("http"):
261+
nx_node["iri"] = node.label
262+
else:
263+
nx_node["iri"] = None
264+
258265
# emulate a node-link format serialization, using the
259266
# default `NetworkX.node_link_data()` property names
260267
edge_list: typing.List[ dict ] = []
@@ -307,15 +314,10 @@ def load_lemma_graph ( # pylint: disable=R0914
307314
if debug:
308315
ic(nx_node)
309316

310-
label: typing.Optional[ str ] = None
311317
kind: NodeEnum = NodeEnum.decode(nx_node["kind"]) # type: ignore
318+
label: typing.Optional[ str ] = nx_node["label"]
312319

313-
if kind in [ NodeEnum.ENT ]:
314-
if nx_node["iri"] is not None:
315-
label = nx_node["iri"]
316-
else:
317-
label = nx_node["label"]
318-
elif kind in [ NodeEnum.IRI ]:
320+
if kind in [ NodeEnum.ENT ] and nx_node["iri"] is not None:
319321
label = nx_node["iri"]
320322

321323
node: Node = self.make_node(

textgraphs/kg.py

Lines changed: 38 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -52,75 +52,93 @@ class KGWikiMedia (KnowledgeGraph): # pylint: disable=R0902,R0903
5252
NER_MAP: typing.Dict[ str, dict ] = OrderedDict({
5353
"CARDINAL": {
5454
"iri": "http://dbpedia.org/resource/Cardinal_number",
55-
"definition": "Numerals that do not fall under another type"
55+
"definition": "Numerals that do not fall under another type",
56+
"label": "cardinal number",
5657
},
5758
"DATE": {
5859
"iri": "http://dbpedia.org/ontology/date",
59-
"definition": "Absolute or relative dates or periods"
60+
"definition": "Absolute or relative dates or periods",
61+
"label": "date",
6062
},
6163
"EVENT": {
6264
"iri": "http://dbpedia.org/ontology/Event",
63-
"definition": "Named hurricanes, battles, wars, sports events, etc."
65+
"definition": "Named hurricanes, battles, wars, sports events, etc.",
66+
"label": "event",
6467
},
6568
"FAC": {
6669
"iri": "http://dbpedia.org/ontology/Infrastructure",
67-
"definition": "Buildings, airports, highways, bridges, etc."
70+
"definition": "Buildings, airports, highways, bridges, etc.",
71+
"label": "infrastructure",
6872
},
6973
"GPE": {
7074
"iri": "http://dbpedia.org/ontology/Country",
71-
"definition": "Countries, cities, states"
75+
"definition": "Countries, cities, states",
76+
"label": "country",
7277
},
7378
"LANGUAGE": {
7479
"iri": "http://dbpedia.org/ontology/Language",
75-
"definition": "Any named language"
80+
"definition": "Any named language",
81+
"label": "language",
7682
},
7783
"LAW": {
7884
"iri": "http://dbpedia.org/ontology/Law",
79-
"definition": "Named documents made into laws "
85+
"definition": "Named documents made into laws",
86+
"label": "law",
8087
},
8188
"LOC": {
8289
"iri": "http://dbpedia.org/ontology/Place",
83-
"definition": "Non-GPE locations, mountain ranges, bodies of water"
90+
"definition": "Non-GPE locations, mountain ranges, bodies of water",
91+
"label": "place",
8492
},
8593
"MONEY": {
8694
"iri": "http://dbpedia.org/resource/Money",
87-
"definition": "Monetary values, including unit"
95+
"definition": "Monetary values, including unit",
96+
"label": "money",
8897
},
8998
"NORP": {
9099
"iri": "http://dbpedia.org/ontology/nationality",
91-
"definition": "Nationalities or religious or political groups"
100+
"definition": "Nationalities or religious or political groups",
101+
"label": "nationality",
92102
},
93103
"ORDINAL": {
94104
"iri": "http://dbpedia.org/resource/Ordinal_number",
95-
"definition": "Ordinal number, i.e., first, second, etc."
105+
"definition": "Ordinal number, i.e., first, second, etc.",
106+
"label": "ordinal number",
96107
},
97108
"ORG": {
98109
"iri": "http://dbpedia.org/ontology/Organisation",
99-
"definition": "Companies, agencies, institutions, etc."
110+
"definition": "Companies, agencies, institutions, etc.",
111+
"label": "organization",
100112
},
101113
"PERCENT": {
102114
"iri": "http://dbpedia.org/resource/Percentage",
103-
"definition": "Percentage"
115+
"definition": "Percentage",
116+
"label": "percentage",
104117
},
105118
"PERSON": {
106119
"iri": "http://dbpedia.org/ontology/Person",
107-
"definition": "People, including fictional"
120+
"definition": "People, including fictional",
121+
"label": "person",
108122
},
109123
"PRODUCT": {
110124
"iri": "http://dbpedia.org/ontology/product",
111-
"definition": "Vehicles, weapons, foods, etc. (Not services)"
125+
"definition": "Vehicles, weapons, foods, etc. (Not services)",
126+
"label": "product",
112127
},
113128
"QUANTITY": {
114129
"iri": "http://dbpedia.org/resource/Quantity",
115-
"definition": "Measurements, as of weight or distance"
130+
"definition": "Measurements, as of weight or distance",
131+
"label": "quantity",
116132
},
117133
"TIME": {
118134
"iri": "http://dbpedia.org/ontology/time",
119-
"definition": "Times smaller than a day"
135+
"definition": "Times smaller than a day",
136+
"label": "time",
120137
},
121138
"WORK OF ART": {
122139
"iri": "http://dbpedia.org/resource/Work_of_art",
123-
"definition": "Titles of books, songs, etc."
140+
"definition": "Titles of books, songs, etc.",
141+
"label": "work of art",
124142
},
125143
})
126144

@@ -252,7 +270,7 @@ def normalize_prefix (
252270
debug: bool = False,
253271
) -> str:
254272
"""
255-
Normalize the given IRI to use the standard DBPedia namespace prefixes.
273+
Normalize the given IRI using the standard DBPedia namespace prefixes.
256274
257275
iri:
258276
input IRI, in fully-qualified domain representation
@@ -1003,7 +1021,7 @@ def _make_link (
10031021
rel,
10041022
NodeEnum.IRI,
10051023
span = link.span,
1006-
label = link.iri,
1024+
label = link.kg_ent.label, # type: ignore
10071025
length = link.length,
10081026
count = 1,
10091027
)

textgraphs/vis.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ def render_lemma_graph (
112112
nx_node["shape"] = NODE_STYLES[node.kind].shape
113113
nx_node["color"] = NODE_STYLES[node.kind].color
114114

115+
nx_node["kind"] = str(node.kind)
115116
nx_node["size"] = nx_node["count"]
116117

117118
if node.kind in [ NodeEnum.IRI ]:
@@ -137,6 +138,10 @@ def render_lemma_graph (
137138
pv_graph: pyvis.network.Network = pyvis.network.Network()
138139
pv_graph.from_nx(self.graph.lemma_graph)
139140

141+
for pv_node in pv_graph.nodes:
142+
if pv_node["kind"] == str(NodeEnum.IRI):
143+
pv_node["label"] = self.kg.normalize_prefix(pv_node["lemma"])
144+
140145
for pv_edge in pv_graph.get_edges():
141146
edge_key = ( pv_edge["from"], pv_edge["to"], )
142147
edge_info = edge_labels.get(edge_key)

0 commit comments

Comments
 (0)