|
14 | 14 | see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md |
15 | 15 | """ |
16 | 16 |
|
| 17 | +from collections import defaultdict |
17 | 18 | import asyncio |
18 | 19 | import logging |
19 | 20 | import os |
| 21 | +import re |
20 | 22 | import sys |
21 | 23 | import typing |
22 | 24 |
|
@@ -1002,7 +1004,7 @@ def extract_rdf ( # pylint: disable=R0914 |
1002 | 1004 | for node_id, node in enumerate(self.nodes.values()): |
1003 | 1005 | if node.kind in [ NodeEnum.ENT, NodeEnum.LEM ]: |
1004 | 1006 | if node.pos not in [ "VERB" ]: |
1005 | | - iri: str = f"{self.iri_base}entity/{node.key.lower().replace(' ', '_').replace('.', '_')}" # pylint: disable=C0301 |
| 1007 | + iri: str = f"{self.iri_base}entity/{node.key.replace(' ', '_').replace('.', '_')}" # pylint: disable=C0301 |
1006 | 1008 | subj: rdflib.URIRef = rdflib.URIRef(iri) |
1007 | 1009 | ref_dict[node_id] = subj |
1008 | 1010 |
|
@@ -1072,3 +1074,125 @@ def extract_rdf ( # pylint: disable=R0914 |
1072 | 1074 | ) |
1073 | 1075 |
|
1074 | 1076 | return n3_str |
| 1077 | + |
| 1078 | + |
def denormalize_iri (
    self,
    uri_ref: "rdflib.term.URIRef",
    ) -> str:
    """
Discern between a parsed entity, whose IRI starts with `self.iri_base`,
and a linked entity, which uses any other IRI.

Only the leading `self.iri_base` prefix, followed by at most one
leading `entity/` path segment, gets stripped. Later occurrences of
either string inside the remainder are preserved.

    uri_ref:
IRI reference for a node in an RDF graph

    returns:
_lemma_key_ for a parsed entity, the full IRI for a linked entity
    """
    uri: str = str(uri_ref)

    if not uri.startswith(self.iri_base):
        # not under our base IRI, so treat as a linked entity
        return uri

    # slice off the prefixes instead of calling `str.replace()`, which
    # would also delete matching substrings further along in the key
    key: str = uri[len(self.iri_base):]
    entity_prefix: str = "entity/"

    if key.startswith(entity_prefix):
        key = key[len(entity_prefix):]

    return key
| 1095 | + |
| 1096 | + |
def load_bootstrap_ttl (  # pylint: disable=R0912,R0914
    self,
    ttl_str: str,
    *,
    debug: bool = False,
    ) -> None:
    """
Parse a TTL string with an RDF semantic graph representation to load
bootstrap definitions for the _lemma graph_ prior to parsing, e.g.,
for synonyms.

Triples are handled by predicate:

  * `skos:prefLabel` sets the label for the subject
  * `skos:definition` sets the description for the subject
  * `rdf:type` sets the type for the subject, and marks the object
    as a referenced node
  * any other predicate becomes an edge, and marks both its subject
    and its object as referenced nodes

Only referenced nodes get constructed. A referenced node which has
no `skos:prefLabel` uses its own key as a label.

    ttl_str:
RDF triples in TTL (Turtle/N3) format

    debug:
debugging flag
    """
    rdf_graph: rdflib.Graph = rdflib.Graph()

    # state the serialization explicitly, rather than depend on the
    # default format used by the installed version of `rdflib`
    rdf_graph.parse(data = ttl_str, format = "turtle")

    rdf_nodes: typing.Dict[ str, dict ] = defaultdict(dict)
    rdf_edges: typing.Set[ tuple ] = set()

    # parse the node data, tally the edges
    for subj, pred, obj in rdf_graph:
        uri: str = self.denormalize_iri(subj)

        if pred == rdflib.SKOS.prefLabel:
            rdf_nodes[uri]["label"] = str(obj)

        elif pred == rdflib.SKOS.definition:
            rdf_nodes[uri]["descrip"] = str(obj)

        elif pred == rdflib.RDF.type:
            # the type IRI is kept in full, not denormalized
            type_iri: str = str(obj)
            rdf_nodes[type_iri]["ref"] = True
            rdf_nodes[uri]["type"] = type_iri

        else:
            # NOTE(review): a literal object of any other predicate
            # also lands here and becomes a node key -- confirm that
            # this is intended
            dst: str = self.denormalize_iri(obj)

            rdf_nodes[uri]["ref"] = True
            rdf_nodes[dst]["ref"] = True

            rdf_edges.add(( str(pred), uri, dst, ))

    # construct the nodes
    for uri, node_dat in rdf_nodes.items():
        if "ref" not in node_dat:
            # never used as an edge endpoint nor as a type
            continue

        if debug:
            ic(uri, node_dat)

        kind: NodeEnum = NodeEnum.ENT

        if re.search(r"http[s]*://", uri) is not None:
            kind = NodeEnum.IRI

        # a referenced node may lack a `skos:prefLabel`, e.g., the
        # object of `rdf:type`, so fall back to its key instead of
        # raising a `KeyError`
        label: str = node_dat.get("label", uri)

        node: Node = self.make_node(
            [],
            uri,
            None,
            kind,
            0,
            0,
            0,
            label = label,
            length = len(label.split(" ")),
        )

        # bootstrap definitions have no occurrences in a source text
        node.count = 0
        node.loc = []

        if "type" in node_dat:
            node.pos = node_dat["type"]

        if "descrip" in node_dat:
            node.text = node_dat["descrip"]

        if debug:
            ic(node)

    # construct the edges
    # the node list is only needed to report positions while debugging
    node_list: typing.List[ Node ] = list(self.nodes.values()) if debug else []

    for rel, src, dst in rdf_edges:
        # assumes `make_node()` has registered each node in
        # `self.nodes` under the key passed above -- TODO confirm
        src_node: Node = self.nodes[src]
        dst_node: Node = self.nodes[dst]

        if debug:
            print(rel, node_list.index(src_node), node_list.index(dst_node))

        edge: Edge = self.make_edge(  # type: ignore
            src_node,
            dst_node,
            RelEnum.IRI,
            rel,
            1.0,
            debug = debug,
        )

        if debug:
            ic(edge)
0 commit comments