|
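"""
Convert a GraphML file into an OWL ontology (Turtle and RDF/XML).

Usage:
    python graphml2ontology.py

Note: edit the input_graphml, ontology_ttl and instances_ttl variables
defined below before running. instances_ttl is currently unused because
the instance-graph export is commented out.
"""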
| 6 | + |
| 7 | +from rdflib import Graph, URIRef, Literal, Namespace, BNode |
| 8 | +from rdflib.namespace import RDF, RDFS, OWL, XSD |
| 9 | +import xml.etree.ElementTree as ET |
| 10 | +import re |
| 11 | +import json |
| 12 | + |
# Source GraphML document that the script reads.
input_graphml = "../data/graphml/create_clustered_graph.graphml"

# Output locations handed to graphml_to_ttl_with_ontology() at the bottom of
# this file: the ontology path first, then the instance-data path.
ontology_ttl = "../data/ontologies/clustered_graph_ontology.ttl"
instances_ttl = "../data/ttl/clustered_graph.ttl"
| 16 | +#text_units_json = "../data/graphml/create_base_text_units.parquet.as.json" |
| 17 | + |
def sanitize_uri(value):
    """Return ``value`` as a string that is safe to use as a URI local name.

    The value is converted with ``str()``, surrounding whitespace is
    stripped, and every character other than an ASCII letter, an ASCII digit
    or ``-`` is replaced by ``_``.
    """
    text = str(value).strip()
    return re.sub(r'[^a-zA-Z0-9-]', '_', text)
| 20 | + |
def extract_title(text):
    """Return the value following the first ``title:`` marker in ``text``.

    The captured value runs up to the first ``.`` + newline, the first
    newline, or the end of the string, and is returned with surrounding
    whitespace stripped. Returns ``None`` when no ``title:`` marker is found.
    """
    found = re.search(r'title:\s*(.*?)(\.\n|\n|$)', text)
    if found is None:
        return None
    return found.group(1).strip()
| 25 | + |
| 26 | +# def load_text_units(json_path): |
| 27 | +# id_to_text = {} |
| 28 | +# with open(json_path, 'r') as f: |
| 29 | +# for line in f: |
| 30 | +# try: |
| 31 | +# unit = json.loads(line) |
| 32 | +# if 'id' in unit and 'text' in unit: |
| 33 | +# id_to_text[unit['id']] = unit['text'] |
| 34 | +# except json.JSONDecodeError as e: |
| 35 | +# print(f"Error decoding JSON: {e}") |
| 36 | +# continue |
| 37 | +# return id_to_text |
| 38 | + |
def graphml_to_ttl_with_ontology(graphml_path, instance_path, ontology_path):
    """Build an OWL ontology from a GraphML file and write it to disk.

    Every GraphML ``<key>`` becomes an ``owl:DatatypeProperty`` with domain
    ``base:Node`` and a range derived from its ``attr.type``. A ``title``
    property is added when the file does not declare one. Nodes whose
    identifying label consists solely of non-digit alphanumeric characters
    are added as ``base:Node`` individuals; all other nodes are skipped.

    The resulting graph is serialized twice: as Turtle to ``ontology_path``
    and as RDF/XML to ``ontology_path + ".owl"``.

    Args:
        graphml_path: Path of the GraphML file to read.
        instance_path: Currently unused. Kept in the signature so existing
            callers keep working (see the NOTE at the end of the body).
        ontology_path: Destination path of the Turtle ontology file.

    Raises:
        KeyError: If a ``<key>`` element lacks ``id``, ``for``, ``attr.name``
            or ``attr.type``, if a ``<node>`` lacks ``id``, or if a
            ``<data>`` element references an undeclared key.
    """
    ontology_g = Graph()

    # Namespaces for ontology terms and for individuals.
    BASE = Namespace("http://example.org/ontology/")
    INST = Namespace("http://example.org/instances/")

    ontology_g.bind("base", BASE)
    ontology_g.bind("owl", OWL)
    ontology_g.bind("inst", INST)

    # Parse GraphML
    root = ET.parse(graphml_path).getroot()
    ns = {'graphml': 'http://graphml.graphdrawing.org/xmlns'}

    # Collect the attribute keys and declare one datatype property per key.
    keys = {}
    for key in root.findall('.//graphml:key', ns):
        attr_name = key.attrib['attr.name']
        attr_type = key.attrib['attr.type']
        keys[key.attrib['id']] = {
            'for': key.attrib['for'],
            'name': attr_name,
            'type': attr_type,
        }

        prop_uri = BASE[attr_name]
        ontology_g.add((prop_uri, RDF.type, OWL.DatatypeProperty))
        ontology_g.add((prop_uri, RDFS.domain, BASE.Node))
        ontology_g.add((prop_uri, RDFS.range, get_xsd_type(attr_type)))

    # Manually add the title property if the GraphML file does not define it.
    if not any(v['name'] == 'title' for v in keys.values()):
        ontology_g.add((BASE.title, RDF.type, OWL.DatatypeProperty))
        ontology_g.add((BASE.title, RDFS.domain, BASE.Node))
        ontology_g.add((BASE.title, RDFS.range, XSD.string))

    # Core class and the relatesTo / relatedBy object properties.
    ontology_g.add((BASE.Node, RDF.type, OWL.Class))
    ontology_g.add((BASE.relatesTo, RDF.type, OWL.ObjectProperty))
    ontology_g.add((BASE.relatesTo, RDFS.domain, BASE.Node))
    ontology_g.add((BASE.relatesTo, RDFS.range, BASE.Node))
    ontology_g.add((BASE.relatedBy, RDF.type, OWL.ObjectProperty))
    ontology_g.add((BASE.relatedBy, OWL.inverseOf, BASE.relatesTo))

    # Process nodes
    for node in root.findall('.//graphml:node', ns):
        data = {}
        for d in node.findall('graphml:data', ns):
            key = keys[d.attrib['key']]
            data[key['name']] = convert_value(d.text, key['type'])

        xml_id = node.attrib['id']
        title = data.get('title')
        human_id = data.get('human_readable_id')

        # Pick the identifier the URI is derived from: title first, then the
        # human readable id, then the XML id. ``is not None`` is used so that
        # a legitimate id of 0 is not mistaken for a missing value, and
        # ``node_id`` is always bound for the current node, so a node without
        # a title can neither crash nor reuse the previous node's label.
        if title is not None:
            node_id = str(title)
            node_uri = INST[sanitize_uri(node_id)]
        elif human_id is not None:
            node_id = str(human_id)
            node_uri = INST[sanitize_uri(node_id)]
        else:
            node_id = xml_id
            node_uri = INST[f"xmlid_{sanitize_uri(xml_id)}"]

        # Keep only labels made up entirely of alphanumeric characters with
        # no digits (so no spaces, punctuation, "_" or "-" either). An empty
        # label is rejected because its URI would be the bare namespace.
        if node_id.isalnum() and not any(ch.isdigit() for ch in node_id):
            ontology_g.add((node_uri, RDF.type, BASE.Node))

    # NOTE: export of instance data (node properties, reified edges and the
    # relatesTo / relatedBy links) is disabled, which is why ``instance_path``
    # is not written. Only the ontology graph is serialized.
    ontology_g.serialize(destination=ontology_path, format="turtle")
    ontology_g.serialize(destination=ontology_path+".owl", format="xml")
| 173 | + |
def convert_value(value, graphml_type):
    """Convert a raw GraphML ``<data>`` value according to its ``attr.type``.

    Args:
        value: Raw text of the ``<data>`` element (may be ``None`` or empty).
        graphml_type: The key's ``attr.type`` (``long``, ``int``, ``double``,
            ``float``, ``boolean``, ``string`` or anything else).

    Returns:
        ``None`` for a missing or empty value; an ``int`` for ``long``/``int``;
        a ``float`` for ``double``/``float``; a ``bool`` for ``boolean``.
        For any other type, or when the value cannot be converted, the
        original value is returned unchanged.
    """
    if not value:
        return None
    try:
        if graphml_type in ('long', 'int'):
            try:
                # Parse directly so integers above 2**53 keep full precision.
                return int(value)
            except ValueError:
                # Fall back for float notations such as "1e3" or "42.0".
                return int(float(value))
        if graphml_type in ('double', 'float'):
            return float(value)
        if graphml_type == 'boolean':
            flag = str(value).strip().lower()
            if flag in ('true', '1'):
                return True
            if flag in ('false', '0'):
                return False
        return value
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers int(float("inf")); keep the raw value instead
        # of aborting the whole conversion.
        return value
| 186 | + |
def get_xsd_type(graphml_type):
    """Map a GraphML ``attr.type`` to the XSD datatype used as property range.

    ``int`` and ``float`` map to the same datatypes as ``long`` and
    ``double``, and ``boolean`` maps to ``xsd:boolean``, instead of falling
    through to ``xsd:string``. Any unrecognised type still maps to
    ``xsd:string``.
    """
    return {
        'string': XSD.string,
        'long': XSD.integer,
        'int': XSD.integer,
        'double': XSD.double,
        'float': XSD.double,
        'boolean': XSD.boolean,
    }.get(graphml_type, XSD.string)
| 194 | + |
# Run the conversion with the paths configured at the top of this file.
graphml_to_ttl_with_ontology(
    graphml_path=input_graphml,
    instance_path=instances_ttl,
    ontology_path=ontology_ttl,
)
0 commit comments