Skip to content

Commit 87c5310

Browse files
committed
Merge branch 'main' of https://github.com/microsoft/e8c22
2 parents 32636a5 + 1170f28 commit 87c5310

File tree

3 files changed

+566
-0
lines changed

3 files changed

+566
-0
lines changed

util/graphml2ontology.py

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
"""
2+
Usage:
3+
python graphml2ontology.py
4+
Note: set the input_graphml, ontology_ttl and instances_ttl variables in this file before running.
5+
"""
6+
7+
from rdflib import Graph, URIRef, Literal, Namespace, BNode
8+
from rdflib.namespace import RDF, RDFS, OWL, XSD
9+
import xml.etree.ElementTree as ET
10+
import re
11+
import json
12+
13+
# Output locations passed to the converter at the bottom of this script.
ontology_ttl = "../data/ontologies/clustered_graph_ontology.ttl"
instances_ttl = "../data/ttl/clustered_graph.ttl"
# GraphML document to read.
input_graphml = "../data/graphml/create_clustered_graph.graphml"
16+
#text_units_json = "../data/graphml/create_base_text_units.parquet.as.json"
17+
18+
def sanitize_uri(value):
    """Return *value* as a string usable as a URI local name.

    The value is stringified and stripped of surrounding whitespace; every
    character other than an ASCII letter, digit or hyphen is then replaced
    with an underscore.
    """
    text = str(value).strip()
    return re.sub(r'[^a-zA-Z0-9-]', '_', text)
20+
21+
def extract_title(text):
    """Return the text that follows the first ``title:`` marker, or None.

    The captured title ends at the first period-plus-newline, newline, or end
    of the input, and is returned with surrounding whitespace removed.
    """
    found = re.search(r'title:\s*(.*?)(\.\n|\n|$)', text)
    if found is None:
        return None
    return found.group(1).strip()
25+
26+
# def load_text_units(json_path):
27+
# id_to_text = {}
28+
# with open(json_path, 'r') as f:
29+
# for line in f:
30+
# try:
31+
# unit = json.loads(line)
32+
# if 'id' in unit and 'text' in unit:
33+
# id_to_text[unit['id']] = unit['text']
34+
# except json.JSONDecodeError as e:
35+
# print(f"Error decoding JSON: {e}")
36+
# continue
37+
# return id_to_text
38+
39+
def graphml_to_ttl_with_ontology(graphml_path, instance_path, ontology_path):
    """Build an OWL ontology from a GraphML file and write it to disk.

    Every GraphML ``<key>`` becomes an ``owl:DatatypeProperty`` with domain
    ``base:Node``. Each node whose label consists only of alphanumeric
    characters and contains no digit is added as a ``base:Node`` member. The
    result is serialized as Turtle to *ontology_path* and as RDF/XML to
    ``ontology_path + ".owl"``.

    Args:
        graphml_path: Path of the GraphML file to read.
        instance_path: Unused; instance-graph output is disabled in this
            script. Kept so the call signature stays stable.
        ontology_path: Destination of the Turtle ontology file.

    Raises:
        KeyError: If a ``<key>`` lacks ``id``, ``for``, ``attr.name`` or
            ``attr.type``, or a node ``<data>`` references an undeclared key.
    """
    # NOTE: instance-graph generation (node properties, reified edges and the
    # text-unit "sourceFiles" titles) is intentionally disabled here; only the
    # ontology graph is produced.
    ontology_g = Graph()

    # Define namespaces
    BASE = Namespace("http://example.org/ontology/")
    INST = Namespace("http://example.org/instances/")

    ontology_g.bind("base", BASE)
    ontology_g.bind("owl", OWL)
    ontology_g.bind("inst", INST)

    # Parse GraphML
    root = ET.parse(graphml_path).getroot()
    ns = {'graphml': 'http://graphml.graphdrawing.org/xmlns'}

    # Extract attribute keys and declare one datatype property per key
    keys = {}
    for key_el in root.findall('.//graphml:key', ns):
        key_id = key_el.attrib['id']
        attr_for = key_el.attrib['for']
        attr_name = key_el.attrib['attr.name']
        attr_type = key_el.attrib['attr.type']
        keys[key_id] = {'for': attr_for, 'name': attr_name, 'type': attr_type}

        prop_uri = BASE[attr_name]
        ontology_g.add((prop_uri, RDF.type, OWL.DatatypeProperty))
        ontology_g.add((prop_uri, RDFS.domain, BASE.Node))
        ontology_g.add((prop_uri, RDFS.range, get_xsd_type(attr_type)))

    # Manually add title property if not defined in GraphML
    if not any(v['name'] == 'title' for v in keys.values()):
        ontology_g.add((BASE.title, RDF.type, OWL.DatatypeProperty))
        ontology_g.add((BASE.title, RDFS.domain, BASE.Node))
        ontology_g.add((BASE.title, RDFS.range, XSD.string))

    # Core class and the relatesTo / relatedBy object properties
    ontology_g.add((BASE.Node, RDF.type, OWL.Class))
    ontology_g.add((BASE.relatesTo, RDF.type, OWL.ObjectProperty))
    ontology_g.add((BASE.relatesTo, RDFS.domain, BASE.Node))
    ontology_g.add((BASE.relatesTo, RDFS.range, BASE.Node))
    ontology_g.add((BASE.relatedBy, RDF.type, OWL.ObjectProperty))
    ontology_g.add((BASE.relatedBy, OWL.inverseOf, BASE.relatesTo))

    # Process nodes
    for node in root.findall('.//graphml:node', ns):
        data = {}
        for d in node.findall('graphml:data', ns):
            key_info = keys[d.attrib['key']]
            data[key_info['name']] = convert_value(d.text, key_info['type'])

        xml_id = node.attrib['id']
        title = data.get('title')
        human_id = data.get('human_readable_id')

        # URI priority: title > human_readable_id > XML id. node_label is
        # always bound for the current node, so the filter below can never see
        # a missing or stale value from a previous iteration.
        if title is not None:
            node_label = str(title)
            node_uri = INST[sanitize_uri(node_label)]
        elif human_id:
            node_label = str(human_id)
            node_uri = INST[sanitize_uri(human_id)]
        else:
            node_label = str(xml_id)
            node_uri = INST[f"xmlid_{sanitize_uri(xml_id)}"]

        # Keep only labels made purely of alphanumeric characters with no
        # digit. (An all-alphanumeric label cannot start with "_" or "-", so
        # no separate prefix check is needed.)
        only_alnum = all(ch.isalnum() for ch in node_label)
        has_digit = any(ch.isdigit() for ch in node_label)
        if only_alnum and not has_digit:
            ontology_g.add((node_uri, RDF.type, BASE.Node))

    # Serialize
    ontology_g.serialize(destination=ontology_path, format="turtle")
    ontology_g.serialize(destination=ontology_path+".owl", format="xml")
173+
174+
def convert_value(value, graphml_type):
    """Convert a GraphML ``<data>`` text value to a Python value.

    Args:
        value: Raw element text; may be None or empty.
        graphml_type: The key's ``attr.type`` (e.g. 'long', 'double', 'string').

    Returns:
        None for missing or empty text; an int for 'long'; a float for
        'double'; otherwise the value unchanged. Text that cannot be parsed as
        the declared numeric type is returned unchanged.
    """
    if not value:
        return None
    try:
        if graphml_type == 'long':
            try:
                # Parse integers directly so large values keep full precision.
                return int(value)
            except ValueError:
                # Fall back to float parsing for forms such as "1e3" or "42.0".
                return int(float(value))
        if graphml_type == 'double':
            return float(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers int(float("inf")) and similar out-of-range text.
        pass
    return value
186+
187+
def get_xsd_type(graphml_type):
    """Return the XSD datatype for a GraphML ``attr.type`` name.

    'long' maps to xsd:integer and 'double' to xsd:double; 'string' and any
    unrecognised type name map to xsd:string.
    """
    xsd_by_graphml_type = {
        'string': XSD.string,
        'long': XSD.integer,
        'double': XSD.double,
    }
    try:
        return xsd_by_graphml_type[graphml_type]
    except KeyError:
        return XSD.string
194+
195+
# Usage: run the conversion with the module-level paths defined above.
# NOTE(review): this executes on import as well as when run as a script;
# consider an `if __name__ == "__main__":` guard.
graphml_to_ttl_with_ontology(
    graphml_path=input_graphml,
    instance_path=instances_ttl,
    ontology_path=ontology_ttl,
)

util/graphml2ttl.py

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
"""
2+
Usage:
3+
4+
python graphml2ttl.py
5+
Note: set the input_graphml, ontology_ttl and instances_ttl variables in this file before running.
6+
7+
"""
8+
9+
# A graphml ontology extractor
10+
# Aleksey Savateyev, Microsoft, 2025
11+
12+
from rdflib import Graph, URIRef, Literal, Namespace, BNode
13+
from rdflib.namespace import RDF, RDFS, OWL, XSD
14+
import xml.etree.ElementTree as ET
15+
import re
16+
17+
# Path to the output ontology TTL file
ontology_ttl = "../data/ontologies/clustered_graph_ontology.ttl"
# Path to the output RDF Turtle file
instances_ttl = "../data/ttl/clustered_graph.ttl"
# Path to the input GraphML file
input_graphml = "../data/graphml/create_clustered_graph.graphml"
20+
21+
def sanitize_uri(value):
    """Convert a value to a URI-safe string.

    The value is stringified and trimmed, then each character that is not an
    ASCII letter, digit or hyphen is replaced by an underscore.
    """
    trimmed = str(value).strip()
    unsafe_chars = r'[^a-zA-Z0-9-]'
    return re.sub(unsafe_chars, '_', trimmed)
24+
25+
def graphml_to_ttl_with_ontology(graphml_path, instance_path, ontology_path):
    """Convert a GraphML file into an RDF instance graph and an OWL ontology.

    GraphML ``<key>`` declarations (except those named "description") become
    ``owl:DatatypeProperty`` definitions. Nodes become ``base:Node``
    individuals carrying their data values as typed literals. Each edge
    becomes a reified ``rdf:Statement`` holding the edge data, plus direct
    ``base:relatesTo`` / ``base:relatedBy`` triples.

    Args:
        graphml_path: Path of the GraphML file to read.
        instance_path: Destination of the Turtle instance graph.
        ontology_path: Destination of the Turtle ontology; an RDF/XML copy is
            written to ``ontology_path + ".owl"``.

    Raises:
        KeyError: If a ``<key>`` lacks a required attribute or a ``<data>``
            element references a key that was never declared.
    """
    import warnings

    # Initialize graphs
    instance_g = Graph()
    ontology_g = Graph()

    # Define namespaces
    BASE = Namespace("http://example.org/ontology/")
    INST = Namespace("http://example.org/instances/")
    ontology_g.bind("base", BASE)
    ontology_g.bind("owl", OWL)
    ontology_g.bind("inst", INST)
    instance_g.bind("inst", INST)
    instance_g.bind("base", BASE)

    # Parse GraphML
    root = ET.parse(graphml_path).getroot()
    ns = {'graphml': 'http://graphml.graphdrawing.org/xmlns'}

    # Extract attribute keys and build ontology
    keys = {}
    # Ids of the "description" keys, remembered so their <data> elements can
    # be skipped by id on both nodes and edges instead of assuming a fixed id.
    skipped_key_ids = set()
    for key_el in root.findall('.//graphml:key', ns):
        attr_name = key_el.attrib['attr.name']
        key_id = key_el.attrib['id']
        if attr_name == 'description':
            skipped_key_ids.add(key_id)
            continue
        attr_for = key_el.attrib['for']
        attr_type = key_el.attrib['attr.type']
        keys[key_id] = {'for': attr_for, 'name': attr_name, 'type': attr_type}

        # Create datatype properties in ontology with correct types
        prop_uri = BASE[attr_name]
        ontology_g.add((prop_uri, RDF.type, OWL.DatatypeProperty))
        if attr_for == 'node':
            ontology_g.add((prop_uri, RDFS.domain, BASE.Node))
        else:
            # NOTE(review): the domain here is the relatesTo property, while
            # edge data is attached to rdf:Statement nodes below - confirm
            # this is the intended modelling.
            ontology_g.add((prop_uri, RDFS.domain, BASE.relatesTo))
        # Add range based on GraphML type
        ontology_g.add((prop_uri, RDFS.range, get_xsd_type(attr_type)))

    # Create ontology classes and single relationship
    ontology_g.add((BASE.Node, RDF.type, OWL.Class))

    ontology_g.add((BASE.relatesTo, RDF.type, OWL.ObjectProperty))
    ontology_g.add((BASE.relatesTo, RDFS.domain, BASE.Node))
    ontology_g.add((BASE.relatesTo, RDFS.range, BASE.Node))
    # Add relatedBy as the inverse of relatesTo
    ontology_g.add((BASE.relatedBy, RDF.type, OWL.ObjectProperty))
    ontology_g.add((BASE.relatedBy, OWL.inverseOf, BASE.relatesTo))
    ontology_g.add((BASE.relatedBy, RDFS.domain, BASE.Node))
    ontology_g.add((BASE.relatedBy, RDFS.range, BASE.Node))

    # Maps GraphML node ids to the URIs minted for them
    node_map = {}

    # Process nodes
    for node in root.findall('.//graphml:node', ns):
        data = {}
        data_types = {}  # attribute name -> GraphML type of the key that supplied it
        for d in node.findall('graphml:data', ns):
            key_id = d.attrib['key']
            if key_id in skipped_key_ids:
                continue
            key_info = keys[key_id]
            data[key_info['name']] = convert_value(d.text, key_info['type'])
            data_types[key_info['name']] = key_info['type']

        # Priority: title > human_readable_id > node ID > XML ID
        title = data.get('title')
        human_id = data.get('human_readable_id')
        node_id = data.get('id')
        if title:
            node_uri = INST[sanitize_uri(title)]
        elif human_id:
            node_uri = INST[f"id_{sanitize_uri(human_id)}"]
        elif node_id:
            node_uri = INST[f"id_{sanitize_uri(node_id)}"]
        else:
            node_uri = INST[f"xmlid_{sanitize_uri(node.attrib['id'])}"]

        node_map[node.attrib['id']] = node_uri
        instance_g.add((node_uri, RDF.type, BASE.Node))
        ontology_g.add((node_uri, RDF.type, BASE.Node))

        for attr, value in data.items():
            if value is None or attr in ('description', 'title'):
                continue
            pred = BASE[attr]
            if attr == 'text_unit_ids':
                # One literal per id; str() guards against a non-string key type.
                for item in str(value).split(', '):
                    instance_g.add((node_uri, pred, Literal(item.strip())))
            else:
                # Use the type of the key that actually supplied this value,
                # not the first key (possibly an edge key) sharing its name.
                datatype = get_xsd_type(data_types[attr])
                instance_g.add((node_uri, pred, Literal(value, datatype=datatype)))

    # Process edges
    for edge in root.findall('.//graphml:edge', ns):
        source_id = edge.attrib['source']
        target_id = edge.attrib['target']
        source = node_map.get(source_id)
        target = node_map.get(target_id)
        if source is None or target is None:
            # An endpoint that is not a known node would put None into a triple.
            warnings.warn(
                f"Skipping edge {source_id!r} -> {target_id!r}: unknown endpoint"
            )
            continue

        # Create reified statement for edge properties
        statement = BNode()
        instance_g.add((statement, RDF.type, RDF.Statement))
        instance_g.add((statement, RDF.subject, source))
        instance_g.add((statement, RDF.predicate, BASE.relatesTo))
        instance_g.add((statement, RDF.object, target))

        # Add edge properties to the statement
        for d in edge.findall('graphml:data', ns):
            key_id = d.attrib['key']
            if key_id in skipped_key_ids:
                continue
            key_info = keys[key_id]
            value = convert_value(d.text, key_info['type'])
            if value is None:
                # Skip empty values, matching the node handling, rather than
                # passing None to Literal.
                continue
            instance_g.add((statement, BASE[key_info['name']], Literal(value)))

        # Add direct relationships
        instance_g.add((source, BASE.relatesTo, target))
        instance_g.add((target, BASE.relatedBy, source))

    # Serialize
    instance_g.serialize(destination=instance_path, format="turtle")
    ontology_g.serialize(destination=ontology_path, format="turtle")
    ontology_g.serialize(destination=ontology_path+".owl", format="xml")
143+
144+
def convert_value(value, graphml_type):
    """Convert a GraphML ``<data>`` text value to a Python value.

    Args:
        value: Raw element text; may be None or empty.
        graphml_type: The key's ``attr.type`` (e.g. 'long', 'double', 'string').

    Returns:
        None for missing or empty text; an int for 'long'; a float for
        'double'; otherwise the value unchanged. Text that cannot be parsed as
        the declared numeric type is returned unchanged.
    """
    if not value:
        return None
    try:
        if graphml_type == 'long':
            try:
                # Parse integers directly so large values keep full precision.
                return int(value)
            except ValueError:
                # Fall back to float parsing for forms such as "1e3" or "42.0".
                return int(float(value))
        if graphml_type == 'double':
            return float(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers int(float("inf")) and similar out-of-range text.
        pass
    return value
156+
157+
def get_xsd_type(graphml_type):
    """Return the XSD datatype for a GraphML ``attr.type`` name.

    'long' maps to xsd:integer and 'double' to xsd:double; 'string' and any
    unrecognised type name map to xsd:string.
    """
    xsd_by_graphml_type = {
        'string': XSD.string,
        'long': XSD.integer,
        'double': XSD.double,
    }
    try:
        return xsd_by_graphml_type[graphml_type]
    except KeyError:
        return XSD.string
164+
165+
# Usage: run the conversion with the module-level paths defined above.
# NOTE(review): this executes on import as well as when run as a script;
# consider an `if __name__ == "__main__":` guard.
graphml_to_ttl_with_ontology(
    graphml_path=input_graphml,
    instance_path=instances_ttl,
    ontology_path=ontology_ttl,
)

0 commit comments

Comments
 (0)