Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
551 changes: 551 additions & 0 deletions scripts/generate_cco_iris.log

Large diffs are not rendered by default.

224 changes: 224 additions & 0 deletions scripts/generate_cco_iris.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
#!/usr/bin/env python3
"""
generate_cco_iris.py — Regenerates per-IRI Turtle files for CCO IRI resolution.

Cloudflare redirects requests for https://www.commoncoreontologies.org/ont0000XXXX
to src/cco-iris/ont0000XXXX.ttl on the develop branch. This script regenerates
all of those files from the current merged ontology using ROBOT (OWL API), so the
output format is identical to the hand-generated originals.

Each output file is produced by:
robot filter --input merged.ttl --term <IRI> --select self --axioms all --trim false
annotate --ontology-iri <IRI>.ttl --output <IRI>.ttl

Files are generated in parallel (default: 8 workers) to keep runtime under ~3 min.

Usage:
python3 scripts/generate_cco_iris.py \\
--input src/cco-merged/CommonCoreOntologiesMerged.ttl \\
--output src/cco-iris \\
--robot-jar build/lib/robot.jar

# Tune parallelism (default 8):
python3 scripts/generate_cco_iris.py ... --workers 16
"""

import argparse
import concurrent.futures
import os
import re
import subprocess
import sys

from rdflib import Graph, URIRef

IRI_PATTERN = re.compile(r"^https://www\.commoncoreontologies\.org/ont(\d{8})$")


def is_cco_iri(uri: str) -> bool:
    """Return True when *uri* is exactly a CCO term IRI.

    A CCO term IRI is ``https://www.commoncoreontologies.org/ont`` followed by
    exactly eight digits and nothing else.

    Args:
        uri: Candidate IRI as a plain string.

    Returns:
        True if the whole string matches IRI_PATTERN, False otherwise.
    """
    # fullmatch() rather than match(): with match() the pattern's trailing "$"
    # also succeeds just before a final "\n", so "…/ont00000001\n" would be
    # accepted and later turned into a file name containing a newline.
    return IRI_PATTERN.fullmatch(uri) is not None


# The original hand-generated IRI files were produced by OWL API 4.5.29; ROBOT
# 1.8.4 emits a different version (4.5.6). We rewrite the generated tag to this
# canonical version so a generator-version difference alone never registers as a
# change — such files end up byte-identical to the originals (no churn, nothing
# to report).
OWLAPI_CANONICAL_VERSION = "4.5.29"
_OWLAPI_VERSION_RE = re.compile(r"(Generated by the OWL API \(version )[\d.]+(\))")


def _normalize_owlapi(text: str) -> str:
    """Return *text* with every OWL API version tag pinned to the canonical one.

    Each ``Generated by the OWL API (version X)`` occurrence keeps its
    surrounding wording; only the version number X is swapped for
    OWLAPI_CANONICAL_VERSION. Text without such a tag is returned unchanged.
    """

    def _pin_version(found: "re.Match[str]") -> str:
        # Group 1 is the text up to the version number, group 2 the closing ")".
        lead_in, closing = found.groups()
        return f"{lead_in}{OWLAPI_CANONICAL_VERSION}{closing}"

    return _OWLAPI_VERSION_RE.sub(_pin_version, text)


def run_robot(java_bin: str, robot_jar: str, merged_path: str, output_dir: str, iri: str) -> dict:
    """Run ROBOT filter+annotate for one IRI and classify the regenerated file.

    ROBOT writes ``<output_dir>/<local>.ttl`` (``<local>`` being the last path
    segment of *iri*). The OWL API version tag in that file is then normalized,
    and the final text is compared with what was on disk before the run.

    Args:
        java_bin: Java executable used to launch ROBOT.
        robot_jar: Path to robot.jar.
        merged_path: Ontology file passed to ``robot filter --input``.
        output_dir: Directory that receives ``<local>.ttl``.
        iri: Term IRI to extract.

    Returns:
        A dict with keys ``local``, ``status``, ``klass`` and ``msg``.
        ``status`` is "OK" or "ERROR". On success ``klass`` is one of
        NEW / CHANGED / UNCHANGED and ``msg`` is empty; on error ``klass`` is
        None and ``msg`` carries the diagnostic text.
    """
    local = iri.rsplit("/", 1)[-1]  # e.g. ont00000001
    out_path = os.path.join(output_dir, f"{local}.ttl")
    ont_iri = f"{iri}.ttl"

    def _error(msg: str) -> dict:
        return {"local": local, "status": "ERROR", "klass": None, "msg": msg}

    # Snapshot the prior content before ROBOT overwrites the file in place.
    existed = os.path.isfile(out_path)
    old_text = ""
    if existed:
        with open(out_path, encoding="utf-8") as fh:
            old_text = fh.read()

    cmd = [
        java_bin, "-jar", robot_jar,
        "filter",
        "--input", merged_path,
        "--term", iri,
        "--select", "self",
        "--axioms", "all",
        "--trim", "false",
        "annotate",
        "--ontology-iri", ont_iri,
        "--output", out_path,
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        # Fall back to stdout so the report is not blank when the tool did not
        # write its diagnostics to stderr.
        return _error(result.stderr.strip() or result.stdout.strip())

    # A zero exit status does not guarantee the output file is there; report
    # that as a per-IRI error instead of letting the exception escape the worker.
    try:
        with open(out_path, encoding="utf-8") as fh:
            new_text = fh.read()
    except OSError as exc:
        return _error(f"ROBOT exited 0 but output could not be read: {exc}")

    # Suppress generator-version churn: normalize the OWL API tag so a file that
    # differs only by that tag becomes identical to the committed original.
    normalized = _normalize_owlapi(new_text)
    if normalized != new_text:
        with open(out_path, "w", encoding="utf-8") as fh:
            fh.write(normalized)
        new_text = normalized

    if not existed:
        klass = "NEW"
    elif old_text == new_text:
        klass = "UNCHANGED"
    else:
        klass = "CHANGED"

    return {"local": local, "status": "OK", "klass": klass, "msg": ""}


def find_java() -> str:
    """Return a usable java command, preferring Homebrew OpenJDK on macOS.

    Candidates are tried in order and the first one for which
    ``<candidate> -version`` exits with status 0 is returned.

    Returns:
        Either an absolute Homebrew path or the bare name ``"java"``.

    Raises:
        SystemExit: with code 1 (after printing a hint to stderr) when no
            candidate can be run successfully.
    """
    candidates = (
        "/opt/homebrew/opt/openjdk/bin/java",  # Apple-silicon Homebrew
        "/usr/local/opt/openjdk/bin/java",     # Intel Homebrew
        "java",                                # already on PATH
    )
    for candidate in candidates:
        # Absolute candidates must exist on disk; the bare "java" name is left
        # for subprocess to resolve through PATH.
        if candidate != "java" and not os.path.isfile(candidate):
            continue
        try:
            probe = subprocess.run([candidate, "-version"], capture_output=True)
        except OSError:
            # Covers a missing binary as well as a non-executable or otherwise
            # unrunnable file; either way, move on to the next candidate.
            continue
        if probe.returncode == 0:
            return candidate
    print("ERROR: java not found. Install with: brew install openjdk", file=sys.stderr)
    sys.exit(1)


def generate(merged_path: str, output_dir: str, robot_jar: str, workers: int = 8,
             limit: int = 0) -> None:
    """Regenerate one Turtle file per CCO IRI found in *merged_path*.

    The merged ontology is parsed to collect every URI subject accepted by
    is_cco_iri(); run_robot() is then executed for each of them on a thread
    pool, and a summary of new / changed / unchanged files is printed.

    Args:
        merged_path: Turtle file to read the IRI list from; also handed to ROBOT.
        output_dir: Directory for the per-IRI files (created if missing).
        robot_jar: Path to robot.jar.
        workers: Number of parallel ROBOT invocations; values below 1 are
            treated as 1.
        limit: When positive, only the first *limit* IRIs (in sorted order) are
            processed.

    Raises:
        SystemExit: with code 1 when at least one IRI failed.
    """
    java_bin = find_java()
    print(f"Using java: {java_bin}", flush=True)

    print(f"Parsing {merged_path} for IRI list ...", flush=True)
    src = Graph()
    src.parse(merged_path, format="turtle")

    cco_iris = sorted(
        {str(s) for s in src.subjects() if isinstance(s, URIRef) and is_cco_iri(str(s))}
    )
    if limit > 0:
        cco_iris = cco_iris[:limit]
        print(f"Limiting to first {limit} IRIs (--limit flag).", flush=True)

    # ThreadPoolExecutor rejects max_workers <= 0 with ValueError.
    workers = max(1, workers)
    print(f"Found {len(cco_iris)} CCO IRIs. Generating with {workers} parallel workers ...",
          flush=True)

    os.makedirs(output_dir, exist_ok=True)

    counts = {"NEW": 0, "CHANGED": 0, "UNCHANGED": 0}
    new_files: list[str] = []
    changed_files: list[str] = []
    errors: list[str] = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(run_robot, java_bin, robot_jar, merged_path, output_dir, iri): iri
            for iri in cco_iris
        }
        for i, future in enumerate(concurrent.futures.as_completed(futures), 1):
            try:
                r = future.result()
            except Exception as exc:
                # A crash inside one worker must not abort the whole run and
                # lose the summary; record it against the IRI it belongs to.
                failed_local = futures[future].rsplit("/", 1)[-1]
                errors.append(f"ERROR {failed_local}: {type(exc).__name__}: {exc}")
            else:
                if r["status"] == "ERROR":
                    errors.append(f"ERROR {r['local']}: {r['msg']}")
                else:
                    counts[r["klass"]] += 1
                    if r["klass"] == "NEW":
                        new_files.append(r["local"])
                    elif r["klass"] == "CHANGED":
                        changed_files.append(r["local"])
            if i % 100 == 0 or i == len(cco_iris):
                print(f" {i}/{len(cco_iris)} done", flush=True)

    # Completion order is nondeterministic; sort so the report is stable.
    new_files.sort()
    changed_files.sort()

    print("\n" + "=" * 60, flush=True)
    print("SUMMARY", flush=True)
    print("=" * 60, flush=True)
    print(f" Total IRIs processed : {len(cco_iris)}", flush=True)
    print(f" New files : {counts['NEW']}", flush=True)
    print(f" Real content changes : {counts['CHANGED']}", flush=True)
    print(f" Unchanged : {counts['UNCHANGED']} "
          f"(incl. files where only the OWL API version tag differed, normalized away)", flush=True)
    print(f" Errors : {len(errors)}", flush=True)

    print(f"\nNEW FILES ({len(new_files)}):", flush=True)
    for name in new_files:
        print(f" {name}", flush=True)

    print(f"\nFILES WITH REAL CONTENT CHANGES ({len(changed_files)}):", flush=True)
    for name in changed_files:
        print(f" {name}", flush=True)

    if errors:
        print(f"\n{len(errors)} error(s):", file=sys.stderr)
        for e in errors:
            print(f" {e}", file=sys.stderr)
        sys.exit(1)
    print(f"\nDone. {len(cco_iris)} files written to {output_dir}/", flush=True)


def main() -> None:
    """Command-line entry point: parse options, check inputs, run generate().

    Exits with status 1 (message on stderr) when either the input ontology or
    robot.jar is not an existing file; otherwise hands the parsed options to
    generate().
    """
    cli = argparse.ArgumentParser(
        description="Generate per-IRI OWL API-format Turtle files for CCO IRI dereferencing."
    )
    cli.add_argument("--input", required=True,
                     help="Path to CommonCoreOntologiesMerged.ttl")
    cli.add_argument("--output", required=True,
                     help="Directory to write per-IRI .ttl files")
    cli.add_argument("--robot-jar", default="build/lib/robot.jar",
                     help="Path to robot.jar")
    cli.add_argument("--workers", type=int, default=8,
                     help="Parallel worker count (default 8)")
    cli.add_argument("--limit", type=int, default=0,
                     help="Only process the first N IRIs (0 = all)")
    opts = cli.parse_args()

    # Guard clauses: bail out before any expensive work if a required file is absent.
    have_input = os.path.isfile(opts.input)
    if not have_input:
        print(f"ERROR: input not found: {opts.input}", file=sys.stderr)
        sys.exit(1)

    have_jar = os.path.isfile(opts.robot_jar)
    if not have_jar:
        print(f"ERROR: robot.jar not found: {opts.robot_jar}", file=sys.stderr)
        download_hint = (
            "Download with: make setup (or: curl -L -o build/lib/robot.jar "
            "https://github.com/ontodev/robot/releases/download/v1.8.4/robot.jar)"
        )
        print(download_hint, file=sys.stderr)
        sys.exit(1)

    generate(
        opts.input,
        opts.output,
        opts.robot_jar,
        workers=opts.workers,
        limit=opts.limit,
    )


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Loading