Skip to content

Commit fbc9631

Browse files
committed
Refactor audit report generation to run directly on RDF fixtures and update documentation workflow; add test Actions workflow
1 parent 446d98f commit fbc9631

4 files changed

Lines changed: 227 additions & 12 deletions

File tree

.github/workflows/docs.yml

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Build the MkDocs site on every push to main and publish it to GitHub Pages.
name: Docs

on:
  push:
    branches:
      - main

# Minimal permissions for the Pages deployment flow: read the repo,
# write the Pages artifact, and mint an OIDC token for deploy-pages.
permissions:
  contents: read
  pages: write
  id-token: write

# Allow only one Pages deployment at a time; a newer push cancels a running one.
concurrency:
  group: pages
  cancel-in-progress: true

jobs:
  # Build the static site and hand it off as a Pages artifact.
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install dependencies
        run: pip install .[dev,docs]

      - name: Build docs
        run: make docs-build

      # Upload the rendered site so the deploy job can publish it.
      - uses: actions/upload-pages-artifact@v3
        with:
          path: site/

  # Publish the artifact produced by the build job to GitHub Pages.
  deploy:
    needs: build
    runs-on: ubuntu-latest
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      - id: deployment
        uses: actions/deploy-pages@v4

.github/workflows/test.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Run the test suite on pushes to main and on pull requests targeting main.
name: Tests

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      # Install the project with its dev extras (pytest etc.).
      - name: Install dependencies
        run: pip install .[dev]

      - name: Run tests
        run: pytest

docs/generate_audit.py

Lines changed: 158 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,24 @@
1-
"""Generate docs/audit.md from tests/audit_report.json."""
1+
"""Generate docs/audit.md by running audit() on all RDF fixtures directly.
22
3-
import json
3+
This runs as part of `make docs` so the audit report always reflects the
4+
current code and templates, with no stale intermediate JSON.
5+
"""
6+
7+
import statistics
8+
import subprocess
9+
import sys
10+
import time
11+
from datetime import datetime, timezone
412
from pathlib import Path
513

14+
from rdflib import Graph
15+
16+
from openbasement import audit, load_template
17+
618
ROOT = Path(__file__).resolve().parent.parent
7-
REPORT_PATH = ROOT / "tests" / "audit_report.json"
19+
FIXTURE_DIR = ROOT / "tests" / "fixtures" / "procedures"
820
OUTPUT_PATH = ROOT / "docs" / "audit.md"
21+
TEMPLATE_NAME = "eu_procedure"
922

1023
# Known URI prefixes for shortening
1124
PREFIXES = [
@@ -24,7 +37,6 @@ def shorten_uri(uri: str) -> str:
2437
for full, short in PREFIXES:
2538
if uri.startswith(full):
2639
return short + uri[len(full):]
27-
# Fallback: fragment after #
2840
if "#" in uri:
2941
return uri.rsplit("#", 1)[1]
3042
return uri
@@ -35,15 +47,27 @@ def fmt_pct(value: float) -> str:
3547
return f"{value * 100:.1f}%"
3648

3749

50+
def get_git_short_hash() -> str:
    """Return the short git commit hash, or 'unknown' if not in a repo.

    Returns:
        The abbreviated hash of HEAD as reported by ``git rev-parse --short``,
        or the literal string ``"unknown"`` when git is missing or the command
        fails (e.g. the working tree is not a git checkout).
    """
    try:
        return subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"],
            cwd=ROOT,
            text=True,
            # Keep git's own error chatter out of the build log; failures are
            # already mapped to the "unknown" fallback below.
            stderr=subprocess.DEVNULL,
        ).strip()
    # Catch only the failures this call can realistically raise instead of a
    # blanket `except Exception` that would also hide programming errors:
    # OSError covers a missing `git` binary, CalledProcessError a non-zero
    # exit status (e.g. not inside a repository).
    except (OSError, subprocess.CalledProcessError):
        return "unknown"
60+
61+
3862
def write_predicate_table(lines: list[str], predicates: dict, header: str) -> None:
    """Write a predicate table (uncovered or covered)."""
    # Nothing to tabulate: emit a placeholder note instead of an empty table.
    if not predicates:
        lines.append(f"*No {header.lower()} predicates.*\n")
        return

    # Rows ordered so the predicates seen in the most fixtures come first.
    by_reach = sorted(predicates.items(), key=lambda item: item[1]["fixtures"], reverse=True)
    lines.extend(["| Predicate | Fixtures | Triples |", "|:----------|-------:|-------:|"])
    lines.extend(
        f"| `{shorten_uri(uri)}` | {counts['fixtures']} | {counts['triples']} |"
        for uri, counts in by_reach
    )
    lines.append("")
@@ -63,20 +87,137 @@ def write_missing_table(lines: list[str], missing_freq: dict) -> None:
6387
lines.append("")
6488

6589

66-
def main() -> None:
67-
with open(REPORT_PATH, "r", encoding="utf-8") as f:
68-
report = json.load(f)
90+
def _rank_counts(per_pred: dict) -> dict:
    """Collapse {pred: {"fixtures": set, "triples": int}} accumulators into
    JSON-friendly counts, ordered by fixture reach then triple volume (desc)."""
    return {
        pred: {"fixtures": len(data["fixtures"]), "triples": data["triples"]}
        for pred, data in sorted(
            per_pred.items(),
            key=lambda x: (-len(x[1]["fixtures"]), -x[1]["triples"]),
        )
    }


def run_audit() -> dict:
    """Run audit() on all fixtures and return an aggregated report dict.

    Uses the same accumulation logic as tests/run_audit.py.

    Returns:
        A dict with keys "template", "fixture_count", "errors", "coverage"
        (mean/median/min/max/stdev over per-fixture coverage) and "entities"
        (per-entity uncovered/covered predicate counts plus the frequency
        with which each template predicate was missing).

    Exits the process with status 1 when no fixture files are found.
    """
    template = load_template(TEMPLATE_NAME)
    rdf_files = sorted(FIXTURE_DIR.glob("*.rdf"))

    if not rdf_files:
        print(f"No .rdf files found in {FIXTURE_DIR}", file=sys.stderr)
        sys.exit(1)

    print(f"Auditing {len(rdf_files)} fixtures with template '{TEMPLATE_NAME}'...")

    # Per-entity accumulators, keyed by entity name then predicate URI.
    uncovered_counts: dict[str, dict[str, dict]] = {}
    covered_counts: dict[str, dict[str, dict]] = {}
    missing_counts: dict[str, dict[str, int]] = {}

    coverages: list[float] = []
    errors = 0
    # perf_counter is monotonic, so elapsed-time reporting cannot jump or go
    # negative if the wall clock is adjusted mid-run (time.time() could).
    t0 = time.perf_counter()

    for i, rdf_file in enumerate(rdf_files, 1):
        # Lightweight progress indicator for long runs.
        if i % 100 == 0 or i == len(rdf_files):
            elapsed = time.perf_counter() - t0
            print(f" [{i}/{len(rdf_files)}] {elapsed:.1f}s elapsed")

        try:
            g = Graph()
            g.parse(rdf_file, format="xml")
            result = audit(g, template)
        except Exception as e:
            # Best-effort: report and skip a malformed fixture rather than
            # aborting the whole report.
            print(f" ERROR parsing {rdf_file.name}: {e}", file=sys.stderr)
            errors += 1
            continue

        coverages.append(result["summary"]["coverage"])
        fixture_id = rdf_file.stem

        for entity_name, entity_report in result["entities"].items():
            # setdefault keeps the accumulators in lockstep without the
            # repeated "if key not in dict" boilerplate.
            ent_uncovered = uncovered_counts.setdefault(entity_name, {})
            ent_covered = covered_counts.setdefault(entity_name, {})
            ent_missing = missing_counts.setdefault(entity_name, {})

            for pred, count in entity_report["uncovered"].items():
                slot = ent_uncovered.setdefault(pred, {"fixtures": set(), "triples": 0})
                slot["fixtures"].add(fixture_id)
                slot["triples"] += count

            for pred, count in entity_report["covered"].items():
                slot = ent_covered.setdefault(pred, {"fixtures": set(), "triples": 0})
                slot["fixtures"].add(fixture_id)
                slot["triples"] += count

            for pred in entity_report["missing"]:
                ent_missing[pred] = ent_missing.get(pred, 0) + 1

    fixture_count = len(rdf_files) - errors

    report = {
        "template": TEMPLATE_NAME,
        "fixture_count": fixture_count,
        "errors": errors,
        "coverage": {
            "mean": round(statistics.mean(coverages), 4) if coverages else 0,
            "median": round(statistics.median(coverages), 4) if coverages else 0,
            "min": round(min(coverages), 4) if coverages else 0,
            "max": round(max(coverages), 4) if coverages else 0,
            # stdev needs at least two samples.
            "stdev": round(statistics.stdev(coverages), 4) if len(coverages) > 1 else 0,
        },
        "entities": {},
    }

    for entity_name in sorted(uncovered_counts):
        report["entities"][entity_name] = {
            "uncovered": _rank_counts(uncovered_counts[entity_name]),
            "covered": _rank_counts(covered_counts[entity_name]),
            "missing_frequency": dict(
                sorted(missing_counts[entity_name].items(), key=lambda x: -x[1])
            ),
        }

    elapsed = time.perf_counter() - t0
    print(f"Audit complete in {elapsed:.1f}s ({fixture_count} fixtures, {errors} errors)")

    return report
203+
204+
205+
def generate_markdown(report: dict) -> str:
206+
"""Convert an aggregated report dict into markdown."""
70207
template_name = report["template"]
71208
fixture_count = report["fixture_count"]
72209
errors = report["errors"]
73210
cov = report["coverage"]
74211

212+
commit_hash = get_git_short_hash()
213+
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
214+
75215
lines: list[str] = []
76216

77-
# Header
78217
lines.append("# Audit Coverage Report")
79218
lines.append("")
219+
lines.append(f"Generated from commit `{commit_hash}` on {timestamp}.")
220+
lines.append("")
80221
lines.append(
81222
f"Template: **{template_name}** | "
82223
f"Fixtures: **{fixture_count}** | "
@@ -137,7 +278,13 @@ def main() -> None:
137278
lines.append("")
138279
write_missing_table(lines, missing_freq)
139280

140-
OUTPUT_PATH.write_text("\n".join(lines), encoding="utf-8")
281+
return "\n".join(lines)
282+
283+
284+
def main() -> None:
    """Audit every fixture, render the report to markdown, and write it out."""
    OUTPUT_PATH.write_text(generate_markdown(run_audit()), encoding="utf-8")
    print(f"Wrote {OUTPUT_PATH}")
142289

143290

mkdocs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
site_name: openbasement
22
site_description: Template-based RDF extraction from EU Cellar data
3-
repo_url: https://github.com/maxhaag/openbasement
3+
repo_url: https://github.com/openstage-eu/openbasement
44

55
theme:
66
name: openstage

0 commit comments

Comments (0)