Making some progress

jakep-allenai committed Feb 19, 2025
1 parent 823629d commit 072bc1d
Showing 2 changed files with 161 additions and 43 deletions.
198 changes: 160 additions & 38 deletions olmocr/bench/runbench.py
@@ -1,14 +1,20 @@
# This script runs olmocr bench
# It will take in as arguments a folder, and scan it for .jsonl files which contain the various rules and properties that we will check
# We will then validate the json files to make sure they are all valid
# Then, each other folder in there (besides /pdfs) represents a pipeline tool that we will evaluate
# We will validate that each one of those contains a .md file coressponding to its parse for every .pdf in the /pdfs folder
# Then, we will read each one, and check if they pass against all the rules
#!/usr/bin/env python3
"""
This script runs olmocr bench.
It will take as an argument a folder, and scan it for .jsonl files which contain the various rules and properties that we will check.
It will then validate the JSON files to make sure they are all valid.
Then, each other folder in there (besides /pdfs) represents a pipeline tool that we will evaluate.
We will validate that each one of those contains a .md file corresponding to its parse for every .pdf in the /pdfs folder.
Then, we will read each one, and check if they pass against all the rules.
"""

import argparse
import os
import json
import glob
import sys

from rapidfuzz import fuzz

def validate_jsonl_file(jsonl_path: str, all_pdf_files: list[str]):
"""
@@ -22,7 +28,7 @@ def validate_jsonl_file(jsonl_path: str, all_pdf_files: list[str]):
for line_num, line in enumerate(f, start=1):
line = line.strip()
if not line:
# You could decide if blank lines are okay or not
# Skip blank lines
continue
try:
data = json.loads(line)
@@ -32,9 +38,9 @@ def validate_jsonl_file(jsonl_path: str, all_pdf_files: list[str]):
# Basic checks to ensure required keys exist (pdf, id, type, etc.)
if "pdf" not in data or "id" not in data or "type" not in data:
raise ValueError(f"Missing required fields in line {line_num} of {jsonl_path}: {data}")

rule_id = data["id"]

# Make sure the document referenced exists
if data["pdf"] not in all_pdf_basenames:
raise ValueError(f"Missing pdf {data['pdf']} referenced by {rule_id} in {jsonl_path} line {line_num}")
@@ -50,35 +56,106 @@ def validate_jsonl_file(jsonl_path: str, all_pdf_files: list[str]):
raise ValueError(f"'anchor' field required for rule type 'order' in {jsonl_path} line {line_num}")
if not ("before" in data or "after" in data):
raise ValueError(f"'before' or 'after' required for rule type 'order' in {jsonl_path} line {line_num}")
else:
raise ValueError(f"Unknown rule type '{rule_type}' in {jsonl_path} line {line_num}")

# If everything looks good, add to the rules list
rules.append(data)

return rules
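
# For reference, a rule line that passes this validation looks like the
# entries in sample_data/dataset.jsonl (the pdf name and id below are
# placeholders):
#   {"pdf": "doc1.pdf", "id": "doc1_rule_00", "type": "present",
#    "text": "some expected passage", "threshold": 0.95}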


def run_rule(rule, md_file_path: str) -> bool:
with open(md_file_path, 'r', encoding='utf-8') as f:
md_content = f.read()
"""
Run the given rule on the content of the provided .md file.
Returns True if the rule passes, False otherwise.
"""
try:
with open(md_file_path, 'r', encoding='utf-8') as f:
md_content = f.read()
except Exception as e:
print(f"Error reading {md_file_path}: {e}")
return False

# Dispatch on the rule type
rule_type = rule["type"]

if rule_type == "present":
return rule["text"] in md_content
elif rule_type == "absent":
return rule["text"] not in md_content
if rule_type == "present" or rule_type == "absent":
reference_query = rule["text"]
threshold = rule.get("threshold", 1.0)

best_ratio = fuzz.partial_ratio(reference_query, md_content) / 100.0

if rule_type == "present":
return best_ratio >= threshold
else:
return best_ratio < threshold
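
# Illustration (hypothetical strings): rapidfuzz's partial_ratio scores the
# best-matching substring of md_content against the query, so
#   fuzz.partial_ratio("hello world", "... hello world ...")  # -> 100.0
# while a near-miss such as "helo world" scores slightly lower; dividing by
# 100 maps the score onto the rule's 0.0-1.0 threshold scale.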
elif rule_type == "order":
# Check ordering constraints, e.g. anchor vs. before/after
# This is highly dependent on how you want to define "order" in text.
# For instance, you might do:
# Implement a simple ordering check: ensure that the anchor text appears,
# and if 'before' is specified, it must appear before the anchor;
# if 'after' is specified, it must appear after the anchor.
anchor = rule.get("anchor", "")
before = rule.get("before", "")
after = rule.get("after", "")
# ...
# Implement your logic to confirm that anchor occurs before or after certain text
return False
before = rule.get("before")
after = rule.get("after")

anchor_index = md_content.find(anchor)
if anchor_index == -1:
return False

if before is not None:
before_index = md_content.find(before)
# If 'before' text not found or appears after (or at) the anchor, fail.
if before_index == -1 or before_index >= anchor_index:
return False

raise NotImplementedError
if after is not None:
after_index = md_content.find(after)
# If 'after' text not found or appears before (or at) the anchor, fail.
if after_index == -1 or after_index <= anchor_index:
return False

return True

else:
raise NotImplementedError(f"Rule type '{rule_type}' is not implemented.")
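
# Worked example (hypothetical rule): with md_content == "intro ... results",
# the rule {"type": "order", "anchor": "results", "before": "intro"} passes,
# since "intro" is found at a lower index than the anchor "results".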

def evaluate_candidate(candidate_folder: str, all_rules: list, pdf_basenames: list[str]):
"""
For the candidate folder (pipeline tool output), first validate that it contains
a .md file for every PDF in the pdf folder. Then, run each rule against the corresponding
.md file.
Returns a tuple (num_passed, total_rules, errors) where errors is a list of strings.
"""
errors = []
candidate_name = os.path.basename(candidate_folder)
num_passed = 0
total_rules = 0

# Validate that a .md file exists for every PDF.
for pdf_name in pdf_basenames:
# Change .pdf extension to .md (assumes pdf_name ends with .pdf)
md_name = os.path.splitext(pdf_name)[0] + ".md"
md_path = os.path.join(candidate_folder, md_name)
if not os.path.exists(md_path):
errors.append(f"Candidate '{candidate_name}' is missing {md_name} corresponding to {pdf_name}.")

if errors:
# If candidate fails the md file existence check, do not evaluate further.
return (0, len(all_rules), errors)

# Evaluate rules. Each rule references a PDF (e.g., "doc1.pdf"), and we expect the candidate to have "doc1.md".
for rule in all_rules:
pdf_name = rule["pdf"]
md_name = os.path.splitext(pdf_name)[0] + ".md"
md_path = os.path.join(candidate_folder, md_name)
total_rules += 1
try:
if run_rule(rule, md_path):
num_passed += 1
except Exception as e:
errors.append(f"Error running rule {rule.get('id')} on {md_name}: {e}")

return (num_passed, total_rules, errors)
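
# Hypothetical call: evaluate_candidate("/data/some_tool", all_rules,
# ["doc1.pdf"]) would return (3, 4, []) if every expected .md file exists
# and 3 of the 4 rules pass.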

def main():
parser = argparse.ArgumentParser(description="Run OLMOCR Bench.")
@@ -90,14 +167,25 @@ def main():
input_folder = args.input_folder
pdf_folder = os.path.join(input_folder, "pdfs")

# Find all pdf files in the data folder
assert os.path.exists(pdf_folder), "/pdfs folder must exist in your data directory"
# Check that the pdfs folder exists
if not os.path.exists(pdf_folder):
print("Error: /pdfs folder must exist in your data directory.", file=sys.stderr)
sys.exit(1)

# Find all pdf files in the pdf folder
all_pdf_files = list(glob.glob(os.path.join(pdf_folder, "*.pdf")))
assert all_pdf_files, f"No PDF files found in {pdf_folder}"
if not all_pdf_files:
print(f"Error: No PDF files found in {pdf_folder}", file=sys.stderr)
sys.exit(1)

# Get PDF basenames (e.g. "doc1.pdf")
pdf_basenames = [os.path.basename(p) for p in all_pdf_files]

# Find .jsonl files and validate them
# Find .jsonl files in the input folder and validate them
jsonl_files = glob.glob(os.path.join(input_folder, "*.jsonl"))
assert jsonl_files, f"No .jsonl files found in {input_folder}."
if not jsonl_files:
print(f"Error: No .jsonl files found in {input_folder}.", file=sys.stderr)
sys.exit(1)

all_rules = []
for jsonl_path in jsonl_files:
@@ -106,14 +194,48 @@ def main():
rules = validate_jsonl_file(jsonl_path, all_pdf_files)
all_rules.extend(rules)
except ValueError as e:
print(f"Validation error in {jsonl_path}: {e}")
return

# Now, find all of the other folders in the input folder, those become the candidates
# Each candidate will then run each rule on its content

# At the end, print a summary of the score (number of passing rules) for each candidate
# Make it a pretty interface, similar to what pytest does
print(f"Validation error in {jsonl_path}: {e}", file=sys.stderr)
sys.exit(1)

if not all_rules:
print("No valid rules found. Exiting.", file=sys.stderr)
sys.exit(1)

# Identify candidate pipeline folders (subdirectories of input_folder excluding /pdfs)
candidate_folders = []
for entry in os.listdir(input_folder):
full_path = os.path.join(input_folder, entry)
if os.path.isdir(full_path) and entry != "pdfs":
candidate_folders.append(full_path)

if not candidate_folders:
print("Error: No candidate pipeline folders found (subdirectories besides 'pdfs').", file=sys.stderr)
sys.exit(1)

# Evaluate each candidate
summary = []
print("\nRunning rules for each candidate:")
for candidate in candidate_folders:
candidate_name = os.path.basename(candidate)
num_passed, total_rules, errors = evaluate_candidate(candidate, all_rules, pdf_basenames)
summary.append((candidate_name, num_passed, total_rules, errors))
print(f"\nCandidate: {candidate_name}")
if errors:
for err in errors:
print(f" [ERROR] {err}")
else:
print(f" Passed {num_passed} out of {total_rules} rules.")

# Print a final summary (similar to a pytest summary)
print("\n" + "="*50)
print("Final Summary:")
for candidate_name, num_passed, total_rules, errors in summary:
if errors:
status = "FAILED (errors)"
else:
status = f"{num_passed / total_rules * 100:0.1f}%"
print(f"{candidate_name:20s} : {num_passed:3d}/{total_rules:3d} rules passed - {status}")
print("="*50)

if __name__ == "__main__":
main()
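
# Example invocation (assuming input_folder is the positional argument defined
# in the argparse setup above; sample_data is the dataset shipped alongside):
#   python olmocr/bench/runbench.py olmocr/bench/sample_data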
6 changes: 1 addition & 5 deletions olmocr/bench/sample_data/dataset.jsonl
@@ -6,10 +6,6 @@
{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_05", "type": "present", "text": "Some writers on CSR trace its American roots to the 19th century when large industries engaged in philanthropy and established great public institutions", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_06", "type": "present", "text": "", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_07", "type": "present", "text": "", "threshold": 0.95}

{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_10", "type": "order", "anchor": "multi_column_miss_00", "before": "multi_column_miss_02"}
{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_10", "type": "order", "anchor": "multi_column_miss_04", "after": "multi_column_miss_02"}
{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_10", "type": "order", "anchor": "multi_column_miss_05", "after": "multi_column_miss_00", "before": "multi_column_miss_04"}



{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_10", "type": "order", "anchor": "multi_column_miss_05", "after": "multi_column_miss_00", "before": "multi_column_miss_04"}
