Skip to content

Commit

Permalink
Adding a trailing repetition test
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Mar 6, 2025
1 parent 07466e1 commit 9be696f
Show file tree
Hide file tree
Showing 4 changed files with 205 additions and 163 deletions.
132 changes: 74 additions & 58 deletions olmocr/bench/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

from typing import Dict, List, Tuple, Optional

from .tests import BasePDFTest, load_tests
from .tests import BasePDFTest, RepetitionTest, load_tests
from .utils import calculate_bootstrap_ci, perform_permutation_test

def evaluate_candidate(
Expand Down Expand Up @@ -117,6 +117,12 @@ def main():
default=os.path.join(os.path.dirname(__file__), "sample_data"),
help="Path to the folder containing .jsonl files, /pdfs folder, and pipeline tool subfolders.",
)
parser.add_argument(
"--candidate",
type=str,
default=None,
help="Run test only for a single candidate"
)
parser.add_argument(
"--bootstrap_samples",
type=int,
Expand All @@ -131,16 +137,14 @@ def main():
)
parser.add_argument(
"--permutation_tests",
type=int,
default=10000,
help="Number of permutations for statistical test (default: 10000).",
action="store_true",
help="Run permutation testing",
)
args = parser.parse_args()

input_folder = args.input_folder
n_bootstrap = args.bootstrap_samples
ci_level = args.confidence_level
n_permutations = args.permutation_tests
pdf_folder = os.path.join(input_folder, "pdfs")

# Check that the pdfs folder exists
Expand Down Expand Up @@ -173,17 +177,28 @@ def main():
print("No valid tests found. Exiting.", file=sys.stderr)
sys.exit(1)

# Add in a default repeat test for every PDF that doesn't already have one
for pdf in pdf_basenames:
if not any(t.type == "repeat" for t in all_tests if t.pdf == pdf):
all_tests.append(RepetitionTest(id=f"{pdf}_repeat", pdf=pdf, page=1, type="repeat"))

# Identify candidate pipeline folders (subdirectories of input_folder excluding /pdfs)
candidate_folders = []
for entry in os.listdir(input_folder):
full_path = os.path.join(input_folder, entry)
if os.path.isdir(full_path) and entry != "pdfs":
candidate_folders.append(full_path)
if args.candidate is not None:
if entry == args.candidate:
candidate_folders.append(full_path)
else:
if os.path.isdir(full_path) and entry != "pdfs":
candidate_folders.append(full_path)

if not candidate_folders:
print("Error: No candidate pipeline folders found (subdirectories besides 'pdfs').", file=sys.stderr)
sys.exit(1)

candidate_folders.sort()

# Evaluate each candidate
summary = []
print("\nRunning tests for each candidate:")
Expand Down Expand Up @@ -238,62 +253,63 @@ def main():
print("")

# Perform pairwise permutation tests
print("\n" + "=" * 60)
print("Pairwise Permutation Tests:")

valid_candidates = [c for c in summary if not c[3]] # Filter out candidates with errors
olmocr_candidates = sorted([c for c in valid_candidates if "olmocr" in c[0].lower()], key=lambda x: x[1], reverse=True)
non_olmocr_candidates = sorted([c for c in valid_candidates if "olmocr" not in c[0].lower()], key=lambda x: x[1], reverse=True)

top_olmocr = olmocr_candidates[0] if olmocr_candidates else None
top_non_olmocr = non_olmocr_candidates[0] if non_olmocr_candidates else None
top_two_olmocr = olmocr_candidates[:2]

# Test 1: Top olmocr vs Top non-olmocr
if top_olmocr and top_non_olmocr:
olmocr_name, olmocr_score = top_olmocr[0], top_olmocr[1]
non_olmocr_name, non_olmocr_score = top_non_olmocr[0], top_non_olmocr[1]
olmocr_scores = top_olmocr[7] # all_test_scores
non_olmocr_scores = top_non_olmocr[7] # all_test_scores
if args.permutation_tests:
print("\n" + "=" * 60)
print("Pairwise Permutation Tests:")

diff, p_value = perform_permutation_test(
olmocr_scores, non_olmocr_scores, n_permutations=n_permutations
)
valid_candidates = [c for c in summary if not c[3]] # Filter out candidates with errors
olmocr_candidates = sorted([c for c in valid_candidates if "olmocr" in c[0].lower()], key=lambda x: x[1], reverse=True)
non_olmocr_candidates = sorted([c for c in valid_candidates if "olmocr" not in c[0].lower()], key=lambda x: x[1], reverse=True)

print(f"\nComparison 1: Top olmocr vs Top non-olmocr candidate")
print(f" {olmocr_name} ({olmocr_score*100:.1f}%) vs {non_olmocr_name} ({non_olmocr_score*100:.1f}%)")
print(f" Difference: {diff*100:.2f}% (positive means {olmocr_name} is better)")
print(f" p-value: {p_value:.4f}")
if p_value < 0.05:
print(f" Result: Statistically significant difference (p < 0.05)")
top_olmocr = olmocr_candidates[0] if olmocr_candidates else None
top_non_olmocr = non_olmocr_candidates[0] if non_olmocr_candidates else None
top_two_olmocr = olmocr_candidates[:2]

# Test 1: Top olmocr vs Top non-olmocr
if top_olmocr and top_non_olmocr:
olmocr_name, olmocr_score = top_olmocr[0], top_olmocr[1]
non_olmocr_name, non_olmocr_score = top_non_olmocr[0], top_non_olmocr[1]
olmocr_scores = top_olmocr[7] # all_test_scores
non_olmocr_scores = top_non_olmocr[7] # all_test_scores

diff, p_value = perform_permutation_test(
olmocr_scores, non_olmocr_scores
)

print(f"\nComparison 1: Top olmocr vs Top non-olmocr candidate")
print(f" {olmocr_name} ({olmocr_score*100:.1f}%) vs {non_olmocr_name} ({non_olmocr_score*100:.1f}%)")
print(f" Difference: {diff*100:.2f}% (positive means {olmocr_name} is better)")
print(f" p-value: {p_value:.4f}")
if p_value < 0.05:
print(f" Result: Statistically significant difference (p < 0.05)")
else:
print(f" Result: No statistically significant difference (p ≥ 0.05)")
else:
print(f" Result: No statistically significant difference (p ≥ 0.05)")
else:
print("\nCannot perform olmocr vs non-olmocr comparison: Missing candidates")

# Test 2: Top two olmocr candidates (if there are at least two)
if len(top_two_olmocr) >= 2:
olmocr1_name, olmocr1_score = top_two_olmocr[0][0], top_two_olmocr[0][1]
olmocr2_name, olmocr2_score = top_two_olmocr[1][0], top_two_olmocr[1][1]
olmocr1_scores = top_two_olmocr[0][7] # all_test_scores
olmocr2_scores = top_two_olmocr[1][7] # all_test_scores

diff, p_value = perform_permutation_test(
olmocr1_scores, olmocr2_scores, n_permutations=n_permutations
)
print("\nCannot perform olmocr vs non-olmocr comparison: Missing candidates")

print(f"\nComparison 2: Top two olmocr candidates")
print(f" {olmocr1_name} ({olmocr1_score*100:.1f}%) vs {olmocr2_name} ({olmocr2_score*100:.1f}%)")
print(f" Difference: {diff*100:.2f}% (positive means {olmocr1_name} is better)")
print(f" p-value: {p_value:.4f}")
if p_value < 0.05:
print(f" Result: Statistically significant difference (p < 0.05)")
# Test 2: Top two olmocr candidates (if there are at least two)
if len(top_two_olmocr) >= 2:
olmocr1_name, olmocr1_score = top_two_olmocr[0][0], top_two_olmocr[0][1]
olmocr2_name, olmocr2_score = top_two_olmocr[1][0], top_two_olmocr[1][1]
olmocr1_scores = top_two_olmocr[0][7] # all_test_scores
olmocr2_scores = top_two_olmocr[1][7] # all_test_scores

diff, p_value = perform_permutation_test(
olmocr1_scores, olmocr2_scores
)

print(f"\nComparison 2: Top two olmocr candidates")
print(f" {olmocr1_name} ({olmocr1_score*100:.1f}%) vs {olmocr2_name} ({olmocr2_score*100:.1f}%)")
print(f" Difference: {diff*100:.2f}% (positive means {olmocr1_name} is better)")
print(f" p-value: {p_value:.4f}")
if p_value < 0.05:
print(f" Result: Statistically significant difference (p < 0.05)")
else:
print(f" Result: No statistically significant difference (p ≥ 0.05)")
else:
print(f" Result: No statistically significant difference (p ≥ 0.05)")
else:
print("\nCannot perform top two olmocr comparison: Not enough olmocr candidates")

print("=" * 60)
print("\nCannot perform top two olmocr comparison: Not enough olmocr candidates")

print("=" * 60)


if __name__ == "__main__":
Expand Down
21 changes: 11 additions & 10 deletions olmocr/bench/sample_data/dataset.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@

{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_02", "type": "present", "checked": "verified", "text": "Use the graph of the position function to determine the time intervals when the velocity is positive, negative, or zero."}
{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_03", "type": "present", "checked": "verified", "text": "Use the graph of the velocity function to determine the time intervals when the acceleration is positive, negative, or zero."}
{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_04", "type": "order", "before": "150.", "after": "157."}
{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_05", "type": "order", "before": "150.", "after": "158."}
{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_06", "type": "order", "before": "150.", "after": "159."}

{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_01", "type": "present", "checked": "verified", "text": "This report first provides the context and development of CSR; then, from internal company documents, examines how PM came to its own version."}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_02", "type": "present", "checked": "verified", "text": "This paper examines whether a tobacco company espousing CSR should be judged simply as a corporate entity along standards of business ethics, or as an irretrievably negative force in the realm of public health, thereby rendering CSR an oxymoron."}
Expand All @@ -39,20 +42,18 @@
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table08", "type": "table", "cell": "Math proofs code", "left_heading": "Algebraic Stack"}

{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Quadratic regression", "left": "Challenge"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Instrument Use", "left": "Normal"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "0.87", "top_heading": "Procedure"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "0.87", "top_heading": "ReACT"}

{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Pick-and-place object", "left_heading": "27"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "0.66", "right": "0.44"}

{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Interact with a moving agent", "top_heading": "Unit Test Topic"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t01", "type": "table", "cell": "Instrument Use", "left": "Normal"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t02", "type": "table", "cell": "0.87", "top_heading": "Procedure"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t03", "type": "table", "cell": "0.87", "top_heading": "ReACT"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t04", "type": "table", "cell": "Pick-and-place object", "left_heading": "27"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t05", "type": "table", "cell": "0.66", "right": "0.44"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t06", "type": "table", "cell": "Interact with a moving agent", "top_heading": "Unit Test Topic"}

{"pdf": "earnings.pdf", "page": 1, "id": "earnings_table00", "type": "table", "cell": "1,136", "top_heading": "Year Ended"}
{"pdf": "earnings.pdf", "page": 1, "id": "earnings_table01", "type": "table", "cell": "Year Ended"}
{"pdf": "earnings.pdf", "page": 1, "id": "earnings_table02", "type": "table", "cell": "680", "up": "1,892"}
{"pdf": "earnings.pdf", "page": 1, "id": "earnings_table02", "type": "table", "cell": "2,532", "left_heading": "Research and development"}

{"pdf": "earnings.pdf", "page": 1, "id": "earnings_table03", "type": "table", "cell": "2,532", "left_heading": "Research and development"}
{"pdf": "earnings.pdf", "page": 1, "id": "earnings_table04", "type": "absent", "text": "62"}



Loading

0 comments on commit 9be696f

Please sign in to comment.