Merge branch 'main' of https://github.com/allenai/olmocr

allenai · Mar 5, 2025 · dbbe6ce
2 parents abeaf02 + 1545a6d
commit dbbe6ce
Show file tree

Hide file tree

Showing 20 changed files with 1,127 additions and 53 deletions.
diff --git a/olmocr/bench/benchmark.py b/olmocr/bench/benchmark.py
@@ -96,7 +96,7 @@ def evaluate_candidate(
         if test_avg < 1.0:
             test_failures.append(
                 f"Test {test.id} on {md_base} average pass ratio: {test_avg:.3f} ({repeat_passes}/{num_repeats} repeats passed). "
-                f"Example explanation: {explanations[0] if explanations else 'No explanation'}"
+                f"Ex: {explanations[0] if explanations else 'No explanation'}"
             )
         test_type_breakdown[test_type].append(test_avg)
 
@@ -183,7 +183,6 @@ def main():
         else:
             status = f"{overall_score * 100:0.1f}%"
         print(f"{candidate_name:20s} : Average Score: {overall_score * 100:0.1f}% over {total_tests:3d} tests - {status}")
-        print("  Breakdown by test type:")
         for ttype, scores in test_type_breakdown.items():
             if scores:
                 avg = sum(scores) / len(scores) * 100

diff --git a/olmocr/bench/convert.py b/olmocr/bench/convert.py
@@ -40,7 +40,7 @@ def parse_method_arg(method_arg):
     return name, kwargs, folder_name
 
 
-async def process_pdfs(config, pdf_directory, data_directory, repeats):
+async def process_pdfs(config, pdf_directory, data_directory, repeats, force):
     """Process PDFs with both sync and async functions"""
     for candidate in config.keys():
         print(f"Starting conversion using {candidate} with kwargs: {config[candidate]['kwargs']}")
@@ -52,10 +52,21 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
         kwargs = config[candidate]["kwargs"]
         is_async = asyncio.iscoroutinefunction(method)
 
-        for pdf_path in tqdm(glob.glob(os.path.join(pdf_directory, "*.pdf")), desc=candidate):
+        all_pdfs = glob.glob(os.path.join(pdf_directory, "*.pdf"))
+        all_pdfs.sort()
+
+        for pdf_path in tqdm(all_pdfs, desc=candidate):
             base_name = os.path.basename(pdf_path).replace(".pdf", "")
 
             for i in range(1, repeats + 1):
+                output_filename = f"{base_name}_{i}.md"
+                output_path = os.path.join(candidate_output_dir, output_filename)
+
+                if os.path.exists(output_path) and not force:
+                    print(f"Skipping {base_name}_{i} for {candidate}, file already exists")
+                    print("Rerun with --force flag to force regeneration")
+                    continue
+
                 try:
                     if is_async:
                         # Run async function
@@ -70,8 +81,6 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
                     print(f"Warning, did not get output for {base_name}_{i}")
                     continue
 
-                output_filename = f"{base_name}_{i}.md"
-                output_path = os.path.join(candidate_output_dir, output_filename)
                 with open(output_path, "w") as out_f:
                     out_f.write(markdown)
 
@@ -86,6 +95,8 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
         "Use 'name=folder_name' to specify a custom output folder name.",
     )
     parser.add_argument("--repeats", type=int, default=1, help="Number of times to repeat the conversion for each PDF.")
+    parser.add_argument("--dir", type=str, default=os.path.join(os.path.dirname(__file__), "sample_data"), help="Path to the data folder in which to save outputs, pdfs should be in /pdfs folder within it.")
+    parser.add_argument("--force", action="store_true", default=False, help="Force regenerating of output files, even if they already exist")
     args = parser.parse_args()
 
     # Mapping of method names to a tuple: (module path, function name)
@@ -109,8 +120,8 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
         function = getattr(module, function_name)
         config[method_name] = {"method": function, "kwargs": extra_kwargs, "folder_name": folder_name}
 
-    data_directory = os.path.join(os.path.dirname(__file__), "mining_data")
+    data_directory = args.dir
     pdf_directory = os.path.join(data_directory, "pdfs")
 
     # Run the async process function
-    asyncio.run(process_pdfs(config, pdf_directory, data_directory, args.repeats))
+    asyncio.run(process_pdfs(config, pdf_directory, data_directory, args.repeats, args.force))
diff --git a/olmocr/bench/miners/mine_diffs.py b/olmocr/bench/miners/mine_diffs.py
@@ -119,7 +119,7 @@ def compare_votes_for_file(base_pdf_file: str, base_pdf_page: int, base_text: st
                     best_candidate = c_sentence  # Keep original capitalization for output
 
             # Append the candidate if it passes the similarity threshold (e.g., 0.7)
-            if best_ratio > 0.7 and best_candidate is not None:
+            if best_ratio > 0.5 and best_candidate is not None:
                 votes.append(best_candidate.strip())
 
         # Only consider variants that differ when compared case-insensitively
@@ -191,13 +191,6 @@ def main():
 
     # Collect all .md files from the base and compare folders
     base_files = [f for f in os.listdir(base_path) if f.endswith(".md")]
-    compare_files = [f for f in os.listdir(compare_path) if f.endswith(".md")]
-
-    # Read all candidate texts at once
-    candidate_texts = []
-    for cf in compare_files:
-        with open(os.path.join(compare_path, cf), "r", encoding="utf-8") as f:
-            candidate_texts.append(f.read())
 
     all_tests = []
 
@@ -207,6 +200,17 @@ def main():
         with open(base_file_path, "r", encoding="utf-8") as f:
             base_text = f.read()
 
+        compare_files = [f for f in os.listdir(compare_path) if f.endswith(".md") and re.sub(r"_\d+\.md$", "", f) == re.sub(r"_\d+\.md$", "", bf)]
+
+        if not compare_files:
+            print(f"skipping {bf} nothing to compare against")
+
+        # Read all candidate texts at once
+        candidate_texts = []
+        for cf in compare_files:
+            with open(os.path.join(compare_path, cf), "r", encoding="utf-8") as f:
+                candidate_texts.append(f.read())
+
         base_pdf_file = get_pdf_from_md(base_file_path)
         base_pdf_page = 1
         print(f"Results for base file: {bf}")

diff --git a/olmocr/bench/sample_data/chatgpt/earnings_1.md b/olmocr/bench/sample_data/chatgpt/earnings_1.md
@@ -0,0 +1,33 @@
+Recently Issued Accounting Pronouncements
+
+Recently Adopted Accounting Pronouncement
+
+In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
+
+Recent Accounting Pronouncements Not Yet Adopted
+
+In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+Note 2 - Business Combination
+
+Termination of the Arm Share Purchase Agreement
+
+In February 2022, NVIDIA and SoftBank Group Corp., or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
+
+Note 3 - Stock-Based Compensation
+
+Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
+
+Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
+
+| Year Ended          | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
+|---------------------|-------------|-------------|-------------|
+|                     | (In millions) |             |             |
+| Cost of revenue     | $178        | $141        | $138        |
+| Research and development | $3,423      | $2,532      | $1,892      |
+| Sales, general and administrative | $1,136      | $876        | $680        |
+| Total               | $4,737      | $3,549      | $2,710      |
+
+Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.