Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/allenai/olmocr
Browse files: browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Mar 5, 2025
2 parents abeaf02 + 1545a6d commit dbbe6ce
Show file tree
Hide file tree
Showing 20 changed files with 1,127 additions and 53 deletions.
3 changes: 1 addition & 2 deletions olmocr/bench/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def evaluate_candidate(
if test_avg < 1.0:
test_failures.append(
f"Test {test.id} on {md_base} average pass ratio: {test_avg:.3f} ({repeat_passes}/{num_repeats} repeats passed). "
f"Example explanation: {explanations[0] if explanations else 'No explanation'}"
f"Ex: {explanations[0] if explanations else 'No explanation'}"
)
test_type_breakdown[test_type].append(test_avg)

Expand Down Expand Up @@ -183,7 +183,6 @@ def main():
else:
status = f"{overall_score * 100:0.1f}%"
print(f"{candidate_name:20s} : Average Score: {overall_score * 100:0.1f}% over {total_tests:3d} tests - {status}")
print(" Breakdown by test type:")
for ttype, scores in test_type_breakdown.items():
if scores:
avg = sum(scores) / len(scores) * 100
Expand Down
23 changes: 17 additions & 6 deletions olmocr/bench/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def parse_method_arg(method_arg):
return name, kwargs, folder_name


async def process_pdfs(config, pdf_directory, data_directory, repeats):
async def process_pdfs(config, pdf_directory, data_directory, repeats, force):
"""Process PDFs with both sync and async functions"""
for candidate in config.keys():
print(f"Starting conversion using {candidate} with kwargs: {config[candidate]['kwargs']}")
Expand All @@ -52,10 +52,21 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
kwargs = config[candidate]["kwargs"]
is_async = asyncio.iscoroutinefunction(method)

for pdf_path in tqdm(glob.glob(os.path.join(pdf_directory, "*.pdf")), desc=candidate):
all_pdfs = glob.glob(os.path.join(pdf_directory, "*.pdf"))
all_pdfs.sort()

for pdf_path in tqdm(all_pdfs, desc=candidate):
base_name = os.path.basename(pdf_path).replace(".pdf", "")

for i in range(1, repeats + 1):
output_filename = f"{base_name}_{i}.md"
output_path = os.path.join(candidate_output_dir, output_filename)

if os.path.exists(output_path) and not force:
print(f"Skipping {base_name}_{i} for {candidate}, file already exists")
print("Rerun with --force flag to force regeneration")
continue

try:
if is_async:
# Run async function
Expand All @@ -70,8 +81,6 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
print(f"Warning, did not get output for {base_name}_{i}")
continue

output_filename = f"{base_name}_{i}.md"
output_path = os.path.join(candidate_output_dir, output_filename)
with open(output_path, "w") as out_f:
out_f.write(markdown)

Expand All @@ -86,6 +95,8 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
"Use 'name=folder_name' to specify a custom output folder name.",
)
parser.add_argument("--repeats", type=int, default=1, help="Number of times to repeat the conversion for each PDF.")
parser.add_argument("--dir", type=str, default=os.path.join(os.path.dirname(__file__), "sample_data"), help="Path to the data folder in which to save outputs, pdfs should be in /pdfs folder within it.")
parser.add_argument("--force", action="store_true", default=False, help="Force regenerating of output files, even if they already exist")
args = parser.parse_args()

# Mapping of method names to a tuple: (module path, function name)
Expand All @@ -109,8 +120,8 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
function = getattr(module, function_name)
config[method_name] = {"method": function, "kwargs": extra_kwargs, "folder_name": folder_name}

data_directory = os.path.join(os.path.dirname(__file__), "mining_data")
data_directory = args.dir
pdf_directory = os.path.join(data_directory, "pdfs")

# Run the async process function
asyncio.run(process_pdfs(config, pdf_directory, data_directory, args.repeats))
asyncio.run(process_pdfs(config, pdf_directory, data_directory, args.repeats, args.force))
20 changes: 12 additions & 8 deletions olmocr/bench/miners/mine_diffs.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def compare_votes_for_file(base_pdf_file: str, base_pdf_page: int, base_text: st
best_candidate = c_sentence # Keep original capitalization for output

# Append the candidate if it passes the similarity threshold (e.g., 0.7)
if best_ratio > 0.7 and best_candidate is not None:
if best_ratio > 0.5 and best_candidate is not None:
votes.append(best_candidate.strip())

# Only consider variants that differ when compared case-insensitively
Expand Down Expand Up @@ -191,13 +191,6 @@ def main():

# Collect all .md files from the base and compare folders
base_files = [f for f in os.listdir(base_path) if f.endswith(".md")]
compare_files = [f for f in os.listdir(compare_path) if f.endswith(".md")]

# Read all candidate texts at once
candidate_texts = []
for cf in compare_files:
with open(os.path.join(compare_path, cf), "r", encoding="utf-8") as f:
candidate_texts.append(f.read())

all_tests = []

Expand All @@ -207,6 +200,17 @@ def main():
with open(base_file_path, "r", encoding="utf-8") as f:
base_text = f.read()

compare_files = [f for f in os.listdir(compare_path) if f.endswith(".md") and re.sub(r"_\d+\.md$", "", f) == re.sub(r"_\d+\.md$", "", bf)]

if not compare_files:
print(f"skipping {bf} nothing to compare against")

# Read all candidate texts at once
candidate_texts = []
for cf in compare_files:
with open(os.path.join(compare_path, cf), "r", encoding="utf-8") as f:
candidate_texts.append(f.read())

base_pdf_file = get_pdf_from_md(base_file_path)
base_pdf_page = 1
print(f"Results for base file: {bf}")
Expand Down
33 changes: 33 additions & 0 deletions olmocr/bench/sample_data/chatgpt/earnings_1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
Recently Issued Accounting Pronouncements

Recently Adopted Accounting Pronouncement

In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.

Recent Accounting Pronouncements Not Yet Adopted

In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.

In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.

Note 2 - Business Combination

Termination of the Arm Share Purchase Agreement

In February 2022, NVIDIA and SoftBank Group Corp., or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.

Note 3 - Stock-Based Compensation

Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.

Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:

| Year Ended | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
|---------------------|-------------|-------------|-------------|
| | (In millions) | | |
| Cost of revenue | $178 | $141 | $138 |
| Research and development | $3,423 | $2,532 | $1,892 |
| Sales, general and administrative | $1,136 | $876 | $680 |
| Total | $4,737 | $3,549 | $2,710 |

Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
Loading

0 comments on commit dbbe6ce

Please sign in to comment.