Skip to content

Commit

Permalink
Conversion fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Mar 5, 2025
1 parent fb0a729 commit 50e55f4
Show file tree
Hide file tree
Showing 74 changed files with 1,107 additions and 121 deletions.
4 changes: 2 additions & 2 deletions olmocr/bench/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,12 +155,12 @@ async def process_with_semaphore(task):
parser.add_argument("--repeats", type=int, default=1, help="Number of times to repeat the conversion for each PDF.")
parser.add_argument("--dir", type=str, default=os.path.join(os.path.dirname(__file__), "sample_data"), help="Path to the data folder in which to save outputs, pdfs should be in /pdfs folder within it.")
parser.add_argument("--force", action="store_true", default=False, help="Force regenerating of output files, even if they already exist")
parser.add_argument("--parallel", type=int, default=10, help="Maximum number of concurrent tasks")
parser.add_argument("--parallel", type=int, default=1, help="Maximum number of concurrent tasks")
args = parser.parse_args()

# Mapping of method names to a tuple: (module path, function name)
available_methods = {
"olmocr": ("olmocr.bench.runners.run_olmocr", "run_olmocr"),
"olmocr_pipeline": ("olmocr.bench.runners.run_olmocr_pipeline", "run_olmocr_pipeline"),
"gotocr": ("olmocr.bench.runners.run_gotocr", "run_gotocr"),
"marker": ("olmocr.bench.runners.run_marker", "run_marker"),
"mineru": ("olmocr.bench.runners.run_mineru", "run_mineru"),
Expand Down
99 changes: 0 additions & 99 deletions olmocr/bench/runners/run_olmocr.py

This file was deleted.

104 changes: 104 additions & 0 deletions olmocr/bench/runners/run_olmocr_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import asyncio
import logging
from dataclasses import dataclass
from typing import Optional

# Import necessary components from olmocr
from olmocr.pipeline import (
MetricsKeeper,
PageResult,
WorkerTracker,
process_page,
sglang_server_host,
sglang_server_ready
)

# Configure root logging when this module is imported, then create the
# named logger used by everything in this runner.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("olmocr_runner")


# Basic configuration
@dataclass
class Args:
    """Default settings this benchmark runner hands to the olmocr pipeline.

    An instance is passed as ``args`` to ``sglang_server_host`` and
    ``process_page``. The per-field notes below are inferred from the field
    names only; confirm exact semantics against ``olmocr.pipeline``.
    """

    # Identifier of the model to serve (presumably a Hugging Face repo id).
    model: str = "allenai/olmOCR-7B-0225-preview"
    # Chat template name for the served model -- TODO confirm who consumes it.
    model_chat_template: str = "qwen2-vl"
    # Presumably the maximum context length, in tokens -- verify.
    model_max_context: int = 8192
    # Presumably the longest side, in pixels, of the rendered page image -- verify.
    target_longest_image_dim: int = 1024
    # Presumably the maximum length of the anchor text for the prompt -- verify.
    target_anchor_text_len: int = 6000
    # Presumably how many times a failing page is retried -- verify.
    max_page_retries: int = 8
    # Presumably the tolerated fraction of failed pages -- verify.
    max_page_error_rate: float = 0.004


# Strong references to background tasks started by this module. The event
# loop only keeps weak references to tasks, so a task that nobody else
# references may be garbage-collected before it finishes.
_background_tasks = set()


async def run_olmocr_pipeline(pdf_path: str, page_num: int = 1) -> Optional[str]:
    """
    Process a single page of a PDF using the official olmocr pipeline's process_page function.

    If no sglang server answers the readiness check within 5 seconds, a new
    one is started as a background task and left running after this call
    returns so later calls can reuse it.

    Args:
        pdf_path: Path to the PDF file
        page_num: Page number to process (1-indexed)
    Returns:
        The extracted text from the page or None if processing failed
    """
    # Lazily create the module-level metrics/tracker objects on first use.
    # NOTE(review): these are globals of *this* module; confirm that
    # olmocr.pipeline actually looks them up here rather than in its own
    # module namespace.
    global metrics, tracker
    if globals().get("metrics") is None:
        metrics = MetricsKeeper(window=60 * 5)
    if globals().get("tracker") is None:
        tracker = WorkerTracker()

    args = Args()
    semaphore = asyncio.Semaphore(1)
    worker_id = 0  # Using 0 as default worker ID

    # Ensure server is running
    try:
        await asyncio.wait_for(sglang_server_ready(), timeout=5)
        logger.info("Using existing sglang server")
    except Exception:
        # Covers the wait_for timeout as well as any error raised by the
        # readiness probe itself; in both cases we try to host a server.
        logger.info("Starting new sglang server")
        server_task = asyncio.create_task(sglang_server_host(args, semaphore))
        # Keep the task alive after this coroutine returns; the server is
        # intentionally left running for potential reuse.
        _background_tasks.add(server_task)
        server_task.add_done_callback(_background_tasks.discard)
        await sglang_server_ready()

    try:
        # Process the page using the pipeline's process_page function
        # Note: process_page expects both original path and local path
        # In our case, we're using the same path for both
        page_result: PageResult = await process_page(
            args=args,
            worker_id=worker_id,
            pdf_orig_path=pdf_path,
            pdf_local_path=pdf_path,
            page_num=page_num,
        )

        # Return the natural text from the response
        if page_result and page_result.response:
            return page_result.response.natural_text
        return None

    except Exception as e:
        # Best-effort runner: report the failure and signal it with None
        # instead of propagating to the benchmark driver.
        logger.error("Error processing page: %s - %s", type(e).__name__, e)
        return None


async def main():
    """Example entry point: run the pipeline on one page and print a preview.

    ``pdf_path`` is a placeholder and must be replaced with a real file path
    before running this module as a script.
    """
    # Example usage
    pdf_path = "your_pdf_path.pdf"
    page_num = 1

    result = await run_olmocr_pipeline(pdf_path, page_num)
    # The runner returns None on failure; an empty string is still a
    # successful (if empty) extraction, so test for None explicitly.
    if result is not None:
        print(f"Extracted text: {result[:200]}...")  # Print first 200 chars
    else:
        print("Failed to extract text from the page")


if __name__ == "__main__":
    asyncio.run(main())
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
Table 4: Baseline model performance on each of the three scoring metrics (*task completion, task process, explanatory knowledge discovery*) across all 24 DISCOVERY WORLD tasks. Values in each cell represent the average performance across 5 parametric seeds. *Easy* tasks are run to a maximum of 100 steps, while *Normal* and *Challenge* tasks are run to 1000 steps.

| # | Topic | Task | ReACT Procedure Completion Knowledge | Plan+Execute Procedure Completion Knowledge | Hypothesizer Procedure Completion Knowledge |
|---|---------------------|--------------------|--------------------------------------|-------------------------------------------|------------------------------------------|
| | | | 0.87 0.20 0.20 | 0.89 0.00 0.00 | 0.90 0.40 1.00 |
| 1 | Proteomics | Simple Clustering | 0.87 0.20 0.20 | 0.89 0.00 0.00 | 0.90 0.40 1.00 |
| 2 | | Clustering (2D) | 0.88 0.40 0.40 | 0.68 0.20 0.00 | 0.93 0.40 0.40 |
| 3 | | Clustering (3D) | 0.88 0.40 0.60 | 0.55 0.20 0.00 | 0.93 0.40 0.60 |
| 4 | Chemistry | Exploring Combinations and Hill Climbing | 0.87 1.00 1.00 | 0.70 0.60 0.40 | 0.90 0.00 0.40 |
| 5 | | Single substances | 0.82 0.00 0.00 | 0.87 0.40 0.00 | 0.93 0.60 0.40 |
| 6 | | Mix of 3 substances | 0.82 0.00 0.00 | 0.87 0.40 0.00 | 0.93 0.60 0.40 |
| 7 | Archaeology | Single instrument | 0.27 0.60 0.00 | 0.33 0.20 0.00 | 0.60 0.20 0.50 |
| 8 | | Instrument Use | 0.72 0.40 0.30 | 0.74 0.00 0.00 | 0.64 0.40 0.40 |
| 9 | | Correlation | 0.46 0.20 0.00 | 0.46 0.00 0.05 | 0.55 0.20 0.05 |
| 10 | Reactor Lab | Regression | 0.42 0.00 0.40 | 0.44 0.00 0.10 | 0.38 0.00 0.20 |
| 11 | | Slope only | 0.44 0.00 0.20 | 0.49 0.00 0.00 | 0.51 0.00 0.00 |
| 12 | | Quadratic regression | 0.43 0.00 0.20 | 0.39 0.00 0.00 | 0.39 0.00 0.00 |
| 13 | Plant Nutrients | Uncovering systems of rules | 0.80 0.20 0.20 | 0.70 0.20 0.20 | 0.60 0.00 0.00 |
| 14 | | Presence rules | 0.91 0.60 0.00 | 0.84 0.40 0.00 | 0.56 0.00 0.00 |
| 15 | | Logical Rules | 0.89 0.40 0.00 | 0.73 0.40 0.00 | 0.62 0.00 0.00 |
| 16 | Space Sick | Open-ended discovery | 0.78 0.60 0.00 | 0.68 0.40 0.10 | 0.80 1.00 0.60 |
| 17 | | Single instrument | 0.58 0.00 0.13 | 0.45 0.00 0.13 | 0.16 0.00 0.33 |
| 18 | | Multiple instruments | 0.55 0.00 0.00 | 0.26 0.00 0.00 | 0.20 0.00 0.00 |
| 19 | Rocket Science | Novel instruments | 0.53 0.00 0.00 | 0.34 0.00 0.00 | 0.17 0.00 0.00 |
| 20 | | Look-up variables | 0.51 0.00 0.05 | 0.51 0.00 0.00 | 0.11 0.00 0.00 |
| 21 | | Measure 5 variables | 0.43 0.00 0.00 | 0.34 0.00 0.00 | 0.22 0.00 0.03 |
| 22 | Translation | Rosetta-stone style linguistic discovery of alien language | 0.40 0.40 0.20 | 0.30 0.00 0.00 | 0.20 0.20 0.00 |
| 23 | | Single noun | 0.20 0.00 0.00 | 0.68 0.40 0.00 | 0.84 0.40 0.00 |
| 24 | | Noun and verb | 0.49 0.00 0.00 | 0.55 0.20 0.05 | 0.15 0.00 0.00 |
| | Average (Easy) | | 0.59 0.38 0.25 | 0.56 0.18 0.11 | 0.56 0.28 0.34 |
| | Average (Normal) | | 0.63 0.18 0.14 | 0.64 0.18 0.02 | 0.58 0.23 0.19 |
| | Average (Challenge) | | 0.63 0.18 0.10 | 0.50 0.15 0.01 | 0.49 0.08 0.08 |

Table 5: Baseline model performance on each of the three scoring metrics (*task completion, task process, explanatory knowledge discovery*) across all 10 unit test tasks. Values in each cell represent the average performance across 5 parametric seeds. Unit test tasks are run to a maximum of 100 steps.

| # | Unit Test Topic | ReACT Procedure Completion | Plan+Execute Procedure Completion | Hypothesizer Procedure Completion |
|---|-----------------|----------------------------|-----------------------------------|-----------------------------------|
| 25 | Multi-turn dialog with an agent | 1.00 1.00 | 1.00 1.00 | 1.00 1.00 |
| 26 | Measure an object with an instrument | 0.87 0.60 | 0.73 0.40 | 1.00 1.00 |
| 27 | Pick-and-place object | 0.90 0.80 | 0.80 0.60 | 1.00 1.00 |
| 28 | React Discovery Feed posts | 1.00 1.00 | 0.90 0.80 | 1.00 1.00 |
| 30 | Move through doors | 0.58 0.20 | 0.25 0.00 | 0.30 0.00 |
| 31 | Using keys with doors | 0.69 0.20 | 0.54 0.00 | 0.69 0.00 |
| 32 | Navigate to a specific room in a house | 0.20 0.20 | 0.20 0.00 | 0.20 0.20 |
| 33 | Search an environment for an object | 0.80 0.80 | 0.60 0.60 | 1.00 1.00 |
| 34 | Interact with a moving agent | 0.60 0.20 | 0.53 0.00 | 0.53 0.20 |
| | Average (Unit Tests) | 0.76 0.60 | 0.66 0.44 | 0.77 0.64 |

4.2 Baseline Agent Models

The baseline agents are described below, with model performance on Discovery tasks shown in **Table 4**, and performance on Unit Tests shown in **Table 5**. We use the GPT-4o model for all our agents due to its higher performance and lower cost compared to other models. For space we provide
Loading

0 comments on commit 50e55f4

Please sign in to comment.