Skip to content

Commit

Permalink
Adding basic table relative tests
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Mar 4, 2025
1 parent 76476f9 commit 748fd62
Show file tree
Hide file tree
Showing 5 changed files with 290 additions and 34 deletions.
2 changes: 1 addition & 1 deletion olmocr/bench/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def evaluate_candidate(
if test_avg < 1.0:
test_failures.append(
f"Test {test.id} on {md_base} average pass ratio: {test_avg:.3f} ({repeat_passes}/{num_repeats} repeats passed). "
f"Example explanation: {explanations[0] if explanations else 'No explanation'}"
f"Ex: {explanations[0] if explanations else 'No explanation'}"
)
test_type_breakdown[test_type].append(test_avg)

Expand Down
51 changes: 31 additions & 20 deletions olmocr/bench/sample_data/dataset.jsonl
Original file line number Diff line number Diff line change
@@ -1,29 +1,40 @@
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_00", "type": "present", "text": "Corporate social responsibility and the tobacco industry: hope or hype?", "threshold": 0.99}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_01", "type": "present", "text": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths of some 754 600 smokers, and Philip Morris some 803 600 smokers.", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_02", "type": "present", "text": "The term \"corporate social responsibility\" is in vogue at the moment but as a concept it is vague and means different things to different people.", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_03", "type": "present", "text": "Over the past three decades increasing pressure from non-governmental", "threshold": 1.0}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_04", "type": "absent", "text": "Downloaded from http://tobaccocontrol.bmj.com/", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_00", "type": "present", "text": "Corporate social responsibility and the tobacco industry: hope or hype?"}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_01", "type": "present", "text": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths of some 754 600 smokers, and Philip Morris some 803 600 smokers."}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_02", "type": "present", "text": "The term \"corporate social responsibility\" is in vogue at the moment but as a concept it is vague and means different things to different people.", "max_diffs": 2}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_03", "type": "present", "text": "Over the past three decades increasing pressure from non-governmental"}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_04", "type": "absent", "text": "Downloaded from http://tobaccocontrol.bmj.com/"}

{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_10", "type": "order", "before": "Corporate social responsibility and the tobacco industry: hope or hype?", "after": "The unprecedented expansion of power and influence of TNCs over the past three decades has accelerated global trade and development, but also environmental damage and abuses of", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_11", "type": "order", "before": "It now looks like that with vigilance", "after": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_11", "type": "order", "before": "It now looks like that with vigilance", "after": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths", "max_diffs": 2}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_12", "type": "order", "before": "Corporate social responsibility (CSR) emerged from a realisation among transnational corporations", "after": " perspective on its own behaviour; and reflects on whether marketing tobacco is antithetical to social responsibility.", "threshold": 0.95}

{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_00", "type": "present", "text": "Table 4: Baseline model performance on each of the three scoring metrics", "threshold": 1.0}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_01", "type": "present", "text": "Table 5: Baseline model performance on each of the three scoring metrics", "threshold": 1.0}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_02", "type": "present", "text": "We use the GPT-4O model for all our agents due to its higher performance and lower cost compared to other models. For space we provide", "threshold": 0.99}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_00", "type": "present", "text": "Table 4: Baseline model performance on each of the three scoring metrics"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_01", "type": "present", "text": "Table 5: Baseline model performance on each of the three scoring metrics"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_02", "type": "present", "text": "We use the GPT-4O model for all our agents due to its higher performance and lower cost compared to other models. For space we provide"}

{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_00", "type": "present", "threshold": 1, "checked": "verified", "text": "The-Stack-V2"}
{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_01", "type": "present", "threshold": 1, "checked": "verified", "text": "SE, whatever we've scraped"}
{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_02", "type": "present", "threshold": 1, "checked": "verified", "text": "HQ DCLM"}
{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_00", "type": "present", "checked": "verified", "text": "The-Stack-V2"}
{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_01", "type": "present", "checked": "verified", "text": "SE, whatever we've scraped"}
{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_02", "type": "present", "checked": "verified", "text": "HQ DCLM"}

{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_00", "type": "present", "threshold": 1, "checked": "verified", "text": "January 10th 1864."}
{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_01", "type": "present", "threshold": 1, "checked": "verified", "text": "Major General Hitchcock, Commissioner of Exchanges, is authorized and directed to offer Brigadier General Trimble, now a prisoner of war in Fort McHenry, in exchange for Major White, who is held as a prisoner at Richmond."}
{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_03", "type": "present", "threshold": 1, "checked": "verified", "text": "He is also directed to send forward the offer of exchange by Henry M. Warfield, Esq. of Baltimore, under a flag of truce, and give him a pass to City Point."}
{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_00", "type": "present", "checked": "verified", "text": "January 10th 1864."}
{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_01", "type": "present", "checked": "verified", "text": "Major General Hitchcock, Commissioner of Exchanges, is authorized and directed to offer Brigadier General Trimble, now a prisoner of war in Fort McHenry, in exchange for Major White, who is held as a prisoner at Richmond."}
{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_03", "type": "present", "checked": "verified", "text": "He is also directed to send forward the offer of exchange by Henry M. Warfield, Esq. of Baltimore, under a flag of truce, and give him a pass to City Point."}

{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_02", "type": "present", "threshold": 1, "checked": "verified", "text": "Use the graph of the position function to determine the time intervals when the velocity is positive, negative, or zero."}
{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_03", "type": "present", "threshold": 1, "checked": "verified", "text": "Use the graph of the velocity function to determine the time intervals when the acceleration is positive, negative, or zero."}
{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_02", "type": "present", "checked": "verified", "text": "Use the graph of the position function to determine the time intervals when the velocity is positive, negative, or zero."}
{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_03", "type": "present", "checked": "verified", "text": "Use the graph of the velocity function to determine the time intervals when the acceleration is positive, negative, or zero."}

{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_01", "type": "present", "threshold": 1, "checked": "verified", "text": "This report first provides the context and development of CSR; then, from internal company documents, examines how PM came to its own version."}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_02", "type": "present", "threshold": 1, "checked": "verified", "text": "This paper examines whether a tobacco company espousing CSR should be judged simply as a corporate entity along standards of business ethics, or as an irretrievably negative force in the realm of public health, thereby rendering CSR an oxymoron."}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_01", "type": "present", "checked": "verified", "text": "This report first provides the context and development of CSR; then, from internal company documents, examines how PM came to its own version."}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_02", "type": "present", "checked": "verified", "text": "This paper examines whether a tobacco company espousing CSR should be judged simply as a corporate entity along standards of business ethics, or as an irretrievably negative force in the realm of public health, thereby rendering CSR an oxymoron."}

{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_minediff_00", "type": "present", "checked": "verified", "text": "Table 1 Composition of the pretraining data for OLMo 2."}

{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table00", "type": "table", "cell": "Type"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table00", "type": "table", "cell": "3.32T", "left": "3.71T"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table00", "type": "table", "cell": "3.32T", "right": "21.32T"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table00", "type": "table", "cell": "11.8B", "up": "12.2B"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table00", "type": "table", "cell": "11.8B", "down": "3.7B"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table00", "type": "table", "cell": "3.32T", "top_heading": "Words"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table00", "type": "table", "cell": "arXiv", "top_heading": "Source"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table00", "type": "table", "cell": "47.2B", "top_heading": "Bytes"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table00", "type": "table", "cell": "Math proofs code", "left_heading": "Algebraic Stack"}

{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_minediff_00", "type": "present", "threshold": 1, "checked": "verified", "text": "Table 1 Composition of the pretraining data for OLMo 2."}
2 changes: 1 addition & 1 deletion olmocr/bench/synth/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

# Simple configuration
CONFIG = {
"input_file": os.path.join(os.path.dirname(__file__), "templates", "bookpage.js"), # React component file
"input_file": os.path.join(os.path.dirname(__file__), "templates", "listpage.js"), # React component file
"output_pdf": "book-page.pdf", # Output PDF filename
"temp_html": "temp-render.html", # Temporary HTML file
"wait_time": 1500, # Time to wait for rendering (ms)
Expand Down
83 changes: 83 additions & 0 deletions olmocr/bench/synth/templates/listpage.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
//import React from 'react';

const PermitGuidelinesTemplate = () => {
// Sample data - you can replace these with your own
const guidelineItems = [
{
number: 'iii.',
content: 'Not rely on personal preference or opinion, or regional interpretation of statute, regulation or guidance that is inconsistent with the Department\'s statewide interpretation. Staff should confer with the appropriate Bureau Director as necessary.'
},
{
number: 'iv.',
content: 'Process technically adequate and scientifically sound applications for final approval to minimize elapsed time in accordance with the Permit Decision Guarantee.'
},
{
number: 'v.',
content: 'Where the Application Manager determines that the technical information submitted with the application does not meet technical guidance or standards published by the Department, the application must provide the scientific or engineering basis to support the application. Note that deviations from technical guidance can generally be approved, by the appropriate section chief and manager, when warranted, provided acceptable justification has been submitted. Minor deficiencies that can be easily corrected should be addressed through a telephone call with the applicant and consultant, and may negate the need for a deficiency letter. The Program Manager or District Manager will be responsible for making that decision.'
},
{
number: 'vi.',
content: 'If an application fails to provide the technical information necessary to document that applicable regulatory and statutory requirements will be achieved, it is technically deficient and the Application Manager will prepare a technical deficiency letter. Again, all deficiencies noted must cite the statutory or regulatory obligation that the application has failed to meet and the Section Chief and the Program Manager will routinely review these letters. For District Oil and Gas Offices and District Mining Offices the Permits Chief and the Manager will review the letters.'
},
{
number: 'vii.',
content: 'Applicant responses that do not make the application technically adequate within the established response timeframe will be subject to the Elevated Review Process below. Applications that are made technically adequate within the established response timeframe will proceed to processing for final action.'
}
];

// Footnote data
const footnote = {
number: '2',
content: 'More technically complex projects and applications may receive additional deficiency letters as appropriate prior to a decision point. This exception will not void inclusion in the Permit Decision Guarantee and will follow program specific guidance that is developed. The more technically complex projects and applications are noted with an asterisk ("*") in Appendix A.'
};

// Document info
const documentInfo = "021-2100-001 / November 2, 2012 / Page 11";

// Special note about technical deficiency letter
const technicalDeficiencyNote = {
prefix: 'One',
superscript: '2',
content: ' technical deficiency letter will be sent. Each deficiency cited must note the statute, regulation or technical guidance provision. Technical guidance provides a means to compliance, but may not be used or cited when issuing a permit denial. The letter will state, as necessary, that the Permit Decision Guarantee is no longer applicable and offer the applicant an opportunity to meet and discuss the deficiencies. The letter will include a deadline for submission of the deficient information.'
};

return (
<div className="bg-white p-8 max-w-4xl mx-auto font-serif text-black">
<div className="mb-8">
{guidelineItems.map((item, index) => (
<div key={index} className="mb-6 flex">
<div className="w-12 flex-shrink-0 font-bold">{item.number}</div>
<div className="flex-grow">{item.content}</div>
</div>
))}

{/* Technical deficiency letter note */}
<div className="mb-6 ml-12">
<p>
{technicalDeficiencyNote.prefix}
<sup>{technicalDeficiencyNote.superscript}</sup>
{technicalDeficiencyNote.content}
</p>
</div>
</div>

{/* Horizontal line */}
<div className="border-t border-gray-400 my-6"></div>

{/* Footnote section */}
<div className="text-sm">
<p>
<sup>{footnote.number}</sup> {footnote.content}
</p>
</div>

{/* Document info */}
<div className="text-center mt-6 text-sm">
{documentInfo}
</div>
</div>
);
};

//export default PermitGuidelinesTemplate;
window.BookPageTemplate = PermitGuidelinesTemplate;
Loading

0 comments on commit 748fd62

Please sign in to comment.