diff --git a/olmocr/bench/benchmark.py b/olmocr/bench/benchmark.py
index ed3408c..3fce2b7 100644
--- a/olmocr/bench/benchmark.py
+++ b/olmocr/bench/benchmark.py
@@ -96,7 +96,7 @@ def evaluate_candidate(
if test_avg < 1.0:
test_failures.append(
f"Test {test.id} on {md_base} average pass ratio: {test_avg:.3f} ({repeat_passes}/{num_repeats} repeats passed). "
- f"Example explanation: {explanations[0] if explanations else 'No explanation'}"
+ f"Ex: {explanations[0] if explanations else 'No explanation'}"
)
test_type_breakdown[test_type].append(test_avg)
@@ -183,7 +183,6 @@ def main():
else:
status = f"{overall_score * 100:0.1f}%"
print(f"{candidate_name:20s} : Average Score: {overall_score * 100:0.1f}% over {total_tests:3d} tests - {status}")
- print(" Breakdown by test type:")
for ttype, scores in test_type_breakdown.items():
if scores:
avg = sum(scores) / len(scores) * 100
diff --git a/olmocr/bench/convert.py b/olmocr/bench/convert.py
index 72d4092..ba6174a 100644
--- a/olmocr/bench/convert.py
+++ b/olmocr/bench/convert.py
@@ -40,7 +40,7 @@ def parse_method_arg(method_arg):
return name, kwargs, folder_name
-async def process_pdfs(config, pdf_directory, data_directory, repeats):
+async def process_pdfs(config, pdf_directory, data_directory, repeats, force):
"""Process PDFs with both sync and async functions"""
for candidate in config.keys():
print(f"Starting conversion using {candidate} with kwargs: {config[candidate]['kwargs']}")
@@ -52,10 +52,21 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
kwargs = config[candidate]["kwargs"]
is_async = asyncio.iscoroutinefunction(method)
- for pdf_path in tqdm(glob.glob(os.path.join(pdf_directory, "*.pdf")), desc=candidate):
+ all_pdfs = glob.glob(os.path.join(pdf_directory, "*.pdf"))
+ all_pdfs.sort()
+
+ for pdf_path in tqdm(all_pdfs, desc=candidate):
base_name = os.path.basename(pdf_path).replace(".pdf", "")
for i in range(1, repeats + 1):
+ output_filename = f"{base_name}_{i}.md"
+ output_path = os.path.join(candidate_output_dir, output_filename)
+
+ if os.path.exists(output_path) and not force:
+ print(f"Skipping {base_name}_{i} for {candidate}, file already exists")
+ print("Rerun with --force flag to force regeneration")
+ continue
+
try:
if is_async:
# Run async function
@@ -70,8 +81,6 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
print(f"Warning, did not get output for {base_name}_{i}")
continue
- output_filename = f"{base_name}_{i}.md"
- output_path = os.path.join(candidate_output_dir, output_filename)
with open(output_path, "w") as out_f:
out_f.write(markdown)
@@ -86,6 +95,8 @@ if __name__ == "__main__":
"Use 'name=folder_name' to specify a custom output folder name.",
)
parser.add_argument("--repeats", type=int, default=1, help="Number of times to repeat the conversion for each PDF.")
+ parser.add_argument("--dir", type=str, default=os.path.join(os.path.dirname(__file__), "sample_data"), help="Path to the data folder in which to save outputs, pdfs should be in /pdfs folder within it.")
+ parser.add_argument("--force", action="store_true", default=False, help="Force regenerating of output files, even if they already exist")
args = parser.parse_args()
# Mapping of method names to a tuple: (module path, function name)
@@ -109,8 +120,8 @@ if __name__ == "__main__":
function = getattr(module, function_name)
config[method_name] = {"method": function, "kwargs": extra_kwargs, "folder_name": folder_name}
- data_directory = os.path.join(os.path.dirname(__file__), "mining_data")
+ data_directory = args.dir
pdf_directory = os.path.join(data_directory, "pdfs")
# Run the async process function
- asyncio.run(process_pdfs(config, pdf_directory, data_directory, args.repeats))
+ asyncio.run(process_pdfs(config, pdf_directory, data_directory, args.repeats, args.force))
diff --git a/olmocr/bench/miners/mine_diffs.py b/olmocr/bench/miners/mine_diffs.py
index aaea6db..e096867 100644
--- a/olmocr/bench/miners/mine_diffs.py
+++ b/olmocr/bench/miners/mine_diffs.py
@@ -119,7 +119,7 @@ def compare_votes_for_file(base_pdf_file: str, base_pdf_page: int, base_text: st
best_candidate = c_sentence # Keep original capitalization for output
# Append the candidate if it passes the similarity threshold (e.g., 0.7)
- if best_ratio > 0.7 and best_candidate is not None:
+ if best_ratio > 0.5 and best_candidate is not None:
votes.append(best_candidate.strip())
# Only consider variants that differ when compared case-insensitively
@@ -191,13 +191,6 @@ def main():
# Collect all .md files from the base and compare folders
base_files = [f for f in os.listdir(base_path) if f.endswith(".md")]
- compare_files = [f for f in os.listdir(compare_path) if f.endswith(".md")]
-
- # Read all candidate texts at once
- candidate_texts = []
- for cf in compare_files:
- with open(os.path.join(compare_path, cf), "r", encoding="utf-8") as f:
- candidate_texts.append(f.read())
all_tests = []
@@ -207,6 +200,17 @@ def main():
with open(base_file_path, "r", encoding="utf-8") as f:
base_text = f.read()
+ compare_files = [f for f in os.listdir(compare_path) if f.endswith(".md") and re.sub(r"_\d+\.md$", "", f) == re.sub(r"_\d+\.md$", "", bf)]
+
+ if not compare_files:
+ print(f"skipping {bf} nothing to compare against")
+
+ # Read only the candidate texts whose file name matches the current base file
+ candidate_texts = []
+ for cf in compare_files:
+ with open(os.path.join(compare_path, cf), "r", encoding="utf-8") as f:
+ candidate_texts.append(f.read())
+
base_pdf_file = get_pdf_from_md(base_file_path)
base_pdf_page = 1
print(f"Results for base file: {bf}")
diff --git a/olmocr/bench/sample_data/chatgpt/earnings_1.md b/olmocr/bench/sample_data/chatgpt/earnings_1.md
new file mode 100644
index 0000000..e41f830
--- /dev/null
+++ b/olmocr/bench/sample_data/chatgpt/earnings_1.md
@@ -0,0 +1,33 @@
+Recently Issued Accounting Pronouncements
+
+Recently Adopted Accounting Pronouncement
+
+In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
+
+Recent Accounting Pronouncements Not Yet Adopted
+
+In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+Note 2 - Business Combination
+
+Termination of the Arm Share Purchase Agreement
+
+In February 2022, NVIDIA and SoftBank Group Corp., or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
+
+Note 3 - Stock-Based Compensation
+
+Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
+
+Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
+
+| Year Ended | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
+|---------------------|-------------|-------------|-------------|
+| | (In millions) | | |
+| Cost of revenue | $178 | $141 | $138 |
+| Research and development | $3,423 | $2,532 | $1,892 |
+| Sales, general and administrative | $1,136 | $876 | $680 |
+| Total | $4,737 | $3,549 | $2,710 |
+
+Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
\ No newline at end of file
diff --git a/olmocr/bench/sample_data/dataset.jsonl b/olmocr/bench/sample_data/dataset.jsonl
index 608ec79..3b4d397 100644
--- a/olmocr/bench/sample_data/dataset.jsonl
+++ b/olmocr/bench/sample_data/dataset.jsonl
@@ -1,29 +1,52 @@
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_00", "type": "present", "text": "Corporate social responsibility and the tobacco industry: hope or hype?", "threshold": 0.99}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_01", "type": "present", "text": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths of some 754 600 smokers, and Philip Morris some 803 600 smokers.", "threshold": 0.95}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_02", "type": "present", "text": "The term \"corporate social responsibility\" is in vogue at the moment but as a concept it is vague and means different things to different people.", "threshold": 0.95}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_03", "type": "present", "text": "Over the past three decades increasing pressure from non-governmental", "threshold": 1.0}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_04", "type": "absent", "text": "Downloaded from http://tobaccocontrol.bmj.com/", "threshold": 0.95}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_00", "type": "present", "text": "Corporate social responsibility and the tobacco industry: hope or hype?"}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_01", "type": "present", "text": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths of some 754 600 smokers, and Philip Morris some 803 600 smokers."}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_02", "type": "present", "text": "The term \"corporate social responsibility\" is in vogue at the moment but as a concept it is vague and means different things to different people.", "max_diffs": 2}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_03", "type": "present", "text": "Over the past three decades increasing pressure from non-governmental"}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_04", "type": "absent", "text": "Downloaded from http://tobaccocontrol.bmj.com/"}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_10", "type": "order", "before": "Corporate social responsibility and the tobacco industry: hope or hype?", "after": "The unprecedented expansion of power and influence of TNCs over the past three decades has accelerated global trade and development, but also environmental damage and abuses of", "threshold": 0.95}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_11", "type": "order", "before": "It now looks like that with vigilance", "after": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths", "threshold": 0.95}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_12", "type": "order", "before": "Corporate social responsibility (CSR) emerged from a realisation among transnational corporations", "after": " perspective on its own behaviour; and reflects on whether marketing tobacco is antithetical to social responsibility.", "threshold": 0.95}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_10", "type": "order", "before": "Corporate social responsibility and the tobacco industry: hope or hype?", "after": "The unprecedented expansion of power and influence of TNCs over the past three decades has accelerated global trade and development, but also environmental damage and abuses of", "max_diffs": 2}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_11", "type": "order", "before": "It now looks like that with vigilance", "after": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths", "max_diffs": 2}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_12", "type": "order", "before": "Corporate social responsibility (CSR) emerged from a realisation among transnational corporations", "after": " perspective on its own behaviour; and reflects on whether marketing tobacco is antithetical to social responsibility.", "max_diffs": 2}
-{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_00", "type": "present", "text": "Table 4: Baseline model performance on each of the three scoring metrics", "threshold": 1.0}
-{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_01", "type": "present", "text": "Table 5: Baseline model performance on each of the three scoring metrics", "threshold": 1.0}
-{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_02", "type": "present", "text": "We use the GPT-4O model for all our agents due to its higher performance and lower cost compared to other models. For space we provide", "threshold": 0.99}
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_00", "type": "present", "text": "Table 4: Baseline model performance on each of the three scoring metrics"}
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_01", "type": "present", "text": "Table 5: Baseline model performance on each of the three scoring metrics"}
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_02", "type": "present", "text": "We use the GPT-4O model for all our agents due to its higher performance and lower cost compared to other models. For space we provide"}
-{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_00", "type": "present", "threshold": 1, "checked": "verified", "text": "The-Stack-V2"}
-{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_01", "type": "present", "threshold": 1, "checked": "verified", "text": "SE, whatever we've scraped"}
-{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_02", "type": "present", "threshold": 1, "checked": "verified", "text": "HQ DCLM"}
+{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_00", "type": "present", "checked": "verified", "text": "The-Stack-V2"}
+{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_01", "type": "present", "checked": "verified", "text": "SE, whatever we've scraped"}
+{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_02", "type": "present", "checked": "verified", "text": "HQ DCLM"}
-{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_00", "type": "present", "threshold": 1, "checked": "verified", "text": "January 10th 1864."}
-{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_01", "type": "present", "threshold": 1, "checked": "verified", "text": "Major General Hitchcock, Commissioner of Exchanges, is authorized and directed to offer Brigadier General Trimble, now a prisoner of war in Fort McHenry, in exchange for Major White, who is held as a prisoner at Richmond."}
-{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_03", "type": "present", "threshold": 1, "checked": "verified", "text": "He is also directed to send forward the offer of exchange by Henry M. Warfield, Esq. of Baltimore, under a flag of truce, and give him a pass to City Point."}
+{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_00", "type": "present", "checked": "verified", "text": "January 10th 1864."}
+{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_01", "type": "present", "checked": "verified", "text": "Major General Hitchcock, Commissioner of Exchanges, is authorized and directed to offer Brigadier General Trimble, now a prisoner of war in Fort McHenry, in exchange for Major White, who is held as a prisoner at Richmond."}
+{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_03", "type": "present", "checked": "verified", "text": "He is also directed to send forward the offer of exchange by Henry M. Warfield, Esq. of Baltimore, under a flag of truce, and give him a pass to City Point."}
+
+{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_02", "type": "present", "checked": "verified", "text": "Use the graph of the position function to determine the time intervals when the velocity is positive, negative, or zero."}
+{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_03", "type": "present", "checked": "verified", "text": "Use the graph of the velocity function to determine the time intervals when the acceleration is positive, negative, or zero."}
+
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_01", "type": "present", "checked": "verified", "text": "This report first provides the context and development of CSR; then, from internal company documents, examines how PM came to its own version."}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_02", "type": "present", "checked": "verified", "text": "This paper examines whether a tobacco company espousing CSR should be judged simply as a corporate entity along standards of business ethics, or as an irretrievably negative force in the realm of public health, thereby rendering CSR an oxymoron."}
+
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_minediff_00", "type": "present", "checked": "verified", "text": "Table 1 Composition of the pretraining data for OLMo 2."}
+
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table00", "type": "table", "cell": "Type"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table01", "type": "table", "cell": "3.32T", "left": "3.71T"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table02", "type": "table", "cell": "3.32T", "right": "21.32T"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table03", "type": "table", "cell": "11.8B", "up": "12.2B"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table04", "type": "table", "cell": "11.8B", "down": "3.7B"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table05", "type": "table", "cell": "3.32T", "top_heading": "Words"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table06", "type": "table", "cell": "arXiv", "top_heading": "Source"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table07", "type": "table", "cell": "47.2B", "top_heading": "Bytes"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table08", "type": "table", "cell": "Math proofs code", "left_heading": "Algebraic Stack"}
+
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Quadratic regression", "left": "Challenge"}
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Instrument Use", "left": "Normal"}
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "0.87", "top_heading": "Procedure"}
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "0.87", "top_heading": "ReACT"}
+
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Pick-and-place object", "left_heading": "27"}
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "0.66", "right": "0.44"}
+
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Interact with a moving agent", "top_heading": "Unit Test Topic"}
-{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_02", "type": "present", "threshold": 1, "checked": "verified", "text": "Use the graph of the position function to determine the time intervals when the velocity is positive, negative, or zero."}
-{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_03", "type": "present", "threshold": 1, "checked": "verified", "text": "Use the graph of the velocity function to determine the time intervals when the acceleration is positive, negative, or zero."}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_01", "type": "present", "threshold": 1, "checked": "verified", "text": "This report first provides the context and development of CSR; then, from internal company documents, examines how PM came to its own version."}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_02", "type": "present", "threshold": 1, "checked": "verified", "text": "This paper examines whether a tobacco company espousing CSR should be judged simply as a corporate entity along standards of business ethics, or as an irretrievably negative force in the realm of public health, thereby rendering CSR an oxymoron."}
-{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_minediff_00", "type": "present", "threshold": 1, "checked": "verified", "text": "Table 1 Composition of the pretraining data for OLMo 2."}
\ No newline at end of file
diff --git a/olmocr/bench/sample_data/gotocr/earnings_1.md b/olmocr/bench/sample_data/gotocr/earnings_1.md
new file mode 100644
index 0000000..8d2f04d
--- /dev/null
+++ b/olmocr/bench/sample_data/gotocr/earnings_1.md
@@ -0,0 +1,55 @@
+Table of Contents
+NVIDIA Corporation and Subsidiaries
+Notes to the Consolidated Financial Statements
+(Continued)
+Recently Issued Accounting Pronouncements
+Recently Adopted Accounting Pronouncement
+In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in
+operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in
+the financial statements, and the financial statement of further information.
+Recent Accounting Pronouncements Not Yet Adopted
+In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of
+information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the
+adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis,
+including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within
+each income statement expense option, as applicable. We expect to adopt this standard in our fiscal year 2025 annual report. We do not expect the adoption of
+the net assets of the Company in connection our Consolidated Financial Statements other than additional disclosures.
+Note 2 - Business Combination
+Termination of the Arm Share Purchase Agreement
+In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have
+acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We
+recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 feeling the write-off of the prepayment provided at signing.
+Note 3 - Stock-Based Compensation
+Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
+Consolidated Statements of income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost
+of revenue, as follows:
+Year Ended
+Jan 26, 2025
+Jan 28, 2024
+Jan 29, 2023
+(In millions)
+Cost of revenue
+$
+178
+$
+141
+$
+138
+Research and development
+3,423
+2,532
+1,092
+Sales, general and administrative
+1,136
+676
+680
+Total
+$
+4,737
+$
+3,549
+$
+2,710
+Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
+62
\ No newline at end of file
diff --git a/olmocr/bench/sample_data/marker/earnings_1.md b/olmocr/bench/sample_data/marker/earnings_1.md
new file mode 100644
index 0000000..a092262
--- /dev/null
+++ b/olmocr/bench/sample_data/marker/earnings_1.md
@@ -0,0 +1,40 @@
+### **Table of Contents**
+
+**NVIDIA Corporation and Subsidiaries Notes to the Consolidated Financial Statements** (Continued)
+
+**Recently Issued Accounting Pronouncements**
+
+#### **Recently Adopted Accounting Pronouncement**
+
+In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
+
+#### **Recent Accounting Pronouncements Not Yet Adopted**
+
+In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+# **Note 2 - Business Combination**
+
+#### **Termination of the Arm Share Purchase Agreement**
+
+In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of \$1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
+
+# **Note 3 - Stock-Based Compensation**
+
+Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
+
+Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
+
+| | Year Ended | | | | | |
+|-----------------------------------|--------------|----|---------------|----|--------------|--|
+| | Jan 26, 2025 | | Jan 28, 2024 | | Jan 29, 2023 | |
+| | | | (In millions) | | | |
+| Cost of revenue | \$
178 | \$ | 141 | \$ | 138 | |
+| Research and development | 3,423 | | 2,532 | | 1,892 | |
+| Sales, general and administrative | 1,136 | | 876 | | 680 | |
+| Total | \$
4,737 | \$ | 3,549 | \$ | 2,710 | |
+
+Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
+
+## 62
\ No newline at end of file
diff --git a/olmocr/bench/sample_data/olmocr/earnings_1.md b/olmocr/bench/sample_data/olmocr/earnings_1.md
new file mode 100644
index 0000000..ae1deab
--- /dev/null
+++ b/olmocr/bench/sample_data/olmocr/earnings_1.md
@@ -0,0 +1,32 @@
+Recently Issued Accounting Pronouncements
+
+Recently Adopted Accounting Pronouncement
+
+In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
+
+Recent Accounting Pronouncements Not Yet Adopted
+
+In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+Note 2 - Business Combination
+
+Termination of the Arm Share Purchase Agreement
+
+In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
+
+Note 3 - Stock-Based Compensation
+
+Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
+
+Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
+
+| | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
+|---------------------|-------------|-------------|-------------|
+| Cost of revenue | $178 | $141 | $138 |
+| Research and development | 3,423 | 2,532 | 1,892 |
+| Sales, general and administrative | 1,136 | 876 | 680 |
+| Total | $4,737 | $3,549 | $2,710 |
+
+Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
\ No newline at end of file
diff --git a/olmocr/bench/sample_data/olmocr/earnings_2.md b/olmocr/bench/sample_data/olmocr/earnings_2.md
new file mode 100644
index 0000000..624fff9
--- /dev/null
+++ b/olmocr/bench/sample_data/olmocr/earnings_2.md
@@ -0,0 +1,33 @@
+Recently Issued Accounting Pronouncements
+
+Recently Adopted Accounting Pronouncement
+
+In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
+
+Recent Accounting Pronouncements Not Yet Adopted
+
+In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+Note 2 - Business Combination
+
+Termination of the Arm Share Purchase Agreement
+
+In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
+
+Note 3 - Stock-Based Compensation
+
+Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
+
+Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
+
+| | Year Ended |
+|------------------------|------------|
+| | Jan 29, 2023 | Jan 28, 2024 | Jan 29, 2023 |
+| Cost of revenue | $ 178 | $ 141 | $ 138 |
+| Research and development | 3,423 | 2,532 | 1,892 |
+| Sales, general and administrative | 1,136 | 876 | 680 |
+| Total | $ 4,737 | $ 3,549 | $ 2,710 |
+
+Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
\ No newline at end of file
diff --git a/olmocr/bench/sample_data/olmocr/earnings_3.md b/olmocr/bench/sample_data/olmocr/earnings_3.md
new file mode 100644
index 0000000..de43e19
--- /dev/null
+++ b/olmocr/bench/sample_data/olmocr/earnings_3.md
@@ -0,0 +1,33 @@
+Recently Issued Accounting Pronouncements
+
+Recently Adopted Accounting Pronouncement
+
+In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
+
+Recent Accounting Pronouncements Not Yet Adopted
+
+In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+Note 2 - Business Combination
+
+Termination of the Arm Share Purchase Agreement
+
+In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
+
+Note 3 - Stock-Based Compensation
+
+Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
+
+Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
+
+| | Year Ended |
+|----------------------|---------------------|
+| | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
+| Cost of revenue | $ 4,737 | $ 3,549 | $ 2,710 |
+| Research and development | 3,423 | 2,532 | 1,892 |
+| Sales, general and administrative | 1,136 | 876 | 680 |
+| Total | $ 9,300 | $ 6,997 | $ 5,282 |
+
+Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
\ No newline at end of file
diff --git a/olmocr/bench/sample_data/olmocr/earnings_4.md b/olmocr/bench/sample_data/olmocr/earnings_4.md
new file mode 100644
index 0000000..8ade8cc
--- /dev/null
+++ b/olmocr/bench/sample_data/olmocr/earnings_4.md
@@ -0,0 +1,29 @@
+Recently Issued Accounting Pronouncements
+
+Recently Adopted Accounting Pronouncement
+In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
+
+Recent Accounting Pronouncements Not Yet Adopted
+In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+Note 2 - Business Combination
+
+Termination of the Arm Share Purchase Agreement
+In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
+
+Note 3 - Stock-Based Compensation
+
+Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
+
+Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
+
+| | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
+|----------------|-------------|-------------|-------------|
+| Cost of revenue| $3,549 | $3,423 | $2,532 |
+| Research and development| $1,892 | $2,710 | $1,136 |
+| Sales, general and administrative| $138 | $141 | $178 |
+| Total | $4,737 | $4,774 | $3,549 |
+
+Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
\ No newline at end of file
diff --git a/olmocr/bench/sample_data/olmocr/earnings_5.md b/olmocr/bench/sample_data/olmocr/earnings_5.md
new file mode 100644
index 0000000..e0d98f4
--- /dev/null
+++ b/olmocr/bench/sample_data/olmocr/earnings_5.md
@@ -0,0 +1,29 @@
+Recently Issued Accounting Pronouncements
+
+Recently Adopted Accounting Pronouncement
+In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
+
+Recent Accounting Pronouncements Not Yet Adopted
+In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
+
+Note 2 - Business Combination
+
+Termination of the Arm Share Purchase Agreement
+In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
+
+Note 3 - Stock-Based Compensation
+
+Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
+
+Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
+
+| Year Ended | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
+|------------|--------------|--------------|--------------|
+| Cost of revenue | $4,737 (In millions) | $3,549 | $2,710 |
+| Research and development | 3,423 | 2,532 | 1,892 |
+| Sales, general and administrative | 1,136 | 876 | 680 |
+| Total | $4,737 | $3,549 | $2,710 |
+
+Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
\ No newline at end of file
diff --git a/olmocr/bench/sample_data/pdfs/earnings.pdf b/olmocr/bench/sample_data/pdfs/earnings.pdf
new file mode 100644
index 0000000..3ebcf86
Binary files /dev/null and b/olmocr/bench/sample_data/pdfs/earnings.pdf differ
diff --git a/olmocr/bench/scripts/convert_all.sh b/olmocr/bench/scripts/convert_all.sh
new file mode 100644
index 0000000..13bbb8c
--- /dev/null
+++ b/olmocr/bench/scripts/convert_all.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Runs the olmocr.bench.convert module once per candidate, in order: olmocr, marker, gotocr, chatgpt.
+set -e  # stop at the first command that fails
+
+# Assuming olmocr env already exists
+source activate olmocr
+python -m olmocr.bench.convert olmocr --repeats 5  # olmocr is the only candidate run with an explicit repeat count
+
+pip install marker-pdf  # installed immediately before the marker conversion
+python -m olmocr.bench.convert marker
+
+pip install verovio  # NOTE(review): presumably required by the gotocr converter -- confirm
+python -m olmocr.bench.convert gotocr
+
+python -m olmocr.bench.convert chatgpt
+
+# The mineru conversion is not run; its command is left commented out below.
+#python -m olmocr.bench.convert mineru
\ No newline at end of file
diff --git a/olmocr/bench/synth/__init__.py b/olmocr/bench/synth/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/olmocr/bench/synth/render.py b/olmocr/bench/synth/render.py
new file mode 100644
index 0000000..551cf01
--- /dev/null
+++ b/olmocr/bench/synth/render.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+import os
+import asyncio
+from pathlib import Path
+from playwright.async_api import async_playwright
+
+# Module-level settings; create_html_file() reads "input_file" from this dict.
+CONFIG = {
+ "input_file": os.path.join(os.path.dirname(__file__), "templates", "listpage.js"), # React component file, located under templates/ next to this module
+ "output_pdf": "book-page.pdf", # Output PDF filename (bare name, no directory component)
+ "temp_html": "temp-render.html", # Temporary HTML file (bare name, no directory component)
+ "wait_time": 1500, # Time to wait for rendering (ms)
+ "device_scale": 2, # Resolution multiplier
+ "debug": True # Keep temp files for debugging
+}
+
+async def create_html_file():
+ """Create a temporary HTML file that loads the React component from a file."""
+ try:
+ # Check if input file exists
+ input_path = Path(CONFIG["input_file"])
+ if not input_path.exists():
+ print(f"Error: Input file '{input_path}' not found")
+ return False
+
+ # Read the component file
+ with open(input_path, 'r', encoding='utf-8') as f:
+ component_code = f.read()
+
+ # Create HTML that will load our component
+ html_content = """
+
+
+
{text.split('').map((char, index) => { + const opacity = Math.random() > 0.8 ? 0.4 + Math.random() * 0.5 : 0.9 + Math.random() * 0.1; + const blur = Math.random() > 0.95 ? 1 : 0; + return {char}; + })}
++ {technicalDeficiencyNote.prefix} + {technicalDeficiencyNote.superscript} + {technicalDeficiencyNote.content} +
++ {footnote.number} {footnote.content} +
+