Commit dbbe6cea11: Merge branch 'main' of https://github.com/allenai/olmocr
(Mirror of https://github.com/allenai/olmocr.git, synced 2025-09-26 08:54:01 +00:00)
@@ -96,7 +96,7 @@ def evaluate_candidate(
         if test_avg < 1.0:
             test_failures.append(
                 f"Test {test.id} on {md_base} average pass ratio: {test_avg:.3f} ({repeat_passes}/{num_repeats} repeats passed). "
-                f"Example explanation: {explanations[0] if explanations else 'No explanation'}"
+                f"Ex: {explanations[0] if explanations else 'No explanation'}"
             )
         test_type_breakdown[test_type].append(test_avg)

@@ -183,7 +183,6 @@ def main():
        else:
            status = f"{overall_score * 100:0.1f}%"
        print(f"{candidate_name:20s} : Average Score: {overall_score * 100:0.1f}% over {total_tests:3d} tests - {status}")
        print(" Breakdown by test type:")
        for ttype, scores in test_type_breakdown.items():
            if scores:
                avg = sum(scores) / len(scores) * 100
@@ -40,7 +40,7 @@ def parse_method_arg(method_arg):
     return name, kwargs, folder_name


-async def process_pdfs(config, pdf_directory, data_directory, repeats):
+async def process_pdfs(config, pdf_directory, data_directory, repeats, force):
     """Process PDFs with both sync and async functions"""
     for candidate in config.keys():
         print(f"Starting conversion using {candidate} with kwargs: {config[candidate]['kwargs']}")
@@ -52,10 +52,21 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
         kwargs = config[candidate]["kwargs"]
         is_async = asyncio.iscoroutinefunction(method)

-        for pdf_path in tqdm(glob.glob(os.path.join(pdf_directory, "*.pdf")), desc=candidate):
+        all_pdfs = glob.glob(os.path.join(pdf_directory, "*.pdf"))
+        all_pdfs.sort()
+
+        for pdf_path in tqdm(all_pdfs, desc=candidate):
             base_name = os.path.basename(pdf_path).replace(".pdf", "")

             for i in range(1, repeats + 1):
+                output_filename = f"{base_name}_{i}.md"
+                output_path = os.path.join(candidate_output_dir, output_filename)
+
+                if os.path.exists(output_path) and not force:
+                    print(f"Skipping {base_name}_{i} for {candidate}, file already exists")
+                    print("Rerun with --force flag to force regeneration")
+                    continue
+
                 try:
                     if is_async:
                         # Run async function
@@ -70,8 +81,6 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
                         print(f"Warning, did not get output for {base_name}_{i}")
                         continue

-                output_filename = f"{base_name}_{i}.md"
-                output_path = os.path.join(candidate_output_dir, output_filename)
                 with open(output_path, "w") as out_f:
                     out_f.write(markdown)

@@ -86,6 +95,8 @@ if __name__ == "__main__":
        "Use 'name=folder_name' to specify a custom output folder name.",
    )
    parser.add_argument("--repeats", type=int, default=1, help="Number of times to repeat the conversion for each PDF.")
+    parser.add_argument("--dir", type=str, default=os.path.join(os.path.dirname(__file__), "sample_data"), help="Path to the data folder in which to save outputs, pdfs should be in /pdfs folder within it.")
+    parser.add_argument("--force", action="store_true", default=False, help="Force regenerating of output files, even if they already exist")
    args = parser.parse_args()

    # Mapping of method names to a tuple: (module path, function name)
@@ -109,8 +120,8 @@ if __name__ == "__main__":
        function = getattr(module, function_name)
        config[method_name] = {"method": function, "kwargs": extra_kwargs, "folder_name": folder_name}

-    data_directory = os.path.join(os.path.dirname(__file__), "mining_data")
+    data_directory = args.dir
    pdf_directory = os.path.join(data_directory, "pdfs")

    # Run the async process function
-    asyncio.run(process_pdfs(config, pdf_directory, data_directory, args.repeats))
+    asyncio.run(process_pdfs(config, pdf_directory, data_directory, args.repeats, args.force))
@@ -119,7 +119,7 @@ def compare_votes_for_file(base_pdf_file: str, base_pdf_page: int, base_text: st
                best_candidate = c_sentence  # Keep original capitalization for output

        # Append the candidate if it passes the similarity threshold (e.g., 0.7)
-        if best_ratio > 0.7 and best_candidate is not None:
+        if best_ratio > 0.5 and best_candidate is not None:
            votes.append(best_candidate.strip())

        # Only consider variants that differ when compared case-insensitively
@@ -191,13 +191,6 @@ def main():

    # Collect all .md files from the base and compare folders
    base_files = [f for f in os.listdir(base_path) if f.endswith(".md")]
-    compare_files = [f for f in os.listdir(compare_path) if f.endswith(".md")]
-
-    # Read all candidate texts at once
-    candidate_texts = []
-    for cf in compare_files:
-        with open(os.path.join(compare_path, cf), "r", encoding="utf-8") as f:
-            candidate_texts.append(f.read())

    all_tests = []

@@ -207,6 +200,17 @@ def main():
        with open(base_file_path, "r", encoding="utf-8") as f:
            base_text = f.read()

+        compare_files = [f for f in os.listdir(compare_path) if f.endswith(".md") and re.sub(r"_\d+\.md$", "", f) == re.sub(r"_\d+\.md$", "", bf)]
+
+        if not compare_files:
+            print(f"skipping {bf} nothing to compare against")
+
+        # Read all candidate texts at once
+        candidate_texts = []
+        for cf in compare_files:
+            with open(os.path.join(compare_path, cf), "r", encoding="utf-8") as f:
+                candidate_texts.append(f.read())
+
        base_pdf_file = get_pdf_from_md(base_file_path)
        base_pdf_page = 1
        print(f"Results for base file: {bf}")
olmocr/bench/sample_data/chatgpt/earnings_1.md (new file, 33 lines)
@@ -0,0 +1,33 @@
Recently Issued Accounting Pronouncements

Recently Adopted Accounting Pronouncement

In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.

Recent Accounting Pronouncements Not Yet Adopted

In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.

In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.

Note 2 - Business Combination

Termination of the Arm Share Purchase Agreement

In February 2022, NVIDIA and SoftBank Group Corp., or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.

Note 3 - Stock-Based Compensation

Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.

Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:

| Year Ended | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
|---------------------|-------------|-------------|-------------|
| | (In millions) | | |
| Cost of revenue | $178 | $141 | $138 |
| Research and development | $3,423 | $2,532 | $1,892 |
| Sales, general and administrative | $1,136 | $876 | $680 |
| Total | $4,737 | $3,549 | $2,710 |

Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
@@ -1,29 +1,52 @@
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_00", "type": "present", "text": "Corporate social responsibility and the tobacco industry: hope or hype?", "threshold": 0.99}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_01", "type": "present", "text": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths of some 754 600 smokers, and Philip Morris some 803 600 smokers.", "threshold": 0.95}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_02", "type": "present", "text": "The term \"corporate social responsibility\" is in vogue at the moment but as a concept it is vague and means different things to different people.", "threshold": 0.95}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_03", "type": "present", "text": "Over the past three decades increasing pressure from non-governmental", "threshold": 1.0}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_04", "type": "absent", "text": "Downloaded from http://tobaccocontrol.bmj.com/", "threshold": 0.95}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_00", "type": "present", "text": "Corporate social responsibility and the tobacco industry: hope or hype?"}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_01", "type": "present", "text": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths of some 754 600 smokers, and Philip Morris some 803 600 smokers."}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_02", "type": "present", "text": "The term \"corporate social responsibility\" is in vogue at the moment but as a concept it is vague and means different things to different people.", "max_diffs": 2}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_03", "type": "present", "text": "Over the past three decades increasing pressure from non-governmental"}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_04", "type": "absent", "text": "Downloaded from http://tobaccocontrol.bmj.com/"}

-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_10", "type": "order", "before": "Corporate social responsibility and the tobacco industry: hope or hype?", "after": "The unprecedented expansion of power and influence of TNCs over the past three decades has accelerated global trade and development, but also environmental damage and abuses of", "threshold": 0.95}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_11", "type": "order", "before": "It now looks like that with vigilance", "after": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths", "threshold": 0.95}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_12", "type": "order", "before": "Corporate social responsibility (CSR) emerged from a realisation among transnational corporations", "after": " perspective on its own behaviour; and reflects on whether marketing tobacco is antithetical to social responsibility.", "threshold": 0.95}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_10", "type": "order", "before": "Corporate social responsibility and the tobacco industry: hope or hype?", "after": "The unprecedented expansion of power and influence of TNCs over the past three decades has accelerated global trade and development, but also environmental damage and abuses of", "max_diffs": 2}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_11", "type": "order", "before": "It now looks like that with vigilance", "after": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths", "max_diffs": 2}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_12", "type": "order", "before": "Corporate social responsibility (CSR) emerged from a realisation among transnational corporations", "after": " perspective on its own behaviour; and reflects on whether marketing tobacco is antithetical to social responsibility.", "max_diffs": 2}

-{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_00", "type": "present", "text": "Table 4: Baseline model performance on each of the three scoring metrics", "threshold": 1.0}
-{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_01", "type": "present", "text": "Table 5: Baseline model performance on each of the three scoring metrics", "threshold": 1.0}
-{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_02", "type": "present", "text": "We use the GPT-4O model for all our agents due to its higher performance and lower cost compared to other models. For space we provide", "threshold": 0.99}
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_00", "type": "present", "text": "Table 4: Baseline model performance on each of the three scoring metrics"}
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_01", "type": "present", "text": "Table 5: Baseline model performance on each of the three scoring metrics"}
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_02", "type": "present", "text": "We use the GPT-4O model for all our agents due to its higher performance and lower cost compared to other models. For space we provide"}

-{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_00", "type": "present", "threshold": 1, "checked": "verified", "text": "The-Stack-V2"}
-{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_01", "type": "present", "threshold": 1, "checked": "verified", "text": "SE, whatever we've scraped"}
-{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_02", "type": "present", "threshold": 1, "checked": "verified", "text": "HQ DCLM"}
+{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_00", "type": "present", "checked": "verified", "text": "The-Stack-V2"}
+{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_01", "type": "present", "checked": "verified", "text": "SE, whatever we've scraped"}
+{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_02", "type": "present", "checked": "verified", "text": "HQ DCLM"}

-{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_00", "type": "present", "threshold": 1, "checked": "verified", "text": "January 10th 1864."}
-{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_01", "type": "present", "threshold": 1, "checked": "verified", "text": "Major General Hitchcock, Commissioner of Exchanges, is authorized and directed to offer Brigadier General Trimble, now a prisoner of war in Fort McHenry, in exchange for Major White, who is held as a prisoner at Richmond."}
-{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_03", "type": "present", "threshold": 1, "checked": "verified", "text": "He is also directed to send forward the offer of exchange by Henry M. Warfield, Esq. of Baltimore, under a flag of truce, and give him a pass to City Point."}
+{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_00", "type": "present", "checked": "verified", "text": "January 10th 1864."}
+{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_01", "type": "present", "checked": "verified", "text": "Major General Hitchcock, Commissioner of Exchanges, is authorized and directed to offer Brigadier General Trimble, now a prisoner of war in Fort McHenry, in exchange for Major White, who is held as a prisoner at Richmond."}
+{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_03", "type": "present", "checked": "verified", "text": "He is also directed to send forward the offer of exchange by Henry M. Warfield, Esq. of Baltimore, under a flag of truce, and give him a pass to City Point."}

+{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_02", "type": "present", "checked": "verified", "text": "Use the graph of the position function to determine the time intervals when the velocity is positive, negative, or zero."}
+{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_03", "type": "present", "checked": "verified", "text": "Use the graph of the velocity function to determine the time intervals when the acceleration is positive, negative, or zero."}

+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_01", "type": "present", "checked": "verified", "text": "This report first provides the context and development of CSR; then, from internal company documents, examines how PM came to its own version."}
+{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_02", "type": "present", "checked": "verified", "text": "This paper examines whether a tobacco company espousing CSR should be judged simply as a corporate entity along standards of business ethics, or as an irretrievably negative force in the realm of public health, thereby rendering CSR an oxymoron."}

+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_minediff_00", "type": "present", "checked": "verified", "text": "Table 1 Composition of the pretraining data for OLMo 2."}

+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table00", "type": "table", "cell": "Type"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table01", "type": "table", "cell": "3.32T", "left": "3.71T"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table02", "type": "table", "cell": "3.32T", "right": "21.32T"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table03", "type": "table", "cell": "11.8B", "up": "12.2B"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table04", "type": "table", "cell": "11.8B", "down": "3.7B"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table05", "type": "table", "cell": "3.32T", "top_heading": "Words"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table06", "type": "table", "cell": "arXiv", "top_heading": "Source"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table07", "type": "table", "cell": "47.2B", "top_heading": "Bytes"}
+{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table08", "type": "table", "cell": "Math proofs code", "left_heading": "Algebraic Stack"}

+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Quadratic regression", "left": "Challenge"}
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Instrument Use", "left": "Normal"}
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "0.87", "top_heading": "Procedure"}
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "0.87", "top_heading": "ReACT"}

+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Pick-and-place object", "left_heading": "27"}
+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "0.66", "right": "0.44"}

+{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Interact with a moving agent", "top_heading": "Unit Test Topic"}

-{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_02", "type": "present", "threshold": 1, "checked": "verified", "text": "Use the graph of the position function to determine the time intervals when the velocity is positive, negative, or zero."}
-{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_03", "type": "present", "threshold": 1, "checked": "verified", "text": "Use the graph of the velocity function to determine the time intervals when the acceleration is positive, negative, or zero."}

-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_01", "type": "present", "threshold": 1, "checked": "verified", "text": "This report first provides the context and development of CSR; then, from internal company documents, examines how PM came to its own version."}
-{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_02", "type": "present", "threshold": 1, "checked": "verified", "text": "This paper examines whether a tobacco company espousing CSR should be judged simply as a corporate entity along standards of business ethics, or as an irretrievably negative force in the realm of public health, thereby rendering CSR an oxymoron."}

-{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_minediff_00", "type": "present", "threshold": 1, "checked": "verified", "text": "Table 1 Composition of the pretraining data for OLMo 2."}
olmocr/bench/sample_data/gotocr/earnings_1.md (new file, 55 lines)
@@ -0,0 +1,55 @@
|
||||
Table of Contents
|
||||
NVIDIA Corporation and Subsidiaries
|
||||
Notes to the Consolidated Financial Statements
|
||||
(Continued)
|
||||
Recently Issued Accounting Pronouncements
|
||||
Recently Adopted Accounting Pronouncement
|
||||
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in
|
||||
operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in
|
||||
the financial statements, and the financial statement of further information.
|
||||
Recent Accounting Pronouncements Not Yet Adopted
|
||||
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of
|
||||
information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the
|
||||
adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
|
||||
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis,
|
||||
including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within
|
||||
each income statement expense option, as applicable. We expect to adopt this standard in our fiscal year 2025 annual report. We do not expect the adoption of
|
||||
the net assets of the Company in connection our Consolidated Financial Statements other than additional disclosures.
|
||||
Note 2 - Business Combination
|
||||
Termination of the Arm Share Purchase Agreement
|
||||
In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have
|
||||
acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We
|
||||
recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 feeling the write-off of the prepayment provided at signing.
|
||||
Note 3 - Stock-Based Compensation
|
||||
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
|
||||
Consolidated Statements of income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost
|
||||
of revenue, as follows:
|
||||
Year Ended
|
||||
Jan 26, 2025
|
||||
Jan 28, 2024
|
||||
Jan 29, 2023
|
||||
(In millions)
|
||||
Cost of revenue
|
||||
$
|
||||
178
|
||||
$
|
||||
141
|
||||
$
|
||||
138
|
||||
Research and development
|
||||
3,423
|
||||
2,532
|
||||
1,092
|
||||
Sales, general and administrative
|
||||
1,136
|
||||
676
|
||||
680
|
||||
Total
|
||||
$
|
||||
4,737
|
||||
$
|
||||
3,549
|
||||
$
|
||||
2,710
|
||||
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
|
||||
62
|
olmocr/bench/sample_data/marker/earnings_1.md (new file, 40 lines)
@@ -0,0 +1,40 @@
|
||||
### **Table of Contents**
|
||||
|
||||
**NVIDIA Corporation and Subsidiaries Notes to the Consolidated Financial Statements** (Continued)
|
||||
|
||||
**Recently Issued Accounting Pronouncements**
|
||||
|
||||
#### **Recently Adopted Accounting Pronouncement**
|
||||
|
||||
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
|
||||
|
||||
#### **Recent Accounting Pronouncements Not Yet Adopted**
|
||||
|
||||
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
|
||||
|
||||
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
|
||||
|
||||
# **Note 2 - Business Combination**
|
||||
|
||||
#### **Termination of the Arm Share Purchase Agreement**
|
||||
|
||||
In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of \$1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
|
||||
|
||||
# **Note 3 - Stock-Based Compensation**
|
||||
|
||||
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
|
||||
|
||||
Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
|
||||
|
||||
| | Year Ended | | | | | |
|
||||
|-----------------------------------|--------------|----|---------------|----|--------------|--|
|
||||
| | Jan 26, 2025 | | Jan 28, 2024 | | Jan 29, 2023 | |
|
||||
| | | | (In millions) | | | |
|
||||
| Cost of revenue | \$<br>178 | \$ | 141 | \$ | 138 | |
|
||||
| Research and development | 3,423 | | 2,532 | | 1,892 | |
|
||||
| Sales, general and administrative | 1,136 | | 876 | | 680 | |
|
||||
| Total | \$<br>4,737 | \$ | 3,549 | \$ | 2,710 | |
|
||||
|
||||
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
|
||||
|
||||
## 62
|
olmocr/bench/sample_data/olmocr/earnings_1.md (new file, 32 lines)
@@ -0,0 +1,32 @@
|
||||
Recently Issued Accounting Pronouncements
|
||||
|
||||
Recently Adopted Accounting Pronouncement
|
||||
|
||||
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
|
||||
|
||||
Recent Accounting Pronouncements Not Yet Adopted
|
||||
|
||||
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
|
||||
|
||||
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
|
||||
|
||||
Note 2 - Business Combination
|
||||
|
||||
Termination of the Arm Share Purchase Agreement
|
||||
|
||||
In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
|
||||
|
||||
Note 3 - Stock-Based Compensation
|
||||
|
||||
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
|
||||
|
||||
Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
|
||||
|
||||
| | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
|
||||
|---------------------|-------------|-------------|-------------|
|
||||
| Cost of revenue | $178 | $141 | $138 |
|
||||
| Research and development | 3,423 | 2,532 | 1,892 |
|
||||
| Sales, general and administrative | 1,136 | 876 | 680 |
|
||||
| Total | $4,737 | $3,549 | $2,710 |
|
||||
|
||||
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
|
olmocr/bench/sample_data/olmocr/earnings_2.md (new file, 33 lines)
@@ -0,0 +1,33 @@
|
||||
Recently Issued Accounting Pronouncements
|
||||
|
||||
Recently Adopted Accounting Pronouncement
|
||||
|
||||
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
|
||||
|
||||
Recent Accounting Pronouncements Not Yet Adopted
|
||||
|
||||
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
|
||||
|
||||
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
|
||||
|
||||
Note 2 - Business Combination
|
||||
|
||||
Termination of the Arm Share Purchase Agreement
|
||||
|
||||
In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
|
||||
|
||||
Note 3 - Stock-Based Compensation
|
||||
|
||||
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
|
||||
|
||||
Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
|
||||
|
||||
| | Year Ended |
|
||||
|------------------------|------------|
|
||||
| | Jan 29, 2023 | Jan 28, 2024 | Jan 29, 2023 |
|
||||
| Cost of revenue | $ 178 | $ 141 | $ 138 |
|
||||
| Research and development | 3,423 | 2,532 | 1,892 |
|
||||
| Sales, general and administrative | 1,136 | 876 | 680 |
|
||||
| Total | $ 4,737 | $ 3,549 | $ 2,710 |
|
||||
|
||||
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
|
olmocr/bench/sample_data/olmocr/earnings_3.md (new file, 33 lines)
@@ -0,0 +1,33 @@
|
||||
Recently Issued Accounting Pronouncements
|
||||
|
||||
Recently Adopted Accounting Pronouncement
|
||||
|
||||
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
|
||||
|
||||
Recent Accounting Pronouncements Not Yet Adopted
|
||||
|
||||
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
|
||||
|
||||
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
|
||||
|
||||
Note 2 - Business Combination
|
||||
|
||||
Termination of the Arm Share Purchase Agreement
|
||||
|
||||
In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
|
||||
|
||||
Note 3 - Stock-Based Compensation
|
||||
|
||||
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
|
||||
|
||||
Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
|
||||
|
||||
| | Year Ended |
|
||||
|----------------------|---------------------|
|
||||
| | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
|
||||
| Cost of revenue | $ 4,737 | $ 3,549 | $ 2,710 |
|
||||
| Research and development | 3,423 | 2,532 | 1,892 |
|
||||
| Sales, general and administrative | 1,136 | 876 | 680 |
|
||||
| Total | $ 9,300 | $ 6,997 | $ 5,282 |
|
||||
|
||||
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
|
olmocr/bench/sample_data/olmocr/earnings_4.md (new file, 29 lines)
@@ -0,0 +1,29 @@
|
||||
Recently Issued Accounting Pronouncements
|
||||
|
||||
Recently Adopted Accounting Pronouncement
|
||||
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
|
||||
|
||||
Recent Accounting Pronouncements Not Yet Adopted
|
||||
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
|
||||
|
||||
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
|
||||
|
||||
Note 2 - Business Combination
|
||||
|
||||
Termination of the Arm Share Purchase Agreement
|
||||
In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
|
||||
|
||||
Note 3 - Stock-Based Compensation
|
||||
|
||||
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
|
||||
|
||||
Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
|
||||
|
||||
| | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
|
||||
|----------------|-------------|-------------|-------------|
|
||||
| Cost of revenue| $3,549 | $3,423 | $2,532 |
|
||||
| Research and development| $1,892 | $2,710 | $1,136 |
|
||||
| Sales, general and administrative| $138 | $141 | $178 |
|
||||
| Total | $4,737 | $4,774 | $3,549 |
|
||||
|
||||
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
|
olmocr/bench/sample_data/olmocr/earnings_5.md (new file, 29 lines)
@@ -0,0 +1,29 @@
|
||||
Recently Issued Accounting Pronouncements
|
||||
|
||||
Recently Adopted Accounting Pronouncement
|
||||
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
|
||||
|
||||
Recent Accounting Pronouncements Not Yet Adopted
|
||||
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
|
||||
|
||||
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
|
||||
|
||||
Note 2 - Business Combination
|
||||
|
||||
Termination of the Arm Share Purchase Agreement
|
||||
In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
|
||||
|
||||
Note 3 - Stock-Based Compensation
|
||||
|
||||
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
|
||||
|
||||
Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
|
||||
|
||||
| Year Ended | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
|
||||
|------------|--------------|--------------|--------------|
|
||||
| Cost of revenue | $4,737 (In millions) | $3,549 | $2,710 |
|
||||
| Research and development | 3,423 | 2,532 | 1,892 |
|
||||
| Sales, general and administrative | 1,136 | 876 | 680 |
|
||||
| Total | $4,737 | $3,549 | $2,710 |
|
||||
|
||||
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
|
olmocr/bench/sample_data/pdfs/earnings.pdf (new binary file, not shown)
olmocr/bench/scripts/convert_all.sh (new file, 18 lines)
@@ -0,0 +1,18 @@
#!/bin/bash

set -e

# Assuming olmocr env already exists
source activate olmocr
python -m olmocr.bench.convert olmocr --repeats 5

pip install marker-pdf
python -m olmocr.bench.convert marker

pip install verovio
python -m olmocr.bench.convert gotocr

python -m olmocr.bench.convert chatgpt


#python -m olmocr.bench.convert mineru
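For reference, the `--dir` and `--force` options added to `olmocr.bench.convert` in this commit compose with the existing `--repeats` flag. A hypothetical invocation might look like the sketch below (the data-folder path is only an example; by default the script reads PDFs from `<dir>/pdfs` and writes per-candidate Markdown outputs under the same folder):

```bash
# Hypothetical example: rerun the "olmocr" candidate 5 times against a chosen
# data folder, regenerating .md outputs even if they already exist on disk.
python -m olmocr.bench.convert olmocr \
    --repeats 5 \
    --dir olmocr/bench/sample_data \
    --force
```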
olmocr/bench/synth/__init__.py (new file, empty)

olmocr/bench/synth/render.py (new file, 182 lines)
@@ -0,0 +1,182 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
# Simple configuration
|
||||
CONFIG = {
|
||||
"input_file": os.path.join(os.path.dirname(__file__), "templates", "listpage.js"), # React component file
|
||||
"output_pdf": "book-page.pdf", # Output PDF filename
|
||||
"temp_html": "temp-render.html", # Temporary HTML file
|
||||
"wait_time": 1500, # Time to wait for rendering (ms)
|
||||
"device_scale": 2, # Resolution multiplier
|
||||
"debug": True # Keep temp files for debugging
|
||||
}
|
||||
|
||||
async def create_html_file():
|
||||
"""Create a temporary HTML file that loads the React component from a file."""
|
||||
try:
|
||||
# Check if input file exists
|
||||
input_path = Path(CONFIG["input_file"])
|
||||
if not input_path.exists():
|
||||
print(f"Error: Input file '{input_path}' not found")
|
||||
return False
|
||||
|
||||
# Read the component file
|
||||
with open(input_path, 'r', encoding='utf-8') as f:
|
||||
component_code = f.read()
|
||||
|
||||
# Create HTML that will load our component
|
||||
html_content = """
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Book Page Template</title>
|
||||
<script src="https://unpkg.com/react@17/umd/react.development.js"></script>
|
||||
<script src="https://unpkg.com/react-dom@17/umd/react-dom.development.js"></script>
|
||||
<script src="https://unpkg.com/@babel/standalone/babel.min.js"></script>
|
||||
<style>
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
html, body {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
width: 8.5in;
|
||||
height: 11in;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
#root {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
padding: 0.25in;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
@media print {
|
||||
body {
|
||||
-webkit-print-color-adjust: exact;
|
||||
print-color-adjust: exact;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="root"></div>
|
||||
|
||||
<script type="text/babel">
|
||||
// The React component code loaded from external file
|
||||
""" + component_code + """
|
||||
|
||||
// Render only the book page part, not the controls
|
||||
ReactDOM.render(
|
||||
<BookPageTemplate />,
|
||||
document.getElementById('root')
|
||||
);
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
with open(CONFIG["temp_html"], 'w', encoding='utf-8') as f:
|
||||
f.write(html_content)
|
||||
|
||||
print(f"Created HTML file: {CONFIG['temp_html']}")
|
||||
print(f"Using React component from: {CONFIG['input_file']}")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Error creating HTML file: {e}")
|
||||
print(f"Exception details: {str(e)}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
return False
|
||||
|
||||
async def render_to_pdf():
|
||||
"""Render the React component to PDF using Playwright."""
|
||||
try:
|
||||
# Create the HTML file first
|
||||
html_created = await create_html_file()
|
||||
if not html_created:
|
||||
print("Failed to create HTML file")
|
||||
return
|
||||
|
||||
print("Launching browser...")
|
||||
async with async_playwright() as p:
|
||||
# Launch the browser with more debugging options
|
||||
browser = await p.chromium.launch(
|
||||
headless=True, # True for production, False for debugging
|
||||
)
|
||||
|
||||
# Create a new page for letter size paper
|
||||
page = await browser.new_page(
|
||||
viewport={"width": 816, "height": 1056}, # 8.5in x 11in at 96dpi
|
||||
device_scale_factor=CONFIG["device_scale"]
|
||||
)
|
||||
|
||||
# Get absolute path to HTML file
|
||||
html_path = Path(CONFIG["temp_html"]).absolute()
|
||||
html_uri = f"file://{html_path}"
|
||||
|
||||
print(f"Navigating to: {html_uri}")
|
||||
|
||||
# Add event listeners for console messages and errors
|
||||
page.on("console", lambda msg: print(f"Browser console: {msg.text}"))
|
||||
page.on("pageerror", lambda err: print(f"Browser page error: {err}"))
|
||||
|
||||
# Navigate with longer timeout and wait for network idle
|
||||
await page.goto(html_uri, wait_until="networkidle", timeout=30000)
|
||||
|
||||
# Wait for React to render
|
||||
await page.wait_for_timeout(CONFIG["wait_time"])
|
||||
|
||||
# Add a check to ensure the component rendered
|
||||
element_count = await page.evaluate("""() => {
|
||||
const root = document.getElementById('root');
|
||||
return root.childElementCount;
|
||||
}""")
|
||||
|
||||
if element_count == 0:
|
||||
print("Warning: No elements found in root. Component may not have rendered.")
|
||||
else:
|
||||
print(f"Found {element_count} elements in root. Component rendered successfully.")
|
||||
|
||||
# Save debug screenshot
|
||||
if CONFIG["debug"]:
|
||||
await page.screenshot(path="debug-screenshot.png")
|
||||
print("Debug screenshot saved")
|
||||
|
||||
# Generate PDF
|
||||
print("Generating PDF...")
|
||||
await page.pdf(
|
||||
path=CONFIG["output_pdf"],
|
||||
format="Letter",
|
||||
print_background=True,
|
||||
margin={"top": "0", "right": "0", "bottom": "0", "left": "0"}
|
||||
)
|
||||
|
||||
print(f"PDF generated successfully: {CONFIG['output_pdf']}")
|
||||
|
||||
# Close the browser
|
||||
await browser.close()
|
||||
|
||||
# Cleanup temp files if not in debug mode
|
||||
if not CONFIG["debug"] and Path(CONFIG["temp_html"]).exists():
|
||||
Path(CONFIG["temp_html"]).unlink()
|
||||
print("Temporary HTML file removed")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error generating PDF: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Run the async function
|
||||
try:
|
||||
asyncio.run(render_to_pdf())
|
||||
except Exception as e:
|
||||
print(f"Fatal error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
olmocr/bench/synth/templates/bookpage.js (new file, 214 lines)
@@ -0,0 +1,214 @@
|
||||
//import React from 'react';
|
||||
|
||||
const BookPageTemplate = () => {
|
||||
// Only three state variables as requested
|
||||
const [title, setTitle] = React.useState("ADVENTURES OF DON QUIXOTE");
|
||||
const [pageNumber, setPageNumber] = React.useState("289");
|
||||
const [text, setText] = React.useState(
|
||||
"deed,\" said Don Quixote, \"thou hast hit the point, Sancho, which can alone shake my resolution; I neither can, nor ought to, draw my sword, as I have often told thee, against those who are not dubbed knights. To thee which I had premeditated, thy share of the booty would have been at least the emperor's crown of gold and Cupid's painted wings; for I would have plucked them off perforce, and delivered them into thy hands.\" \"The"
|
||||
);
|
||||
|
||||
// Styles for heavily degraded scan effect
|
||||
const heavilyDegradedStyles = {
|
||||
filter: 'grayscale(30%) contrast(120%) brightness(85%) sepia(20%)',
|
||||
position: 'relative',
|
||||
backgroundColor: '#e6ddc6', // More yellowed aged paper
|
||||
backgroundImage: 'url("data:image/svg+xml,%3Csvg viewBox=\'0 0 200 200\' xmlns=\'http://www.w3.org/2000/svg\'%3E%3Cfilter id=\'noiseFilter\'%3E%3CfeTurbulence type=\'fractalNoise\' baseFrequency=\'0.85\' numOctaves=\'3\' stitchTiles=\'stitch\'/%3E%3C/filter%3E%3Crect width=\'100%25\' height=\'100%25\' filter=\'url(%23noiseFilter)\' opacity=\'0.25\'/%3E%3C/svg%3E")',
|
||||
boxShadow: 'inset 0 0 70px rgba(0, 0, 0, 0.3), 0 0 5px rgba(0,0,0,0.1)',
|
||||
padding: '32px',
|
||||
borderRadius: '2px',
|
||||
overflow: 'hidden',
|
||||
transform: 'rotate(0.3deg)', // Slightly askew scan
|
||||
};
|
||||
|
||||
// Heavily degraded text
|
||||
const badScanTextStyle = {
|
||||
fontFamily: '"Times New Roman", serif',
|
||||
letterSpacing: '-0.01em',
|
||||
wordSpacing: '0.02em',
|
||||
fontWeight: '500',
|
||||
color: '#222222',
|
||||
textShadow: '0 0 1px rgba(0, 0, 0, 0.5)',
|
||||
transform: 'scale(1.01, 0.99) rotate(-0.4deg)', // Distorted proportions
|
||||
};
|
||||
|
||||
// Random coffee stain effect
|
||||
const coffeeStain = {
|
||||
position: 'absolute',
|
||||
width: '100px',
|
||||
height: '80px',
|
||||
top: '25%',
|
||||
right: '15%',
|
||||
borderRadius: '50%',
|
||||
background: 'radial-gradient(ellipse at center, rgba(139,69,19,0.15) 0%, rgba(139,69,19,0.1) 50%, rgba(139,69,19,0.05) 70%, rgba(139,69,19,0) 100%)',
|
||||
transform: 'rotate(30deg) scale(1.5, 1)',
|
||||
pointerEvents: 'none',
|
||||
zIndex: 1,
|
||||
};
|
||||
|
||||
// Water damage effect
|
||||
const waterDamage = {
|
||||
position: 'absolute',
|
||||
width: '70%',
|
||||
height: '40%',
|
||||
bottom: '10%',
|
||||
left: '5%',
|
||||
opacity: 0.07,
|
||||
background: 'radial-gradient(ellipse at center, rgba(0,0,0,0.2) 0%, rgba(0,0,0,0.1) 40%, rgba(0,0,0,0) 70%)',
|
||||
borderRadius: '40% 60% 70% 30% / 40% 50% 60% 50%',
|
||||
pointerEvents: 'none',
|
||||
zIndex: 1,
|
||||
};
|
||||
|
||||
// Add fold lines
|
||||
const foldLine = {
|
||||
position: 'absolute',
|
||||
width: '100%',
|
||||
height: '3px',
|
||||
top: '30%',
|
||||
left: 0,
|
||||
background: 'linear-gradient(to right, rgba(0,0,0,0) 0%, rgba(0,0,0,0.03) 20%, rgba(0,0,0,0.08) 50%, rgba(0,0,0,0.03) 80%, rgba(0,0,0,0) 100%)',
|
||||
boxShadow: '0 1px 3px rgba(255,255,255,0.2)',
|
||||
pointerEvents: 'none',
|
||||
zIndex: 2,
|
||||
};
|
||||
|
||||
// Torn edge effect
|
||||
const tornEdge = {
|
||||
position: 'absolute',
|
||||
top: 0,
|
||||
right: 0,
|
||||
width: '100%',
|
||||
height: '100%',
|
||||
background: 'linear-gradient(135deg, transparent 97%, #e6ddc6 97%, #e6ddc6 100%)',
|
||||
pointerEvents: 'none',
|
||||
};
|
||||
|
||||
return (
|
||||
<div style={{
|
||||
maxWidth: '800px',
|
||||
margin: '0 auto',
|
||||
padding: '16px',
|
||||
}}>
|
||||
{/* Heavily degraded scan container */}
|
||||
<div style={heavilyDegradedStyles}>
|
||||
{/* Noise overlay */}
|
||||
<div style={{
|
||||
position: 'absolute',
|
||||
top: 0,
|
||||
left: 0,
|
||||
right: 0,
|
||||
bottom: 0,
|
||||
background: 'url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAKpklEQVR4Xu2di3IbOQxD3f//6aTJJLF3vSRAAiTlvWy7lUSAD1KWc97b8/m8f7/+2xZg27fs/P/LvzClv+f77Hfz79eTP+pv/5ZlmPKZfZYp7eOsU8rrQ9fQ/r5+P/s7+/2M7lO+67kTvZfnqx4zXXtcz5To/TwZj2Uxn+FiJiDCPzecjXcEh30/gokAYvSeCVu0OaNrtV5F4I9jiAILu5AZYs8QiExIRZkRYFjKIgFUCsT0rH5EdM5oBUaRr8KnUgaNKzRfARkFRBlltQKr32OATwmp0hXTHxINSkkRSCzNZQmhnWyVnvmzwAqIrQr8AYgJwWz3smW9K0OxXTQTLhaQlQJZwmKKzIwtqqiVRVbCVS1ORpSZQbKCygLIErKVoiNZVT5eAcYEfaW41XQ1c31WAFZKZBVn5aQjpwb0mRJPCKkCiFKrUmL0PBGK1aFZ0XpCsb5SoROQGQBzRUaAMwavFJEZOlOwTNGjPK+EpVK2CjsGbDTXzgBW5RiZgaJ3VAc/U9RKkVjQTu7AZopdpVOVrmaUULGGBZClsRmFKtdWPYehMKk4Sksq0KuAK4WLSsmUORXDUlWXNX72OgZkbgADDDs22xGz7ytFZ9/HpKgUQkXhDMJnQihWqB1v9RlGx+VnMZRGimYO0qpQZsCyXaCFmqUHdn71OkaACOSsV6sC9qQQjpQzy+UM+aofYIXY0hDr3Uzg2S5mdF5e7+LQlVGl3E7KovLs9qoCFUK+otK7HZdRBstiTBGrgqzKrgjwSLlVSp1R8F36mik2C/hVYRdUvTtKkMYE2Z03rXw+9lPVWUrBS5TF0lFEhUwZ2WeZ4lQtpIUuZkBZhWaK04HK8s0sfTPFV8I+C2JViFXaOALEKB0pwcnOZDtHCa16nC3oah2Y8bKFnwlp1YpZJTtSOgPwhNKXC/yRUNVCZYqsqJQpdAc2o0ymWKrrxwrFgMwKDvvuLPVlBr+eY1WFUZS0o5+5S2GZwpVCzJQVFYhZKhUguZTFvr9S/Gq1qgylunZWObtSYpW6WOV4Zyy5lFU5JqPQrKqx37Pdzxbqbjo8SXMdmLOiUSk+UzgWuLlJPFNQpjzM2NXrGJDRsxlgrBVkSlQZpVJ0dp9ZsFW1WSmJgtGZqzrJnN7TrkpZlTHYztgBrPqeKRtTyAxIloKq65gLgA7Q3LBZ8ZcM/JfkJwDtKp4lA/99dZeOVoW+Sl1Z37JSFsvCEVAMRfNzqBP4jtIzBWJKrXb4TCksbTJAWdAiFMd0xyrOCVVVIClXUEzxo7L/dAR3UlNluBmQs8DqAOksyugeK5SrwJyJrS7Q3ABVt1vLTzMbHaU4tvuYMHagd471hEGrIBxV1NlcJ38ixNdSvQyWrFjAWYEaOhJjCsAqxsq5GUgzUCIU0Xt2+5eZXJUrwEpJmRBUVbdS0soJKoGqFmulBOV7suCvamDKnO0Bsi2R4QQeS0dq1WUVZKVEWcGqFnrVrph9TtN6FVSdwCrDVgqYpasjQFmLW6W0Wd9jO1dVthN0m52hYjuT/Z05aUdx5P0ZZd1jl84Cq65Rdh9TEhPk0B2ZYquKzWb8UegYU1U5nSm3U1k50aqm8NF8JUBYoLuXlhLEDJBWK2an4qyCdYTFFGp2PbJSklJAVCBnRYftbjWNR0Bm/cQpO7wdFKVDlZJUYO1CzXbo7O5mAl9V2syYXbhM5z0dWFUgrVAi291ZGqkEGF1z6uDkDn5mvFnqYcH4boecpQGWmzv3VB2jzL6vW2lWlXl1JZXdW7HqXgmlKlgMXUyJKiGKnMcoTWlSpbDZ96pAsOszR2R0ZAKv5nLmvdmO7ij3cUZYoUSWMthOYvJgdlCpV0UZA4y9SHJngcsJPyOXdO+t3jZ3KOgIO6kkdhhRVTu2AKptOKsyLZGw/JkJKkt9lRKdGpbthsrALJ1WjqUUXXXc3wHx6CpO5z6xM6YdBa+MxCprBmSHljrCVr1OUhVb/KqdxHR36iKuqpBVAJjQDuUhQWZVvFLE7G6kAtZqQVZCUFWSI4UiQFUKrQCWGTFTTpdCmXJm/iqJpxT2SBhPujPpXFzO0JzOq+ZOQHZS00zJMmOp1PNdqFkRnAk3qtbKcdrS01BFy6pWq+qOoVJkZoioILB01tmJrNJGBlLWrYtQrSgvU/Lqe1Xlnr5O6aQvluIYVQ/hjYJpFJBVvlUKzBQhcnIGEAuWSndRoFl6iypY5iqr8m/lhAhAFZBZWM7uFjrXZwuUKdGb5V7yI9VbHOyAplU7hxm+cp7ZBWWFQlSDzqgm25Gz76v616yTGfZk77FUlcx+GgZgZVz2HNN5CmKWypUDsiqwclalhJnTuPTELjJnO4p9dpailDGrRVFVaWawrrJUu3KF6pkyrISm6nMYEI9XVzuH5lSlKFrZGKvKYbteFZ+OMXYh9WYH/LHVM3BVA1e7r1rI6HXmAKzyRulH8bE1Tk8/yUxR7LM6VKCEF1WJrNBkipQJewVOJqQu0FnaZIWD7fIV5Tr/Vnql8Oy1sxTXVL2OroBjBqpaVNbROvexVYs5eyqKIU8FUlQcT9OWokyW0pmyqxVYpbU7FCWnl52WfqdqrkCsgMiqyumTTNV1R/nOSY87HbMKnQktC+g7I3VepVnbxFLiTiVlC6IKohKWqmpXwGALwnY3y9lZ2sgU74R6UjkYoEMFzQJydJ1SXSPadXaWiZHiZ+9nPuFrB8/Q0ExYjJKrjrQSqlJOlbKYkpEVGJBPwl6V6aFJZUyZ8VVPdHU4gBmUrYcKhC683cBmlK6EzhTUXXCsqKhAYnQfXt92/hy7UuDs2VUPwXZXB/BqIWeAZiCxnXbiYC5blKpvceYqBWAGYjuJKVS1ECrESmGnZdcpOmwlK0OehI9SAGYMFrAd51SLslLGDohq8WZ0nXl9q6jrpCY7kUYCxXKXKgRK0FW6ygTUVbzTKcZxOprB71JIR0GzHlplXpaO3lScr1RYtgD3NSwdMQCYMB4/l56lplOPxoxeUdqJA1ULnaXOanG7lFlRODPuzHc9jnxiFbLDAez1bv9QxlTXX81pLH2x/nI8l52S3v09ZQZaZVD2OpvDnWmuQlMJpgpStctWKWQEULkC60CvHHeaUpYK3G7/YGkuc0xXuSvQVqiLCeFMiGUBcBrgjgGjwFn9SZidoToBZRWYKS+bLxP42fMNFXxnHq5c3gClqnRKmahIVNVhhXTZnJmwMwEpZTsFRAFktTDsOqbQ7HeZwpxQ3ErZ7fSljFdV6Uw5qsaQKXMmdFagmELspr0lUYeCywLCBJ0FgBlYLYSiXBYY5QdCK6NSfcXQ4fMfuVZXYZ3AZemxMyhLZWrqUxUyC9BxL7NSIgWwSqmqwrM0lLU0pgRMaZiCd1KWuvZMOCrAMmEzYXeAejxtS0FQHZdVPJUyVa5nKYdVrZnAnNJ5FUgK9C7crJh1AIooMqPyI9mwO/bLKXMoaFVaUp2/Sl1K+mLBYym
pe2dT7e7KJ7FrKuVXlNZJb53GU22YDvUwIyp3gCoFzAydxS/rxu0aJqwqPVaC7N4/VvRUgdYB8Xo+u8nMDMUowexmzFn/OCnmaBFZwF4OXKFMpqDZLmKdxE7ZXQW6C3aFMqN7X+/3/QcB/G0D8kclnwAAAABJRU5ErkJggg==") repeat',
opacity: 0.15,
pointerEvents: 'none',
}}></div>

{/* Scan lines effect */}
<div style={{
position: 'absolute',
top: 0,
left: 0,
right: 0,
bottom: 0,
background: 'linear-gradient(to bottom, rgba(0,0,0,0.03) 1px, transparent 1px)',
backgroundSize: '100% 2px',
opacity: 0.5,
pointerEvents: 'none',
}}></div>

{/* Add coffee stain */}
<div style={coffeeStain}></div>

{/* Add water damage */}
<div style={waterDamage}></div>

{/* Add fold line */}
<div style={foldLine}></div>

{/* Add torn edge */}
<div style={tornEdge}></div>

{/* Header with skewed alignment */}
<div style={{
display: 'flex',
justifyContent: 'space-between',
alignItems: 'center',
borderBottom: '2px solid #000',
paddingBottom: '4px',
marginBottom: '24px',
position: 'relative',
opacity: 0.8,
transform: 'skew(-0.5deg, 0.3deg)',
}}>
<div style={{width: '48px'}}></div>
<h1 style={{
...badScanTextStyle,
fontSize: '20px',
fontWeight: 'bold',
textAlign: 'center',
textTransform: 'uppercase',
letterSpacing: '1px',
opacity: 0.8,
}}>{title}</h1>
<div style={{
...badScanTextStyle,
fontSize: '20px',
fontWeight: 'bold',
opacity: 0.85,
}}>{pageNumber}</div>
</div>

{/* Horizontal divider with uneven quality */}
<div style={{
borderBottom: '1px solid #444',
marginBottom: '24px',
opacity: 0.6,
filter: 'blur(0.3px)',
transform: 'scaleY(1.5) skew(0.7deg)',
}}></div>

{/* Text content with severely degraded appearance */}
<div style={{
columnCount: 2,
columnGap: '20px',
columnRule: '1px solid rgba(0,0,0,0.1)',
textAlign: 'justify',
...badScanTextStyle,
fontSize: '16px',
lineHeight: '1.5',
opacity: 0.78,
// Very uneven ink distribution with blurry and faded parts
WebkitMaskImage: 'linear-gradient(to bottom, rgba(0,0,0,0.9), rgba(0,0,0,0.75) 50%, rgba(0,0,0,0.85))',
// Text distortion
filter: 'blur(0.2px)',
}}>
{/* Bad scan text with random character fading */}
<p>{text.split('').map((char, index) => {
const opacity = Math.random() > 0.8 ? 0.4 + Math.random() * 0.5 : 0.9 + Math.random() * 0.1;
const blur = Math.random() > 0.95 ? 1 : 0;
return <span key={index} style={{opacity, filter: `blur(${blur}px)`}}>{char}</span>;
})}</p>
</div>

{/* Extra random ink spill */}
<div style={{
position: 'absolute',
width: '10px',
height: '20px',
top: '60%',
left: '25%',
background: 'rgba(0,0,0,0.3)',
borderRadius: '50%',
transform: 'rotate(30deg)',
filter: 'blur(1px)',
zIndex: 3,
}}></div>
</div>

</div>
);
};

//export default BookPageTemplate;
window.BookPageTemplate = BookPageTemplate;
83
olmocr/bench/synth/templates/listpage.js
Normal file
@ -0,0 +1,83 @@
//import React from 'react';

const PermitGuidelinesTemplate = () => {
// Sample data - you can replace these with your own
const guidelineItems = [
{
number: 'iii.',
content: 'Not rely on personal preference or opinion, or regional interpretation of statute, regulation or guidance that is inconsistent with the Department\'s statewide interpretation. Staff should confer with the appropriate Bureau Director as necessary.'
},
{
number: 'iv.',
content: 'Process technically adequate and scientifically sound applications for final approval to minimize elapsed time in accordance with the Permit Decision Guarantee.'
},
{
number: 'v.',
content: 'Where the Application Manager determines that the technical information submitted with the application does not meet technical guidance or standards published by the Department, the application must provide the scientific or engineering basis to support the application. Note that deviations from technical guidance can generally be approved, by the appropriate section chief and manager, when warranted, provided acceptable justification has been submitted. Minor deficiencies that can be easily corrected should be addressed through a telephone call with the applicant and consultant, and may negate the need for a deficiency letter. The Program Manager or District Manager will be responsible for making that decision.'
},
{
number: 'vi.',
content: 'If an application fails to provide the technical information necessary to document that applicable regulatory and statutory requirements will be achieved, it is technically deficient and the Application Manager will prepare a technical deficiency letter. Again, all deficiencies noted must cite the statutory or regulatory obligation that the application has failed to meet and the Section Chief and the Program Manager will routinely review these letters. For District Oil and Gas Offices and District Mining Offices the Permits Chief and the Manager will review the letters.'
},
{
number: 'vii.',
content: 'Applicant responses that do not make the application technically adequate within the established response timeframe will be subject to the Elevated Review Process below. Applications that are made technically adequate within the established response timeframe will proceed to processing for final action.'
}
];

// Footnote data
const footnote = {
number: '2',
content: 'More technically complex projects and applications may receive additional deficiency letters as appropriate prior to a decision point. This exception will not void inclusion in the Permit Decision Guarantee and will follow program specific guidance that is developed. The more technically complex projects and applications are noted with an asterisk ("*") in Appendix A.'
};

// Document info
const documentInfo = "021-2100-001 / November 2, 2012 / Page 11";

// Special note about technical deficiency letter
const technicalDeficiencyNote = {
prefix: 'One',
superscript: '2',
content: ' technical deficiency letter will be sent. Each deficiency cited must note the statute, regulation or technical guidance provision. Technical guidance provides a means to compliance, but may not be used or cited when issuing a permit denial. The letter will state, as necessary, that the Permit Decision Guarantee is no longer applicable and offer the applicant an opportunity to meet and discuss the deficiencies. The letter will include a deadline for submission of the deficient information.'
};

return (
<div className="bg-white p-8 max-w-4xl mx-auto font-serif text-black">
<div className="mb-8">
{guidelineItems.map((item, index) => (
<div key={index} className="mb-6 flex">
<div className="w-12 flex-shrink-0 font-bold">{item.number}</div>
<div className="flex-grow">{item.content}</div>
</div>
))}

{/* Technical deficiency letter note */}
<div className="mb-6 ml-12">
<p>
{technicalDeficiencyNote.prefix}
<sup>{technicalDeficiencyNote.superscript}</sup>
{technicalDeficiencyNote.content}
</p>
</div>
</div>

{/* Horizontal line */}
<div className="border-t border-gray-400 my-6"></div>

{/* Footnote section */}
<div className="text-sm">
<p>
<sup>{footnote.number}</sup> {footnote.content}
</p>
</div>

{/* Document info */}
<div className="text-center mt-6 text-sm">
{documentInfo}
</div>
</div>
);
};

//export default PermitGuidelinesTemplate;
window.BookPageTemplate = PermitGuidelinesTemplate;
@ -1,16 +1,112 @@
import json
import re
import numpy as np
from bs4 import BeautifulSoup

from dataclasses import asdict, dataclass
from enum import Enum
from typing import List, Optional, Tuple
from typing import List, Optional, Tuple, Dict, Any

from fuzzysearch import find_near_matches
from rapidfuzz import fuzz


def parse_markdown_tables(md_content: str) -> List[np.ndarray]:
    """
    Extract and parse all markdown tables from the provided content.

    Args:
        md_content: The markdown content containing tables

    Returns:
        A list of numpy arrays, each representing a parsed table
    """
    # Extract all tables from markdown
    table_pattern = r'(\|(?:[^|]*\|)+)\s*\n\|(?:[:-]+\|)+\s*\n((?:\|(?:[^|]*\|)+\s*\n)+)'
    table_matches = re.finditer(table_pattern, md_content)

    parsed_tables = []

    for table_match in table_matches:
        # Extract header and body from the table match
        header_row = table_match.group(1).strip()
        body_rows = table_match.group(2).strip().split('\n')

        # Process header and rows to remove leading/trailing |
        header_cells = [cell.strip() for cell in header_row.split('|')]
        if header_cells[0] == '':
            header_cells = header_cells[1:]
        if header_cells[-1] == '':
            header_cells = header_cells[:-1]

        # Process table body rows
        table_data = []
        for row in [header_row] + body_rows:
            if '|' not in row:  # Skip lines that are not table rows
                continue

            cells = [cell.strip() for cell in row.split('|')]
            if cells[0] == '':
                cells = cells[1:]
            if cells[-1] == '':
                cells = cells[:-1]

            table_data.append(cells)

        # Skip separator row (second row with dashes)
        if len(table_data) > 1 and all('-' in cell for cell in table_data[1]):
            table_data = [table_data[0]] + table_data[2:]

        # Convert to numpy array for easier manipulation
        # First ensure all rows have the same number of columns by padding if necessary
        max_cols = max(len(row) for row in table_data)
        padded_data = [row + [''] * (max_cols - len(row)) for row in table_data]
        table_array = np.array(padded_data)

        parsed_tables.append(table_array)

    return parsed_tables

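# Illustrative usage (not part of the diff): a minimal sketch of what
# parse_markdown_tables returns for a small two-column table. The sample
# strings and expected shape below are assumptions based on the parsing
# logic above, not repository test data.
sample_md = (
    "| Name | Score |\n"
    "|------|-------|\n"
    "| A    | 1     |\n"
    "| B    | 2     |\n"
)
md_arrays = parse_markdown_tables(sample_md)
assert len(md_arrays) == 1
assert md_arrays[0].shape == (3, 2)   # header row plus two body rows
assert md_arrays[0][0, 1] == "Score"
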
def parse_html_tables(html_content: str) -> List[np.ndarray]:
    """
    Extract and parse all HTML tables from the provided content.

    Args:
        html_content: The HTML content containing tables

    Returns:
        A list of numpy arrays, each representing a parsed table
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    tables = soup.find_all('table')

    parsed_tables = []

    for table in tables:
        rows = table.find_all(['tr'])
        table_data = []

        for row in rows:
            cells = row.find_all(['th', 'td'])
            row_data = [cell.get_text().strip() for cell in cells]
            table_data.append(row_data)

        # Ensure all rows have the same number of columns
        if table_data:
            max_cols = max(len(row) for row in table_data)
            padded_data = [row + [''] * (max_cols - len(row)) for row in table_data]
            table_array = np.array(padded_data)
            parsed_tables.append(table_array)

    return parsed_tables

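# Illustrative usage (not part of the diff): parse_html_tables should yield an
# equivalent array for the same data expressed as an HTML <table>; the sample
# markup here is an assumption for illustration.
sample_html = "<table><tr><th>Name</th><th>Score</th></tr><tr><td>A</td><td>1</td></tr></table>"
html_arrays = parse_html_tables(sample_html)
assert html_arrays[0].shape == (2, 2)
assert html_arrays[0][1, 0] == "A"
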
class TestType(str, Enum):
    PRESENT = "present"
    ABSENT = "absent"
    ORDER = "order"
    TABLE = "table"


class TestChecked(str, Enum):
@ -41,18 +137,16 @@ class BasePDFTest:
    page: int
    id: str
    type: str
    threshold: float = 1.0
    max_diffs: int = 0
    checked: Optional[TestChecked] = None

    def __post_init__(self):
        self.threshold = float(self.threshold)

        if not self.pdf:
            raise ValidationError("PDF filename cannot be empty")
        if not self.id:
            raise ValidationError("Test ID cannot be empty")
        if not isinstance(self.threshold, float) or not (0 <= self.threshold <= 1):
            raise ValidationError(f"Threshold must be a float between 0 and 1, got {self.threshold}")
        if not isinstance(self.max_diffs, int) or self.max_diffs < 0:
raise ValidationError(f"Max diffs must be positive number or 0")
|
||||
if self.type not in {t.value for t in TestType}:
|
||||
raise ValidationError(f"Invalid test type: {self.type}")
|
||||
|
||||
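# Illustrative only (not part of the diff): the validation above should reject an
# out-of-range threshold. The pdf and text field names are assumed from references
# elsewhere in this file; their declarations fall outside the hunks shown here.
try:
    TextPresenceTest(pdf="doc.pdf", page=1, id="t1", type="present", text="hello", threshold=1.5)
except ValidationError as err:
    print(f"Rejected as expected: {err}")
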
@ -90,7 +184,8 @@ class TextPresenceTest(BasePDFTest):

    def run(self, md_content: str) -> Tuple[bool, str]:
        reference_query = self.text
        threshold = self.threshold
        # Threshold for fuzzy matching derived from max_diffs
        threshold = 1.0 - (self.max_diffs / (len(reference_query) if len(reference_query) > 0 else 1))
        best_ratio = fuzz.partial_ratio(reference_query, md_content) / 100.0

        if self.type == TestType.PRESENT.value:
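# Worked instance of the new threshold rule (illustrative numbers): with
# max_diffs=2 and a 20-character reference string, the fuzzy-match ratio must
# reach 1.0 - (2 / 20) = 0.9 for a "present" test to pass.
max_diffs, ref_len = 2, 20
required_ratio = 1.0 - (max_diffs / ref_len)   # 0.9
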
@ -130,15 +225,13 @@ class TextOrderTest(BasePDFTest):
            raise ValidationError("After field cannot be empty")

    def run(self, md_content: str) -> Tuple[bool, str]:
        threshold = self.threshold
        max_l_dist = round((1.0 - threshold) * len(self.before))
        before_matches = find_near_matches(self.before, md_content, max_l_dist=max_l_dist)
        after_matches = find_near_matches(self.after, md_content, max_l_dist=max_l_dist)
        before_matches = find_near_matches(self.before, md_content, max_l_dist=self.max_diffs)
        after_matches = find_near_matches(self.after, md_content, max_l_dist=self.max_diffs)

        if not before_matches:
            return False, f"'before' text '{self.before[:40]}...' not found with max_l_dist {max_l_dist}"
            return False, f"'before' text '{self.before[:40]}...' not found with max_l_dist {self.max_diffs}"
        if not after_matches:
            return False, f"'after' text '{self.after[:40]}...' not found with max_l_dist {max_l_dist}"
            return False, f"'after' text '{self.after[:40]}...' not found with max_l_dist {self.max_diffs}"

        for before_match in before_matches:
            for after_match in after_matches:
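# Illustrative only (not part of the diff): fuzzysearch tolerates up to
# max_l_dist edits, so with max_diffs=1 a single-character OCR error in the
# 'before' text still matches. The strings here are invented for the example.
hits = find_near_matches("Permit Decision", "Permlt Decision Guarantee", max_l_dist=1)
assert hits and hits[0].dist <= 1
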
@ -147,6 +240,167 @@ class TextOrderTest(BasePDFTest):
        return False, (f"Could not find a location where '{self.before[:40]}...' appears before " f"'{self.after[:40]}...'.")


@dataclass
class TableTest(BasePDFTest):
"""
|
||||
Test to verify certain properties of a table are held, namely that some cells appear relative to other cells correctly
|
||||
"""
|
||||
    # This is the target cell, which must exist in at least one place in the table
    cell: str

    # These properties say that the cell immediately up/down/left/right of the target cell has the string specified
    up: str = ""
    down: str = ""
    left: str = ""
    right: str = ""

    # These properties say that the cell all the way up, or all the way left of the target cell (ex. headings) has the string value specified
    top_heading: str = ""
    left_heading: str = ""

    def __post_init__(self):
        super().__post_init__()
        if self.type != TestType.TABLE.value:
            raise ValidationError(f"Invalid type for TableTest: {self.type}")

    def run(self, content: str) -> Tuple[bool, str]:
        """
        Run the table test on provided content.

        Finds all tables (markdown and HTML) in the content and checks if any cell
        matches the target cell and satisfies the specified relationships.

        Args:
            content: The content containing tables (markdown or HTML)

        Returns:
            A tuple (passed, explanation) where 'passed' is True if the test passes,
            and 'explanation' provides details when the test fails.
        """
        # Initialize variables to track tables and results
        tables_to_check = []
        failed_reasons = []

        # Threshold for fuzzy matching derived from max_diffs
        threshold = 1.0 - (self.max_diffs / (len(self.cell) if len(self.cell) > 0 else 1))

        # Parse both markdown and HTML tables from the content
        md_tables = parse_markdown_tables(content)
        tables_to_check.extend(md_tables)

        html_tables = parse_html_tables(content)
        tables_to_check.extend(html_tables)

        # If no tables found, return failure
        if not tables_to_check:
            return False, "No tables found in the content"

        # Check each table
        for table_array in tables_to_check:
            # Find all cells that match the target cell using fuzzy matching
            matches = []
            for i in range(table_array.shape[0]):
                for j in range(table_array.shape[1]):
                    cell_content = table_array[i, j]
                    similarity = fuzz.ratio(self.cell, cell_content) / 100.0

                    if similarity >= threshold:
                        matches.append((i, j))

            # If no matches found in this table, continue to the next table
            if not matches:
                continue

            # Check the relationships for each matching cell
            for row_idx, col_idx in matches:
                all_relationships_satisfied = True
                current_failed_reasons = []

                # Check up relationship
                if self.up and row_idx > 0:
                    up_cell = table_array[row_idx - 1, col_idx]
                    up_similarity = fuzz.ratio(self.up, up_cell) / 100.0
                    if up_similarity < threshold:
                        all_relationships_satisfied = False
                        current_failed_reasons.append(f"Cell above '{up_cell}' doesn't match expected '{self.up}' (similarity: {up_similarity:.2f})")

                # Check down relationship
                if self.down and row_idx < table_array.shape[0] - 1:
                    down_cell = table_array[row_idx + 1, col_idx]
                    down_similarity = fuzz.ratio(self.down, down_cell) / 100.0
                    if down_similarity < threshold:
                        all_relationships_satisfied = False
                        current_failed_reasons.append(f"Cell below '{down_cell}' doesn't match expected '{self.down}' (similarity: {down_similarity:.2f})")

                # Check left relationship
                if self.left and col_idx > 0:
                    left_cell = table_array[row_idx, col_idx - 1]
                    left_similarity = fuzz.ratio(self.left, left_cell) / 100.0
                    if left_similarity < threshold:
                        all_relationships_satisfied = False
                        current_failed_reasons.append(f"Cell to the left '{left_cell}' doesn't match expected '{self.left}' (similarity: {left_similarity:.2f})")

                # Check right relationship
                if self.right and col_idx < table_array.shape[1] - 1:
                    right_cell = table_array[row_idx, col_idx + 1]
                    right_similarity = fuzz.ratio(self.right, right_cell) / 100.0
                    if right_similarity < threshold:
                        all_relationships_satisfied = False
                        current_failed_reasons.append(f"Cell to the right '{right_cell}' doesn't match expected '{self.right}' (similarity: {right_similarity:.2f})")

                # Check top heading relationship
                if self.top_heading and row_idx > 0:
                    # Find the first non-empty cell in the same column (starting from the top)
                    top_heading_cell = ""
                    for i in range(row_idx):
                        if table_array[i, col_idx].strip():
                            top_heading_cell = table_array[i, col_idx]
                            break

                    if not top_heading_cell:
                        all_relationships_satisfied = False
                        current_failed_reasons.append(f"No non-empty top heading found in column {col_idx}")
                    else:
                        top_similarity = fuzz.ratio(self.top_heading, top_heading_cell) / 100.0
                        if top_similarity < threshold:
                            all_relationships_satisfied = False
                            current_failed_reasons.append(f"Top heading '{top_heading_cell}' doesn't match expected '{self.top_heading}' (similarity: {top_similarity:.2f})")

                # Check left heading relationship
                if self.left_heading and col_idx > 0:
                    # Find the first non-empty cell in the same row (starting from the left)
                    left_heading_cell = ""
                    for j in range(col_idx):
                        if table_array[row_idx, j].strip():
                            left_heading_cell = table_array[row_idx, j]
                            break

                    if not left_heading_cell:
                        all_relationships_satisfied = False
                        current_failed_reasons.append(f"No non-empty left heading found in row {row_idx}")
                    else:
                        left_heading_similarity = fuzz.ratio(self.left_heading, left_heading_cell) / 100.0
                        if left_heading_similarity < threshold:
                            all_relationships_satisfied = False
                            current_failed_reasons.append(f"Left heading '{left_heading_cell}' doesn't match expected '{self.left_heading}' (similarity: {left_heading_similarity:.2f})")

                # If all relationships are satisfied for this cell, the test passes
                if all_relationships_satisfied:
                    return True, ""
                else:
                    failed_reasons.extend(current_failed_reasons)

        # If we've gone through all tables and all matching cells and none satisfied all relationships
        if not failed_reasons:
            return False, f"No cell matching '{self.cell}' found in any table with threshold {threshold}"
        else:
            return False, f"Found cells matching '{self.cell}' but relationships were not satisfied: {'; '.join(failed_reasons)}"


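# Illustrative usage (not part of the diff): a TableTest wired up against a small
# markdown table. All field values and the sample table are invented for the
# example; only the field names come from the dataclass above.
sample_table_md = (
    "| Quarter | Revenue | Cost |\n"
    "|---------|---------|------|\n"
    "| Q1      | 100     | 40   |\n"
    "| Q2      | 120     | 55   |\n"
)
table_test = TableTest(pdf="report.pdf", page=1, id="report_table_01", type="table",
                       cell="120", left="Q2", top_heading="Revenue")
passed, explanation = table_test.run(sample_table_md)
assert passed, explanation
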
def load_tests(jsonl_file: str) -> List[BasePDFTest]:
    """
    Load tests from a JSONL file.
@ -171,6 +425,8 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
                test = TextPresenceTest(**data)
            elif test_type == TestType.ORDER.value:
                test = TextOrderTest(**data)
            elif test_type == TestType.TABLE.value:
                test = TableTest(**data)
            else:
                raise ValidationError(f"Unknown test type: {test_type}")

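# Illustrative round trip (not part of the diff): save_tests (next hunk) writes one
# JSON object per line, and load_tests should route a "table" record back to
# TableTest. The file name and field values here are hypothetical.
example = TableTest(pdf="report.pdf", page=1, id="report_table_01", type="table", cell="Total", right="42")
save_tests([example], "example_tests.jsonl")
loaded = load_tests("example_tests.jsonl")
assert isinstance(loaded[0], TableTest) and loaded[0].right == "42"
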
@ -195,4 +451,4 @@ def save_tests(tests: List[BasePDFTest], jsonl_file: str) -> None:
    """
    with open(jsonl_file, "w") as file:
        for test in tests:
            file.write(json.dumps(asdict(test)) + "\n")
            file.write(json.dumps(asdict(test)) + "\n")
@ -643,7 +643,7 @@ async def sglang_server_ready():
            else:
                logger.info(f"Attempt {attempt}: Unexpected status code {response.status_code}")
        except Exception as e:
            logger.warning(f"Attempt {attempt}: {e}")
            logger.warning(f"Attempt {attempt}: Please wait for sglang server to become ready...")

        await asyncio.sleep(delay_sec)