Jake Poznanski 2025-03-05 19:37:10 +00:00
commit dbbe6cea11
20 changed files with 1127 additions and 53 deletions

View File

@@ -96,7 +96,7 @@ def evaluate_candidate(
if test_avg < 1.0:
test_failures.append(
f"Test {test.id} on {md_base} average pass ratio: {test_avg:.3f} ({repeat_passes}/{num_repeats} repeats passed). "
f"Example explanation: {explanations[0] if explanations else 'No explanation'}"
f"Ex: {explanations[0] if explanations else 'No explanation'}"
)
test_type_breakdown[test_type].append(test_avg)
@@ -183,7 +183,6 @@ def main():
else:
status = f"{overall_score * 100:0.1f}%"
print(f"{candidate_name:20s} : Average Score: {overall_score * 100:0.1f}% over {total_tests:3d} tests - {status}")
print(" Breakdown by test type:")
for ttype, scores in test_type_breakdown.items():
if scores:
avg = sum(scores) / len(scores) * 100
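
For orientation, here is a minimal sketch of the reporting step this hunk touches, assuming `overall_score` is a 0–1 fraction and `test_type_breakdown` maps each test type to the per-test average pass ratios collected above (both names come from the diff; the exact call site is an assumption):

```python
def print_summary(candidate_name: str, overall_score: float, total_tests: int, test_type_breakdown: dict) -> None:
    """Print the candidate's overall score followed by a per-test-type breakdown."""
    print(f"{candidate_name:20s} : Average Score: {overall_score * 100:0.1f}% over {total_tests:3d} tests")
    print("    Breakdown by test type:")
    for ttype, scores in sorted(test_type_breakdown.items()):
        if scores:
            print(f"        {ttype:10s}: {sum(scores) / len(scores) * 100:0.1f}% over {len(scores)} tests")
```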

View File

@@ -40,7 +40,7 @@ def parse_method_arg(method_arg):
return name, kwargs, folder_name
async def process_pdfs(config, pdf_directory, data_directory, repeats):
async def process_pdfs(config, pdf_directory, data_directory, repeats, force):
"""Process PDFs with both sync and async functions"""
for candidate in config.keys():
print(f"Starting conversion using {candidate} with kwargs: {config[candidate]['kwargs']}")
@@ -52,10 +52,21 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
kwargs = config[candidate]["kwargs"]
is_async = asyncio.iscoroutinefunction(method)
for pdf_path in tqdm(glob.glob(os.path.join(pdf_directory, "*.pdf")), desc=candidate):
all_pdfs = glob.glob(os.path.join(pdf_directory, "*.pdf"))
all_pdfs.sort()
for pdf_path in tqdm(all_pdfs, desc=candidate):
base_name = os.path.basename(pdf_path).replace(".pdf", "")
for i in range(1, repeats + 1):
output_filename = f"{base_name}_{i}.md"
output_path = os.path.join(candidate_output_dir, output_filename)
if os.path.exists(output_path) and not force:
print(f"Skipping {base_name}_{i} for {candidate}, file already exists")
print("Rerun with --force flag to force regeneration")
continue
try:
if is_async:
# Run async function
@@ -70,8 +81,6 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
print(f"Warning, did not get output for {base_name}_{i}")
continue
output_filename = f"{base_name}_{i}.md"
output_path = os.path.join(candidate_output_dir, output_filename)
with open(output_path, "w") as out_f:
out_f.write(markdown)
@@ -86,6 +95,8 @@ if __name__ == "__main__":
"Use 'name=folder_name' to specify a custom output folder name.",
)
parser.add_argument("--repeats", type=int, default=1, help="Number of times to repeat the conversion for each PDF.")
parser.add_argument("--dir", type=str, default=os.path.join(os.path.dirname(__file__), "sample_data"), help="Path to the data folder in which to save outputs, pdfs should be in /pdfs folder within it.")
parser.add_argument("--force", action="store_true", default=False, help="Force regenerating of output files, even if they already exist")
args = parser.parse_args()
# Mapping of method names to a tuple: (module path, function name)
@@ -109,8 +120,8 @@ if __name__ == "__main__":
function = getattr(module, function_name)
config[method_name] = {"method": function, "kwargs": extra_kwargs, "folder_name": folder_name}
data_directory = os.path.join(os.path.dirname(__file__), "mining_data")
data_directory = args.dir
pdf_directory = os.path.join(data_directory, "pdfs")
# Run the async process function
asyncio.run(process_pdfs(config, pdf_directory, data_directory, args.repeats))
asyncio.run(process_pdfs(config, pdf_directory, data_directory, args.repeats, args.force))
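
As context for the new ordering and naming, here is a short sketch of the output layout `process_pdfs` now produces: sorted PDFs, one `{base_name}_{i}.md` file per repeat. The comparison script later strips the `_{i}` suffix with `re.sub(r"_\d+\.md$", ...)`, so this scheme matters. The helper name below is hypothetical:

```python
import glob
import os

def expected_outputs(pdf_directory: str, candidate_output_dir: str, repeats: int):
    """Yield the markdown paths the conversion writes: sorted PDFs, one file per repeat index."""
    for pdf_path in sorted(glob.glob(os.path.join(pdf_directory, "*.pdf"))):
        base_name = os.path.basename(pdf_path).replace(".pdf", "")
        for i in range(1, repeats + 1):
            yield os.path.join(candidate_output_dir, f"{base_name}_{i}.md")
```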

View File

@@ -119,7 +119,7 @@ def compare_votes_for_file(base_pdf_file: str, base_pdf_page: int, base_text: st
best_candidate = c_sentence # Keep original capitalization for output
# Append the candidate if it passes the similarity threshold (e.g., 0.5)
if best_ratio > 0.7 and best_candidate is not None:
if best_ratio > 0.5 and best_candidate is not None:
votes.append(best_candidate.strip())
# Only consider variants that differ when compared case-insensitively
@@ -191,13 +191,6 @@ def main():
# Collect all .md files from the base and compare folders
base_files = [f for f in os.listdir(base_path) if f.endswith(".md")]
compare_files = [f for f in os.listdir(compare_path) if f.endswith(".md")]
# Read all candidate texts at once
candidate_texts = []
for cf in compare_files:
with open(os.path.join(compare_path, cf), "r", encoding="utf-8") as f:
candidate_texts.append(f.read())
all_tests = []
@@ -207,6 +200,17 @@ def main():
with open(base_file_path, "r", encoding="utf-8") as f:
base_text = f.read()
compare_files = [f for f in os.listdir(compare_path) if f.endswith(".md") and re.sub(r"_\d+\.md$", "", f) == re.sub(r"_\d+\.md$", "", bf)]
if not compare_files:
print(f"skipping {bf} nothing to compare against")
# Read all candidate texts at once
candidate_texts = []
for cf in compare_files:
with open(os.path.join(compare_path, cf), "r", encoding="utf-8") as f:
candidate_texts.append(f.read())
base_pdf_file = get_pdf_from_md(base_file_path)
base_pdf_page = 1
print(f"Results for base file: {bf}")

View File

@@ -0,0 +1,33 @@
Recently Issued Accounting Pronouncements
Recently Adopted Accounting Pronouncement
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
Recent Accounting Pronouncements Not Yet Adopted
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
Note 2 - Business Combination
Termination of the Arm Share Purchase Agreement
In February 2022, NVIDIA and SoftBank Group Corp., or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
Note 3 - Stock-Based Compensation
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
| Year Ended | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
|---------------------|-------------|-------------|-------------|
| | (In millions) | | |
| Cost of revenue | $178 | $141 | $138 |
| Research and development | $3,423 | $2,532 | $1,892 |
| Sales, general and administrative | $1,136 | $876 | $680 |
| Total | $4,737 | $3,549 | $2,710 |
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.

View File

@@ -1,29 +1,52 @@
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_00", "type": "present", "text": "Corporate social responsibility and the tobacco industry: hope or hype?", "threshold": 0.99}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_01", "type": "present", "text": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths of some 754 600 smokers, and Philip Morris some 803 600 smokers.", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_02", "type": "present", "text": "The term \"corporate social responsibility\" is in vogue at the moment but as a concept it is vague and means different things to different people.", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_03", "type": "present", "text": "Over the past three decades increasing pressure from non-governmental", "threshold": 1.0}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_04", "type": "absent", "text": "Downloaded from http://tobaccocontrol.bmj.com/", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_00", "type": "present", "text": "Corporate social responsibility and the tobacco industry: hope or hype?"}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_01", "type": "present", "text": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths of some 754 600 smokers, and Philip Morris some 803 600 smokers."}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_02", "type": "present", "text": "The term \"corporate social responsibility\" is in vogue at the moment but as a concept it is vague and means different things to different people.", "max_diffs": 2}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_03", "type": "present", "text": "Over the past three decades increasing pressure from non-governmental"}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_04", "type": "absent", "text": "Downloaded from http://tobaccocontrol.bmj.com/"}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_10", "type": "order", "before": "Corporate social responsibility and the tobacco industry: hope or hype?", "after": "The unprecedented expansion of power and influence of TNCs over the past three decades has accelerated global trade and development, but also environmental damage and abuses of", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_11", "type": "order", "before": "It now looks like that with vigilance", "after": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_12", "type": "order", "before": "Corporate social responsibility (CSR) emerged from a realisation among transnational corporations", "after": " perspective on its own behaviour; and reflects on whether marketing tobacco is antithetical to social responsibility.", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_10", "type": "order", "before": "Corporate social responsibility and the tobacco industry: hope or hype?", "after": "The unprecedented expansion of power and influence of TNCs over the past three decades has accelerated global trade and development, but also environmental damage and abuses of", "max_diffs": 2}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_11", "type": "order", "before": "It now looks like that with vigilance", "after": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths", "max_diffs": 2}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_12", "type": "order", "before": "Corporate social responsibility (CSR) emerged from a realisation among transnational corporations", "after": " perspective on its own behaviour; and reflects on whether marketing tobacco is antithetical to social responsibility.", "max_diffs": 2}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_00", "type": "present", "text": "Table 4: Baseline model performance on each of the three scoring metrics", "threshold": 1.0}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_01", "type": "present", "text": "Table 5: Baseline model performance on each of the three scoring metrics", "threshold": 1.0}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_02", "type": "present", "text": "We use the GPT-4O model for all our agents due to its higher performance and lower cost compared to other models. For space we provide", "threshold": 0.99}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_00", "type": "present", "text": "Table 4: Baseline model performance on each of the three scoring metrics"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_01", "type": "present", "text": "Table 5: Baseline model performance on each of the three scoring metrics"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_02", "type": "present", "text": "We use the GPT-4O model for all our agents due to its higher performance and lower cost compared to other models. For space we provide"}
{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_00", "type": "present", "threshold": 1, "checked": "verified", "text": "The-Stack-V2"}
{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_01", "type": "present", "threshold": 1, "checked": "verified", "text": "SE, whatever we've scraped"}
{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_02", "type": "present", "threshold": 1, "checked": "verified", "text": "HQ DCLM"}
{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_00", "type": "present", "checked": "verified", "text": "The-Stack-V2"}
{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_01", "type": "present", "checked": "verified", "text": "SE, whatever we've scraped"}
{"pdf": "mattsnotes.pdf", "page": 1, "id": "mattsnotes_minediff_02", "type": "present", "checked": "verified", "text": "HQ DCLM"}
{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_00", "type": "present", "threshold": 1, "checked": "verified", "text": "January 10th 1864."}
{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_01", "type": "present", "threshold": 1, "checked": "verified", "text": "Major General Hitchcock, Commissioner of Exchanges, is authorized and directed to offer Brigadier General Trimble, now a prisoner of war in Fort McHenry, in exchange for Major White, who is held as a prisoner at Richmond."}
{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_03", "type": "present", "threshold": 1, "checked": "verified", "text": "He is also directed to send forward the offer of exchange by Henry M. Warfield, Esq. of Baltimore, under a flag of truce, and give him a pass to City Point."}
{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_00", "type": "present", "checked": "verified", "text": "January 10th 1864."}
{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_01", "type": "present", "checked": "verified", "text": "Major General Hitchcock, Commissioner of Exchanges, is authorized and directed to offer Brigadier General Trimble, now a prisoner of war in Fort McHenry, in exchange for Major White, who is held as a prisoner at Richmond."}
{"pdf": "lincoln_letter.pdf", "page": 1, "id": "lincoln_letter_minediff_03", "type": "present", "checked": "verified", "text": "He is also directed to send forward the offer of exchange by Henry M. Warfield, Esq. of Baltimore, under a flag of truce, and give him a pass to City Point."}
{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_02", "type": "present", "checked": "verified", "text": "Use the graph of the position function to determine the time intervals when the velocity is positive, negative, or zero."}
{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_03", "type": "present", "checked": "verified", "text": "Use the graph of the velocity function to determine the time intervals when the acceleration is positive, negative, or zero."}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_01", "type": "present", "checked": "verified", "text": "This report first provides the context and development of CSR; then, from internal company documents, examines how PM came to its own version."}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_02", "type": "present", "checked": "verified", "text": "This paper examines whether a tobacco company espousing CSR should be judged simply as a corporate entity along standards of business ethics, or as an irretrievably negative force in the realm of public health, thereby rendering CSR an oxymoron."}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_minediff_00", "type": "present", "checked": "verified", "text": "Table 1 Composition of the pretraining data for OLMo 2."}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table00", "type": "table", "cell": "Type"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table01", "type": "table", "cell": "3.32T", "left": "3.71T"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table02", "type": "table", "cell": "3.32T", "right": "21.32T"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table03", "type": "table", "cell": "11.8B", "up": "12.2B"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table04", "type": "table", "cell": "11.8B", "down": "3.7B"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table05", "type": "table", "cell": "3.32T", "top_heading": "Words"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table06", "type": "table", "cell": "arXiv", "top_heading": "Source"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table07", "type": "table", "cell": "47.2B", "top_heading": "Bytes"}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_table08", "type": "table", "cell": "Math proofs code", "left_heading": "Algebraic Stack"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Quadratic regression", "left": "Challenge"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Instrument Use", "left": "Normal"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "0.87", "top_heading": "Procedure"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "0.87", "top_heading": "ReACT"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Pick-and-place object", "left_heading": "27"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "0.66", "right": "0.44"}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "olmo2-discoverworld_crazy_table4_t00", "type": "table", "cell": "Interact with a moving agent", "top_heading": "Unit Test Topic"}
{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_02", "type": "present", "threshold": 1, "checked": "verified", "text": "Use the graph of the position function to determine the time intervals when the velocity is positive, negative, or zero."}
{"pdf": "openstax_caculus_pg_273.pdf", "page": 1, "id": "openstax_caculus_pg_273_minediff_03", "type": "present", "threshold": 1, "checked": "verified", "text": "Use the graph of the velocity function to determine the time intervals when the acceleration is positive, negative, or zero."}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_01", "type": "present", "threshold": 1, "checked": "verified", "text": "This report first provides the context and development of CSR; then, from internal company documents, examines how PM came to its own version."}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_minediff_02", "type": "present", "threshold": 1, "checked": "verified", "text": "This paper examines whether a tobacco company espousing CSR should be judged simply as a corporate entity along standards of business ethics, or as an irretrievably negative force in the realm of public health, thereby rendering CSR an oxymoron."}
{"pdf": "olmo2-pg4.pdf", "page": 1, "id": "olmo2-pg4_minediff_00", "type": "present", "threshold": 1, "checked": "verified", "text": "Table 1 Composition of the pretraining data for OLMo 2."}

View File

@@ -0,0 +1,55 @@
Table of Contents
NVIDIA Corporation and Subsidiaries
Notes to the Consolidated Financial Statements
(Continued)
Recently Issued Accounting Pronouncements
Recently Adopted Accounting Pronouncement
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in
operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in
the financial statements, and the financial statement of further information.
Recent Accounting Pronouncements Not Yet Adopted
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of
information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the
adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis,
including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within
each income statement expense option, as applicable. We expect to adopt this standard in our fiscal year 2025 annual report. We do not expect the adoption of
the net assets of the Company in connection our Consolidated Financial Statements other than additional disclosures.
Note 2 - Business Combination
Termination of the Arm Share Purchase Agreement
In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have
acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We
recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 feeling the write-off of the prepayment provided at signing.
Note 3 - Stock-Based Compensation
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
Consolidated Statements of income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost
of revenue, as follows:
Year Ended
Jan 26, 2025
Jan 28, 2024
Jan 29, 2023
(In millions)
Cost of revenue
$
178
$
141
$
138
Research and development
3,423
2,532
1,092
Sales, general and administrative
1,136
676
680
Total
$
4,737
$
3,549
$
2,710
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
62

View File

@@ -0,0 +1,40 @@
### **Table of Contents**
**NVIDIA Corporation and Subsidiaries Notes to the Consolidated Financial Statements** (Continued)
**Recently Issued Accounting Pronouncements**
#### **Recently Adopted Accounting Pronouncement**
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
#### **Recent Accounting Pronouncements Not Yet Adopted**
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
# **Note 2 - Business Combination**
#### **Termination of the Arm Share Purchase Agreement**
In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of \$1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
# **Note 3 - Stock-Based Compensation**
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
| | Year Ended | | | | | |
|-----------------------------------|--------------|----|---------------|----|--------------|--|
| | Jan 26, 2025 | | Jan 28, 2024 | | Jan 29, 2023 | |
| | | | (In millions) | | | |
| Cost of revenue | \$<br>178 | \$ | 141 | \$ | 138 | |
| Research and development | 3,423 | | 2,532 | | 1,892 | |
| Sales, general and administrative | 1,136 | | 876 | | 680 | |
| Total | \$<br>4,737 | \$ | 3,549 | \$ | 2,710 | |
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.
## 62

View File

@@ -0,0 +1,32 @@
Recently Issued Accounting Pronouncements
Recently Adopted Accounting Pronouncement
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
Recent Accounting Pronouncements Not Yet Adopted
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
Note 2 - Business Combination
Termination of the Arm Share Purchase Agreement
In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
Note 3 - Stock-Based Compensation
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
| | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
|---------------------|-------------|-------------|-------------|
| Cost of revenue | $178 | $141 | $138 |
| Research and development | 3,423 | 2,532 | 1,892 |
| Sales, general and administrative | 1,136 | 876 | 680 |
| Total | $4,737 | $3,549 | $2,710 |
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.

View File

@@ -0,0 +1,33 @@
Recently Issued Accounting Pronouncements
Recently Adopted Accounting Pronouncement
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
Recent Accounting Pronouncements Not Yet Adopted
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
Note 2 - Business Combination
Termination of the Arm Share Purchase Agreement
In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
Note 3 - Stock-Based Compensation
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
| | Year Ended |
|------------------------|------------|
| | Jan 29, 2023 | Jan 28, 2024 | Jan 29, 2023 |
| Cost of revenue | $ 178 | $ 141 | $ 138 |
| Research and development | 3,423 | 2,532 | 1,892 |
| Sales, general and administrative | 1,136 | 876 | 680 |
| Total | $ 4,737 | $ 3,549 | $ 2,710 |
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.

View File

@@ -0,0 +1,33 @@
Recently Issued Accounting Pronouncements
Recently Adopted Accounting Pronouncement
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
Recent Accounting Pronouncements Not Yet Adopted
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
Note 2 - Business Combination
Termination of the Arm Share Purchase Agreement
In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
Note 3 - Stock-Based Compensation
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
| | Year Ended |
|----------------------|---------------------|
| | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
| Cost of revenue | $ 4,737 | $ 3,549 | $ 2,710 |
| Research and development | 3,423 | 2,532 | 1,892 |
| Sales, general and administrative | 1,136 | 876 | 680 |
| Total | $ 9,300 | $ 6,997 | $ 5,282 |
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.

View File

@@ -0,0 +1,29 @@
Recently Issued Accounting Pronouncements
Recently Adopted Accounting Pronouncement
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
Recent Accounting Pronouncements Not Yet Adopted
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
Note 2 - Business Combination
Termination of the Arm Share Purchase Agreement
In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
Note 3 - Stock-Based Compensation
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
| | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
|----------------|-------------|-------------|-------------|
| Cost of revenue| $3,549 | $3,423 | $2,532 |
| Research and development| $1,892 | $2,710 | $1,136 |
| Sales, general and administrative| $138 | $141 | $178 |
| Total | $4,737 | $4,774 | $3,549 |
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.

View File

@@ -0,0 +1,29 @@
Recently Issued Accounting Pronouncements
Recently Adopted Accounting Pronouncement
In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
Recent Accounting Pronouncements Not Yet Adopted
In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
Note 2 - Business Combination
Termination of the Arm Share Purchase Agreement
In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
Note 3 - Stock-Based Compensation
Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
| Year Ended | Jan 26, 2025 | Jan 28, 2024 | Jan 29, 2023 |
|------------|--------------|--------------|--------------|
| Cost of revenue | $4,737 (In millions) | $3,549 | $2,710 |
| Research and development | 3,423 | 2,532 | 1,892 |
| Sales, general and administrative | 1,136 | 876 | 680 |
| Total | $4,737 | $3,549 | $2,710 |
Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.

Binary file not shown.

View File

@@ -0,0 +1,18 @@
#!/bin/bash
set -e
# Assuming olmocr env already exists
source activate olmocr
python -m olmocr.bench.convert olmocr --repeats 5
pip install marker-pdf
python -m olmocr.bench.convert marker
pip install verovio
python -m olmocr.bench.convert gotocr
python -m olmocr.bench.convert chatgpt
#python -m olmocr.bench.convert mineru

View File

@@ -0,0 +1,182 @@
#!/usr/bin/env python3
import os
import asyncio
from pathlib import Path
from playwright.async_api import async_playwright
# Simple configuration
CONFIG = {
"input_file": os.path.join(os.path.dirname(__file__), "templates", "listpage.js"), # React component file
"output_pdf": "book-page.pdf", # Output PDF filename
"temp_html": "temp-render.html", # Temporary HTML file
"wait_time": 1500, # Time to wait for rendering (ms)
"device_scale": 2, # Resolution multiplier
"debug": True # Keep temp files for debugging
}
async def create_html_file():
"""Create a temporary HTML file that loads the React component from a file."""
try:
# Check if input file exists
input_path = Path(CONFIG["input_file"])
if not input_path.exists():
print(f"Error: Input file '{input_path}' not found")
return False
# Read the component file
with open(input_path, 'r', encoding='utf-8') as f:
component_code = f.read()
# Create HTML that will load our component
html_content = """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Book Page Template</title>
<script src="https://unpkg.com/react@17/umd/react.development.js"></script>
<script src="https://unpkg.com/react-dom@17/umd/react-dom.development.js"></script>
<script src="https://unpkg.com/@babel/standalone/babel.min.js"></script>
<style>
* {
box-sizing: border-box;
}
html, body {
margin: 0;
padding: 0;
width: 8.5in;
height: 11in;
overflow: hidden;
}
#root {
width: 100%;
height: 100%;
padding: 0.25in;
overflow: hidden;
}
@media print {
body {
-webkit-print-color-adjust: exact;
print-color-adjust: exact;
}
}
</style>
</head>
<body>
<div id="root"></div>
<script type="text/babel">
// The React component code loaded from external file
""" + component_code + """
// Render only the book page part, not the controls
ReactDOM.render(
<BookPageTemplate />,
document.getElementById('root')
);
</script>
</body>
</html>
"""
with open(CONFIG["temp_html"], 'w', encoding='utf-8') as f:
f.write(html_content)
print(f"Created HTML file: {CONFIG['temp_html']}")
print(f"Using React component from: {CONFIG['input_file']}")
return True
except Exception as e:
print(f"Error creating HTML file: {e}")
print(f"Exception details: {str(e)}")
import traceback
traceback.print_exc()
return False
async def render_to_pdf():
"""Render the React component to PDF using Playwright."""
try:
# Create the HTML file first
html_created = await create_html_file()
if not html_created:
print("Failed to create HTML file")
return
print("Launching browser...")
async with async_playwright() as p:
# Launch the browser with more debugging options
browser = await p.chromium.launch(
headless=True, # True for production, False for debugging
)
# Create a new page for letter size paper
page = await browser.new_page(
viewport={"width": 816, "height": 1056}, # 8.5in x 11in at 96dpi
device_scale_factor=CONFIG["device_scale"]
)
# Get absolute path to HTML file
html_path = Path(CONFIG["temp_html"]).absolute()
html_uri = f"file://{html_path}"
print(f"Navigating to: {html_uri}")
# Add event listeners for console messages and errors
page.on("console", lambda msg: print(f"Browser console: {msg.text}"))
page.on("pageerror", lambda err: print(f"Browser page error: {err}"))
# Navigate with longer timeout and wait for network idle
await page.goto(html_uri, wait_until="networkidle", timeout=30000)
# Wait for React to render
await page.wait_for_timeout(CONFIG["wait_time"])
# Add a check to ensure the component rendered
element_count = await page.evaluate("""() => {
const root = document.getElementById('root');
return root.childElementCount;
}""")
if element_count == 0:
print("Warning: No elements found in root. Component may not have rendered.")
else:
print(f"Found {element_count} elements in root. Component rendered successfully.")
# Save debug screenshot
if CONFIG["debug"]:
await page.screenshot(path="debug-screenshot.png")
print("Debug screenshot saved")
# Generate PDF
print("Generating PDF...")
await page.pdf(
path=CONFIG["output_pdf"],
format="Letter",
print_background=True,
margin={"top": "0", "right": "0", "bottom": "0", "left": "0"}
)
print(f"PDF generated successfully: {CONFIG['output_pdf']}")
# Close the browser
await browser.close()
# Cleanup temp files if not in debug mode
if not CONFIG["debug"] and Path(CONFIG["temp_html"]).exists():
Path(CONFIG["temp_html"]).unlink()
print("Temporary HTML file removed")
except Exception as e:
print(f"Error generating PDF: {e}")
if __name__ == "__main__":
# Run the async function
try:
asyncio.run(render_to_pdf())
except Exception as e:
print(f"Fatal error: {e}")
import traceback
traceback.print_exc()

View File

@@ -0,0 +1,214 @@
//import React from 'react';
const BookPageTemplate = () => {
// Only three state variables as requested
const [title, setTitle] = React.useState("ADVENTURES OF DON QUIXOTE");
const [pageNumber, setPageNumber] = React.useState("289");
const [text, setText] = React.useState(
"deed,\" said Don Quixote, \"thou hast hit the point, Sancho, which can alone shake my resolution; I neither can, nor ought to, draw my sword, as I have often told thee, against those who are not dubbed knights. To thee which I had premeditated, thy share of the booty would have been at least the emperor's crown of gold and Cupid's painted wings; for I would have plucked them off perforce, and delivered them into thy hands.\" \"The"
);
// Styles for heavily degraded scan effect
const heavilyDegradedStyles = {
filter: 'grayscale(30%) contrast(120%) brightness(85%) sepia(20%)',
position: 'relative',
backgroundColor: '#e6ddc6', // More yellowed aged paper
backgroundImage: 'url("data:image/svg+xml,%3Csvg viewBox=\'0 0 200 200\' xmlns=\'http://www.w3.org/2000/svg\'%3E%3Cfilter id=\'noiseFilter\'%3E%3CfeTurbulence type=\'fractalNoise\' baseFrequency=\'0.85\' numOctaves=\'3\' stitchTiles=\'stitch\'/%3E%3C/filter%3E%3Crect width=\'100%25\' height=\'100%25\' filter=\'url(%23noiseFilter)\' opacity=\'0.25\'/%3E%3C/svg%3E")',
boxShadow: 'inset 0 0 70px rgba(0, 0, 0, 0.3), 0 0 5px rgba(0,0,0,0.1)',
padding: '32px',
borderRadius: '2px',
overflow: 'hidden',
transform: 'rotate(0.3deg)', // Slightly askew scan
};
// Heavily degraded text
const badScanTextStyle = {
fontFamily: '"Times New Roman", serif',
letterSpacing: '-0.01em',
wordSpacing: '0.02em',
fontWeight: '500',
color: '#222222',
textShadow: '0 0 1px rgba(0, 0, 0, 0.5)',
transform: 'scale(1.01, 0.99) rotate(-0.4deg)', // Distorted proportions
};
// Random coffee stain effect
const coffeeStain = {
position: 'absolute',
width: '100px',
height: '80px',
top: '25%',
right: '15%',
borderRadius: '50%',
background: 'radial-gradient(ellipse at center, rgba(139,69,19,0.15) 0%, rgba(139,69,19,0.1) 50%, rgba(139,69,19,0.05) 70%, rgba(139,69,19,0) 100%)',
transform: 'rotate(30deg) scale(1.5, 1)',
pointerEvents: 'none',
zIndex: 1,
};
// Water damage effect
const waterDamage = {
position: 'absolute',
width: '70%',
height: '40%',
bottom: '10%',
left: '5%',
opacity: 0.07,
background: 'radial-gradient(ellipse at center, rgba(0,0,0,0.2) 0%, rgba(0,0,0,0.1) 40%, rgba(0,0,0,0) 70%)',
borderRadius: '40% 60% 70% 30% / 40% 50% 60% 50%',
pointerEvents: 'none',
zIndex: 1,
};
// Add fold lines
const foldLine = {
position: 'absolute',
width: '100%',
height: '3px',
top: '30%',
left: 0,
background: 'linear-gradient(to right, rgba(0,0,0,0) 0%, rgba(0,0,0,0.03) 20%, rgba(0,0,0,0.08) 50%, rgba(0,0,0,0.03) 80%, rgba(0,0,0,0) 100%)',
boxShadow: '0 1px 3px rgba(255,255,255,0.2)',
pointerEvents: 'none',
zIndex: 2,
};
// Torn edge effect
const tornEdge = {
position: 'absolute',
top: 0,
right: 0,
width: '100%',
height: '100%',
background: 'linear-gradient(135deg, transparent 97%, #e6ddc6 97%, #e6ddc6 100%)',
pointerEvents: 'none',
};
return (
<div style={{
maxWidth: '800px',
margin: '0 auto',
padding: '16px',
}}>
{/* Heavily degraded scan container */}
<div style={heavilyDegradedStyles}>
{/* Noise overlay */}
<div style={{
position: 'absolute',
top: 0,
left: 0,
right: 0,
bottom: 0,
background: 'url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAKpklEQVR4Xu2di3IbOQxD3f//6aTJJLF3vSRAAiTlvWy7lUSAD1KWc97b8/m8f7/+2xZg27fs/P/LvzClv+f77Hfz79eTP+pv/5ZlmPKZfZYp7eOsU8rrQ9fQ/r5+P/s7+/2M7lO+67kTvZfnqx4zXXtcz5To/TwZj2Uxn+FiJiDCPzecjXcEh30/gokAYvSeCVu0OaNrtV5F4I9jiAILu5AZYs8QiExIRZkRYFjKIgFUCsT0rH5EdM5oBUaRr8KnUgaNKzRfARkFRBlltQKr32OATwmp0hXTHxINSkkRSCzNZQmhnWyVnvmzwAqIrQr8AYgJwWz3smW9K0OxXTQTLhaQlQJZwmKKzIwtqqiVRVbCVS1ORpSZQbKCygLIErKVoiNZVT5eAcYEfaW41XQ1c31WAFZKZBVn5aQjpwb0mRJPCKkCiFKrUmL0PBGK1aFZ0XpCsb5SoROQGQBzRUaAMwavFJEZOlOwTNGjPK+EpVK2CjsGbDTXzgBW5RiZgaJ3VAc/U9RKkVjQTu7AZopdpVOVrmaUULGGBZClsRmFKtdWPYehMKk4Sksq0KuAK4WLSsmUORXDUlWXNX72OgZkbgADDDs22xGz7ytFZ9/HpKgUQkXhDMJnQihWqB1v9RlGx+VnMZRGimYO0qpQZsCyXaCFmqUHdn71OkaACOSsV6sC9qQQjpQzy+UM+aofYIXY0hDr3Uzg2S5mdF5e7+LQlVGl3E7KovLs9qoCFUK+otK7HZdRBstiTBGrgqzKrgjwSLlVSp1R8F36mik2C/hVYRdUvTtKkMYE2Z03rXw+9lPVWUrBS5TF0lFEhUwZ2WeZ4lQtpIUuZkBZhWaK04HK8s0sfTPFV8I+C2JViFXaOALEKB0pwcnOZDtHCa16nC3oah2Y8bKFnwlp1YpZJTtSOgPwhNKXC/yRUNVCZYqsqJQpdAc2o0ymWKrrxwrFgMwKDvvuLPVlBr+eY1WFUZS0o5+5S2GZwpVCzJQVFYhZKhUguZTFvr9S/Gq1qgylunZWObtSYpW6WOV4Zyy5lFU5JqPQrKqx37Pdzxbqbjo8SXMdmLOiUSk+UzgWuLlJPFNQpjzM2NXrGJDRsxlgrBVkSlQZpVJ0dp9ZsFW1WSmJgtGZqzrJnN7TrkpZlTHYztgBrPqeKRtTyAxIloKq65gLgA7Q3LBZ8ZcM/JfkJwDtKp4lA/99dZeOVoW+Sl1Z37JSFsvCEVAMRfNzqBP4jtIzBWJKrXb4TCksbTJAWdAiFMd0xyrOCVVVIClXUEzxo7L/dAR3UlNluBmQs8DqAOksyugeK5SrwJyJrS7Q3ABVt1vLTzMbHaU4tvuYMHagd471hEGrIBxV1NlcJ38ixNdSvQyWrFjAWYEaOhJjCsAqxsq5GUgzUCIU0Xt2+5eZXJUrwEpJmRBUVbdS0soJKoGqFmulBOV7suCvamDKnO0Bsi2R4QQeS0dq1WUVZKVEWcGqFnrVrph9TtN6FVSdwCrDVgqYpasjQFmLW6W0Wd9jO1dVthN0m52hYjuT/Z05aUdx5P0ZZd1jl84Cq65Rdh9TEhPk0B2ZYquKzWb8UegYU1U5nSm3U1k50aqm8NF8JUBYoLuXlhLEDJBWK2an4qyCdYTFFGp2PbJSklJAVCBnRYftbjWNR0Bm/cQpO7wdFKVDlZJUYO1CzXbo7O5mAl9V2syYXbhM5z0dWFUgrVAi291ZGqkEGF1z6uDkDn5mvFnqYcH4boecpQGWmzv3VB2jzL6vW2lWlXl1JZXdW7HqXgmlKlgMXUyJKiGKnMcoTWlSpbDZ96pAsOszR2R0ZAKv5nLmvdmO7ij3cUZYoUSWMthOYvJgdlCpV0UZA4y9SHJngcsJPyOXdO+t3jZ3KOgIO6kkdhhRVTu2AKptOKsyLZGw/JkJKkt9lRKdGpbthsrALJ1WjqUUXXXc3wHx6CpO5z6xM6YdBa+MxCprBmSHljrCVr1OUhVb/KqdxHR36iKuqpBVAJjQDuUhQWZVvFLE7G6kAtZqQVZCUFWSI4UiQFUKrQCWGTFTTpdCmXJm/iqJpxT2SBhPujPpXFzO0JzOq+ZOQHZS00zJMmOp1PNdqFkRnAk3qtbKcdrS01BFy6pWq+qOoVJkZoioILB01tmJrNJGBlLWrYtQrSgvU/Lqe1Xlnr5O6aQvluIYVQ/hjYJpFJBVvlUKzBQhcnIGEAuWSndRoFl6iypY5iqr8m/lhAhAFZBZWM7uFjrXZwuUKdGb5V7yI9VbHOyAplU7hxm+cp7ZBWWFQlSDzqgm25Gz76v616yTGfZk77FUlcx+GgZgZVz2HNN5CmKWypUDsiqwclalhJnTuPTELjJnO4p9dpailDGrRVFVaWawrrJUu3KF6pkyrISm6nMYEI9XVzuH5lSlKFrZGKvKYbteFZ+OMXYh9WYH/LHVM3BVA1e7r1rI6HXmAKzyRulH8bE1Tk8/yUxR7LM6VKCEF1WJrNBkipQJewVOJqQu0FnaZIWD7fIV5Tr/Vnql8Oy1sxTXVL2OroBjBqpaVNbROvexVYs5eyqKIU8FUlQcT9OWokyW0pmyqxVYpbU7FCWnl52WfqdqrkCsgMiqyumTTNV1R/nOSY87HbMKnQktC+g7I3VepVnbxFLiTiVlC6IKohKWqmpXwGALwnY3y9lZ2sgU74R6UjkYoEMFzQJydJ1SXSPadXaWiZHiZ+9nPuFrB8/Q0ExYjJKrjrQSqlJOlbKYkpEVGJBPwl6V6aFJZUyZ8VVPdHU4gBmUrYcKhC683cBmlK6EzhTUXXCsqKhAYnQfXt92/hy7UuDs2VUPwXZXB/BqIWeAZiCxnXbiYC5blKpvceYqBWAGYjuJKVS1ECrESmGnZdcpOmwlK0OehI9SAGYMFrAd51SLslLGDohq8WZ0nXl9q6jrpCY7kUYCxXKXKgRK0FW6ygTUVbzTKcZxOprB71JIR0GzHlplXpaO3lScr1RYtgD3NSwdMQCYMB4/l56lplOPxoxeUdqJA1ULnaXOanG7lFlRODPuzHc9jnxiFbLDAez1bv9QxlTXX81pLH2x/nI8l52S3v09ZQZaZVD2OpvDnWmuQlMJpgpStctWKWQEULkC60CvHHeaUpYK3G7/YGkuc0xXuSvQVqiLCeFMiGUBcBrgjgGjwFn9SZidoToBZRWYKS+bLxP42fMNFXxnHq5c3gClqnRKmahIVNVhhXTZnJmwMwEpZTsFRAFktTDsOqbQ7HeZwpxQ3ErZ7fSljFdV6Uw5qsaQKXMmdFagmELspr0lUYeCywLCBJ0FgBlYLYSiXBYY5QdCK6NSfcXQ4fMfuVZXYZ3AZemxMyhLZWrqUxUyC9BxL7NSIgWwSqmqwrM0lLU0pgRMaZiCd1KWuvZMOCrAMmEzYXeAejxtS0FQHZdVPJUyVa5nKYdVrZnAnNJ5FUgK9C7crJh1AIooMqPyI9mwO/bLKXMoaFVaUp2/Sl1K+mLBYym
pe2dT7e7KJ7FrKuVXlNZJb53GU22YDvUwIyp3gCoFzAydxS/rxu0aJqwqPVaC7N4/VvRUgdYB8Xo+u8nMDMUowexmzFn/OCnmaBFZwF4OXKFMpqDZLmKdxE7ZXQW6C3aFMqN7X+/3/QcB/G0D8kclnwAAAABJRU5ErkJggg==") repeat',
opacity: 0.15,
pointerEvents: 'none',
}}></div>
{/* Scan lines effect */}
<div style={{
position: 'absolute',
top: 0,
left: 0,
right: 0,
bottom: 0,
background: 'linear-gradient(to bottom, rgba(0,0,0,0.03) 1px, transparent 1px)',
backgroundSize: '100% 2px',
opacity: 0.5,
pointerEvents: 'none',
}}></div>
{/* Add coffee stain */}
<div style={coffeeStain}></div>
{/* Add water damage */}
<div style={waterDamage}></div>
{/* Add fold line */}
<div style={foldLine}></div>
{/* Add torn edge */}
<div style={tornEdge}></div>
{/* Header with skewed alignment */}
<div style={{
display: 'flex',
justifyContent: 'space-between',
alignItems: 'center',
borderBottom: '2px solid #000',
paddingBottom: '4px',
marginBottom: '24px',
position: 'relative',
opacity: 0.8,
transform: 'skew(-0.5deg, 0.3deg)',
}}>
<div style={{width: '48px'}}></div>
<h1 style={{
...badScanTextStyle,
fontSize: '20px',
fontWeight: 'bold',
textAlign: 'center',
textTransform: 'uppercase',
letterSpacing: '1px',
opacity: 0.8,
}}>{title}</h1>
<div style={{
...badScanTextStyle,
fontSize: '20px',
fontWeight: 'bold',
opacity: 0.85,
}}>{pageNumber}</div>
</div>
{/* Horizontal divider with uneven quality */}
<div style={{
borderBottom: '1px solid #444',
marginBottom: '24px',
opacity: 0.6,
filter: 'blur(0.3px)',
transform: 'scaleY(1.5) skew(0.7deg)',
}}></div>
{/* Text content with severely degraded appearance */}
<div style={{
columnCount: 2,
columnGap: '20px',
columnRule: '1px solid rgba(0,0,0,0.1)',
textAlign: 'justify',
...badScanTextStyle,
fontSize: '16px',
lineHeight: '1.5',
opacity: 0.78,
// Very uneven ink distribution with blurry and faded parts
WebkitMaskImage: 'linear-gradient(to bottom, rgba(0,0,0,0.9), rgba(0,0,0,0.75) 50%, rgba(0,0,0,0.85))',
// Text distortion
filter: 'blur(0.2px)',
}}>
{/* Bad scan text with random character fading */}
<p>{text.split('').map((char, index) => {
const opacity = Math.random() > 0.8 ? 0.4 + Math.random() * 0.5 : 0.9 + Math.random() * 0.1;
const blur = Math.random() > 0.95 ? 1 : 0;
return <span key={index} style={{opacity, filter: `blur(${blur}px)`}}>{char}</span>;
})}</p>
</div>
{/* Extra random ink spill */}
<div style={{
position: 'absolute',
width: '10px',
height: '20px',
top: '60%',
left: '25%',
background: 'rgba(0,0,0,0.3)',
borderRadius: '50%',
transform: 'rotate(30deg)',
filter: 'blur(1px)',
zIndex: 3,
}}></div>
</div>
</div>
);
};
//export default BookPageTemplate;
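// Expose the component as a browser global so the Babel <script type="text/babel"> block built by the render script can mount it with ReactDOM.render.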
window.BookPageTemplate = BookPageTemplate;

View File

@ -0,0 +1,83 @@
//import React from 'react';
const PermitGuidelinesTemplate = () => {
// Sample data - you can replace these with your own
const guidelineItems = [
{
number: 'iii.',
content: 'Not rely on personal preference or opinion, or regional interpretation of statute, regulation or guidance that is inconsistent with the Department\'s statewide interpretation. Staff should confer with the appropriate Bureau Director as necessary.'
},
{
number: 'iv.',
content: 'Process technically adequate and scientifically sound applications for final approval to minimize elapsed time in accordance with the Permit Decision Guarantee.'
},
{
number: 'v.',
content: 'Where the Application Manager determines that the technical information submitted with the application does not meet technical guidance or standards published by the Department, the application must provide the scientific or engineering basis to support the application. Note that deviations from technical guidance can generally be approved, by the appropriate section chief and manager, when warranted, provided acceptable justification has been submitted. Minor deficiencies that can be easily corrected should be addressed through a telephone call with the applicant and consultant, and may negate the need for a deficiency letter. The Program Manager or District Manager will be responsible for making that decision.'
},
{
number: 'vi.',
content: 'If an application fails to provide the technical information necessary to document that applicable regulatory and statutory requirements will be achieved, it is technically deficient and the Application Manager will prepare a technical deficiency letter. Again, all deficiencies noted must cite the statutory or regulatory obligation that the application has failed to meet and the Section Chief and the Program Manager will routinely review these letters. For District Oil and Gas Offices and District Mining Offices the Permits Chief and the Manager will review the letters.'
},
{
number: 'vii.',
content: 'Applicant responses that do not make the application technically adequate within the established response timeframe will be subject to the Elevated Review Process below. Applications that are made technically adequate within the established response timeframe will proceed to processing for final action.'
}
];
// Footnote data
const footnote = {
number: '2',
content: 'More technically complex projects and applications may receive additional deficiency letters as appropriate prior to a decision point. This exception will not void inclusion in the Permit Decision Guarantee and will follow program specific guidance that is developed. The more technically complex projects and applications are noted with an asterisk ("*") in Appendix A.'
};
// Document info
const documentInfo = "021-2100-001 / November 2, 2012 / Page 11";
// Special note about technical deficiency letter
const technicalDeficiencyNote = {
prefix: 'One',
superscript: '2',
content: ' technical deficiency letter will be sent. Each deficiency cited must note the statute, regulation or technical guidance provision. Technical guidance provides a means to compliance, but may not be used or cited when issuing a permit denial. The letter will state, as necessary, that the Permit Decision Guarantee is no longer applicable and offer the applicant an opportunity to meet and discuss the deficiencies. The letter will include a deadline for submission of the deficient information.'
};
return (
<div className="bg-white p-8 max-w-4xl mx-auto font-serif text-black">
<div className="mb-8">
{guidelineItems.map((item, index) => (
<div key={index} className="mb-6 flex">
<div className="w-12 flex-shrink-0 font-bold">{item.number}</div>
<div className="flex-grow">{item.content}</div>
</div>
))}
{/* Technical deficiency letter note */}
<div className="mb-6 ml-12">
<p>
{technicalDeficiencyNote.prefix}
<sup>{technicalDeficiencyNote.superscript}</sup>
{technicalDeficiencyNote.content}
</p>
</div>
</div>
{/* Horizontal line */}
<div className="border-t border-gray-400 my-6"></div>
{/* Footnote section */}
<div className="text-sm">
<p>
<sup>{footnote.number}</sup> {footnote.content}
</p>
</div>
{/* Document info */}
<div className="text-center mt-6 text-sm">
{documentInfo}
</div>
</div>
);
};
//export default PermitGuidelinesTemplate;
window.BookPageTemplate = PermitGuidelinesTemplate;

View File

@@ -1,16 +1,112 @@
import json
import re
import numpy as np
from bs4 import BeautifulSoup
from dataclasses import asdict, dataclass
from enum import Enum
from typing import List, Optional, Tuple
from typing import List, Optional, Tuple, Dict, Any
from fuzzysearch import find_near_matches
from rapidfuzz import fuzz
def parse_markdown_tables(md_content: str) -> List[np.ndarray]:
"""
Extract and parse all markdown tables from the provided content.
Args:
md_content: The markdown content containing tables
Returns:
A list of numpy arrays, each representing a parsed table
"""
# Extract all tables from markdown
# The separator row may include spaces around the dashes/colons (e.g. "| --- |") as well as the bare "|---|" form
table_pattern = r'(\|(?:[^|]*\|)+)\s*\n\|(?:\s*[:-]+\s*\|)+\s*\n((?:\|(?:[^|]*\|)+\s*\n)+)'
table_matches = re.finditer(table_pattern, md_content)
parsed_tables = []
for table_match in table_matches:
# Extract header and body from the table match
header_row = table_match.group(1).strip()
body_rows = table_match.group(2).strip().split('\n')
# Process the header row and each body row, stripping the leading/trailing '|' from every line
table_data = []
for row in [header_row] + body_rows:
if '|' not in row:  # Skip blank or malformed lines
continue
cells = [cell.strip() for cell in row.split('|')]
if cells[0] == '':
cells = cells[1:]
if cells[-1] == '':
cells = cells[:-1]
table_data.append(cells)
# Defensively drop a separator-style row (cells made up only of dashes, colons, and spaces) if one slipped through
if len(table_data) > 1 and all(cell and set(cell) <= set('-: ') for cell in table_data[1]):
table_data = [table_data[0]] + table_data[2:]
# Convert to numpy array for easier manipulation
# First ensure all rows have the same number of columns by padding if necessary
max_cols = max(len(row) for row in table_data)
padded_data = [row + [''] * (max_cols - len(row)) for row in table_data]
table_array = np.array(padded_data)
parsed_tables.append(table_array)
return parsed_tables
def parse_html_tables(html_content: str) -> List[np.ndarray]:
"""
Extract and parse all HTML tables from the provided content.
Args:
html_content: The HTML content containing tables
Returns:
A list of numpy arrays, each representing a parsed table
"""
soup = BeautifulSoup(html_content, 'html.parser')
tables = soup.find_all('table')
parsed_tables = []
for table in tables:
rows = table.find_all(['tr'])
table_data = []
for row in rows:
cells = row.find_all(['th', 'td'])
row_data = [cell.get_text().strip() for cell in cells]
table_data.append(row_data)
# Ensure all rows have the same number of columns
if table_data:
max_cols = max(len(row) for row in table_data)
padded_data = [row + [''] * (max_cols - len(row)) for row in table_data]
table_array = np.array(padded_data)
parsed_tables.append(table_array)
return parsed_tables
class TestType(str, Enum):
PRESENT = "present"
ABSENT = "absent"
ORDER = "order"
TABLE = "table"
class TestChecked(str, Enum):
@@ -41,18 +137,16 @@ class BasePDFTest:
page: int
id: str
type: str
threshold: float = 1.0
max_diffs: int = 0
checked: Optional[TestChecked] = None
def __post_init__(self):
self.threshold = float(self.threshold)
if not self.pdf:
raise ValidationError("PDF filename cannot be empty")
if not self.id:
raise ValidationError("Test ID cannot be empty")
if not isinstance(self.threshold, float) or not (0 <= self.threshold <= 1):
raise ValidationError(f"Threshold must be a float between 0 and 1, got {self.threshold}")
if not isinstance(self.max_diffs, int) or self.max_diffs < 0:
raise ValidationError(f"Max diffs must be positive number or 0")
if self.type not in {t.value for t in TestType}:
raise ValidationError(f"Invalid test type: {self.type}")
@@ -90,7 +184,8 @@ class TextPresenceTest(BasePDFTest):
def run(self, md_content: str) -> Tuple[bool, str]:
reference_query = self.text
threshold = self.threshold
# Threshold for fuzzy matching derived from max_diffs
threshold = 1.0 - (self.max_diffs / (len(reference_query) if len(reference_query) > 0 else 1))
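# For example, max_diffs=2 against a 20-character reference query gives a matching threshold of 1.0 - 2/20 = 0.90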
best_ratio = fuzz.partial_ratio(reference_query, md_content) / 100.0
if self.type == TestType.PRESENT.value:
@@ -130,15 +225,13 @@ class TextOrderTest(BasePDFTest):
raise ValidationError("After field cannot be empty")
def run(self, md_content: str) -> Tuple[bool, str]:
threshold = self.threshold
max_l_dist = round((1.0 - threshold) * len(self.before))
before_matches = find_near_matches(self.before, md_content, max_l_dist=max_l_dist)
after_matches = find_near_matches(self.after, md_content, max_l_dist=max_l_dist)
before_matches = find_near_matches(self.before, md_content, max_l_dist=self.max_diffs)
after_matches = find_near_matches(self.after, md_content, max_l_dist=self.max_diffs)
if not before_matches:
return False, f"'before' text '{self.before[:40]}...' not found with max_l_dist {max_l_dist}"
return False, f"'before' text '{self.before[:40]}...' not found with max_l_dist {self.max_diffs}"
if not after_matches:
return False, f"'after' text '{self.after[:40]}...' not found with max_l_dist {max_l_dist}"
return False, f"'after' text '{self.after[:40]}...' not found with max_l_dist {self.max_diffs}"
for before_match in before_matches:
for after_match in after_matches:
@@ -147,6 +240,167 @@ class TextOrderTest(BasePDFTest):
return False, (f"Could not find a location where '{self.before[:40]}...' appears before " f"'{self.after[:40]}...'.")
@dataclass
class TableTest(BasePDFTest):
"""
Test that verifies positional relationships within a table: a target cell must exist, and, optionally, its neighboring cells and headings must match the expected strings
"""
# This is the target cell, which must exist in at least one place in the table
cell: str
# These properties assert that the cell immediately above/below/left/right of the target cell matches the specified string
up: str = ""
down: str = ""
left: str = ""
right: str = ""
# These properties assert that the topmost non-empty cell in the target cell's column (top_heading) or the leftmost non-empty cell in its row (left_heading), e.g. a table heading, matches the specified string
top_heading: str = ""
left_heading: str = ""
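# An illustrative JSONL entry for this test type (the field values here are hypothetical):
# {"pdf": "report.pdf", "page": 1, "id": "table_01", "type": "table", "cell": "Total", "right": "42", "top_heading": "Amount"}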
def __post_init__(self):
super().__post_init__()
if self.type != TestType.TABLE.value:
raise ValidationError(f"Invalid type for TableTest: {self.type}")
def run(self, content: str) -> Tuple[bool, str]:
"""
Run the table test on provided content.
Finds all tables (both markdown and HTML) and checks whether any cell
matches the target cell and satisfies the specified relationships.
Args:
content: The content containing tables (markdown or HTML)
Returns:
A tuple (passed, explanation) where 'passed' is True if the test passes,
and 'explanation' provides details when the test fails.
"""
# Initialize variables to track tables and results
tables_to_check = []
failed_reasons = []
# Threshold for fuzzy matching derived from max_diffs
threshold = 1.0 - (self.max_diffs / (len(self.cell) if len(self.cell) > 0 else 1))
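# For example, max_diffs=1 with a 10-character target cell requires a fuzz.ratio of at least 0.90 for a cell to count as a match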
# Parse both markdown and HTML tables from the content
md_tables = parse_markdown_tables(content)
tables_to_check.extend(md_tables)
html_tables = parse_html_tables(content)
tables_to_check.extend(html_tables)
# If no tables found, return failure
if not tables_to_check:
return False, f"No tables found in the content at all"
# Check each table
for table_array in tables_to_check:
# Find all cells that match the target cell using fuzzy matching
matches = []
for i in range(table_array.shape[0]):
for j in range(table_array.shape[1]):
cell_content = table_array[i, j]
similarity = fuzz.ratio(self.cell, cell_content) / 100.0
if similarity >= threshold:
matches.append((i, j))
# If no matches found in this table, continue to the next table
if not matches:
continue
# Check the relationships for each matching cell
for row_idx, col_idx in matches:
all_relationships_satisfied = True
current_failed_reasons = []
# Check up relationship (a match in the top row has no cell above, so the check fails)
if self.up:
up_cell = table_array[row_idx - 1, col_idx] if row_idx > 0 else ""
up_similarity = fuzz.ratio(self.up, up_cell) / 100.0
if up_similarity < threshold:
all_relationships_satisfied = False
current_failed_reasons.append(f"Cell above '{up_cell}' doesn't match expected '{self.up}' (similarity: {up_similarity:.2f})")
# Check down relationship (a match in the bottom row has no cell below, so the check fails)
if self.down:
down_cell = table_array[row_idx + 1, col_idx] if row_idx < table_array.shape[0] - 1 else ""
down_similarity = fuzz.ratio(self.down, down_cell) / 100.0
if down_similarity < threshold:
all_relationships_satisfied = False
current_failed_reasons.append(f"Cell below '{down_cell}' doesn't match expected '{self.down}' (similarity: {down_similarity:.2f})")
# Check left relationship (a match in the first column has no cell to its left, so the check fails)
if self.left:
left_cell = table_array[row_idx, col_idx - 1] if col_idx > 0 else ""
left_similarity = fuzz.ratio(self.left, left_cell) / 100.0
if left_similarity < threshold:
all_relationships_satisfied = False
current_failed_reasons.append(f"Cell to the left '{left_cell}' doesn't match expected '{self.left}' (similarity: {left_similarity:.2f})")
# Check right relationship (a match in the last column has no cell to its right, so the check fails)
if self.right:
right_cell = table_array[row_idx, col_idx + 1] if col_idx < table_array.shape[1] - 1 else ""
right_similarity = fuzz.ratio(self.right, right_cell) / 100.0
if right_similarity < threshold:
all_relationships_satisfied = False
current_failed_reasons.append(f"Cell to the right '{right_cell}' doesn't match expected '{self.right}' (similarity: {right_similarity:.2f})")
# Check top heading relationship
if self.top_heading:
# Find the first non-empty cell in the same column (starting from the top)
top_heading_cell = ""
for i in range(row_idx):
if table_array[i, col_idx].strip():
top_heading_cell = table_array[i, col_idx]
break
if not top_heading_cell:
all_relationships_satisfied = False
current_failed_reasons.append(f"No non-empty top heading found in column {col_idx}")
else:
top_similarity = fuzz.ratio(self.top_heading, top_heading_cell) / 100.0
if top_similarity < threshold:
all_relationships_satisfied = False
current_failed_reasons.append(f"Top heading '{top_heading_cell}' doesn't match expected '{self.top_heading}' (similarity: {top_similarity:.2f})")
# Check left heading relationship
if self.left_heading:
# Find the first non-empty cell in the same row (starting from the left)
left_heading_cell = ""
for j in range(col_idx):
if table_array[row_idx, j].strip():
left_heading_cell = table_array[row_idx, j]
break
if not left_heading_cell:
all_relationships_satisfied = False
current_failed_reasons.append(f"No non-empty left heading found in row {row_idx}")
else:
left_heading_similarity = fuzz.ratio(self.left_heading, left_heading_cell) / 100.0
if left_heading_similarity < threshold:
all_relationships_satisfied = False
current_failed_reasons.append(f"Left heading '{left_heading_cell}' doesn't match expected '{self.left_heading}' (similarity: {left_heading_similarity:.2f})")
# If all relationships are satisfied for this cell, the test passes
if all_relationships_satisfied:
return True, ""
else:
failed_reasons.extend(current_failed_reasons)
# If we've gone through all tables and all matching cells and none satisfied all relationships
if not failed_reasons:
return False, f"No cell matching '{self.cell}' found in any table with threshold {threshold}"
else:
return False, f"Found cells matching '{self.cell}' but relationships were not satisfied: {'; '.join(failed_reasons)}"
def load_tests(jsonl_file: str) -> List[BasePDFTest]:
"""
Load tests from a JSONL file.
@@ -171,6 +425,8 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
test = TextPresenceTest(**data)
elif test_type == TestType.ORDER.value:
test = TextOrderTest(**data)
elif test_type == TestType.TABLE.value:
test = TableTest(**data)
else:
raise ValidationError(f"Unknown test type: {test_type}")
@@ -195,4 +451,4 @@ def save_tests(tests: List[BasePDFTest], jsonl_file: str) -> None:
"""
with open(jsonl_file, "w") as file:
for test in tests:
file.write(json.dumps(asdict(test)) + "\n")
file.write(json.dumps(asdict(test)) + "\n")

View File

@@ -643,7 +643,7 @@ async def sglang_server_ready():
else:
logger.info(f"Attempt {attempt}: Unexpected status code {response.status_code}")
except Exception as e:
logger.warning(f"Attempt {attempt}: {e}")
logger.warning(f"Attempt {attempt}: Please wait for sglang server to become ready...")
await asyncio.sleep(delay_sec)