From e3e09c04dbeab4b8ebd9789838e085e012f98eb2 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Tue, 16 Sep 2025 18:47:54 +0000 Subject: [PATCH] Synth data fixups --- olmocr/bench/synth/mine_html_templates.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py index f7816cf..8fc5f52 100644 --- a/olmocr/bench/synth/mine_html_templates.py +++ b/olmocr/bench/synth/mine_html_templates.py @@ -366,7 +366,7 @@ async def generate_html_from_image(client, image_base64): "4. Render any math equations and Latex inline using either \\[ \\] or \\( \\) delimeters.\n" "5. CRITICAL: If the document has a multi-column layout, you MUST preserve the exact same number of columns in your HTML. Use CSS flexbox or grid to create the columns.\n" "6. Focus on creating valid, accessible HTML that preserves the appearance and formatting of the original page as closely as possible.\n" - f"7. The webpage will be viewed with a fixed viewport size of {png_width // 2} pixels wide by {png_height // 2} pixels tall.\n\n" + f"7. The webpage will be viewed with a fixed viewport size of {png_width // 2} pixels wide by {png_height // 2} pixels tall. You can add @page and @media print css styles to make the printed version match the original document.\n\n" "8. For multi-column layouts, use explicit CSS. The most important aspect is preserving the column structure of the original document - this is critical.\n\n" "Enclose your HTML in a ```html code block.", }, @@ -1226,6 +1226,7 @@ async def main(): print(f"Found {len(pdf_paths)} PDF paths in input list") # Shuffle and limit to max_tests + random.seed(42) random.shuffle(pdf_paths) pdf_paths = pdf_paths[: args.max_tests]