mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-02 11:04:25 +00:00
Synth data fixups
This commit is contained in:
parent
2400744673
commit
e3e09c04db
@ -366,7 +366,7 @@ async def generate_html_from_image(client, image_base64):
|
||||
"4. Render any math equations and Latex inline using either \\[ \\] or \\( \\) delimeters.\n"
|
||||
"5. CRITICAL: If the document has a multi-column layout, you MUST preserve the exact same number of columns in your HTML. Use CSS flexbox or grid to create the columns.\n"
|
||||
"6. Focus on creating valid, accessible HTML that preserves the appearance and formatting of the original page as closely as possible.\n"
|
||||
f"7. The webpage will be viewed with a fixed viewport size of {png_width // 2} pixels wide by {png_height // 2} pixels tall.\n\n"
|
||||
f"7. The webpage will be viewed with a fixed viewport size of {png_width // 2} pixels wide by {png_height // 2} pixels tall. You can add @page and @media print css styles to make the printed version match the original document.\n\n"
|
||||
"8. For multi-column layouts, use explicit CSS. The most important aspect is preserving the column structure of the original document - this is critical.\n\n"
|
||||
"Enclose your HTML in a ```html code block.",
|
||||
},
|
||||
@ -1226,6 +1226,7 @@ async def main():
|
||||
print(f"Found {len(pdf_paths)} PDF paths in input list")
|
||||
|
||||
# Shuffle and limit to max_tests
|
||||
random.seed(42)
|
||||
random.shuffle(pdf_paths)
|
||||
pdf_paths = pdf_paths[: args.max_tests]
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user