mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-17 10:58:58 +00:00
Synth data fixups
This commit is contained in:
parent
2400744673
commit
e3e09c04db
@@ -366,7 +366,7 @@ async def generate_html_from_image(client, image_base64):
|
|||||||
"4. Render any math equations and Latex inline using either \\[ \\] or \\( \\) delimeters.\n"
|
"4. Render any math equations and Latex inline using either \\[ \\] or \\( \\) delimeters.\n"
|
||||||
"5. CRITICAL: If the document has a multi-column layout, you MUST preserve the exact same number of columns in your HTML. Use CSS flexbox or grid to create the columns.\n"
|
"5. CRITICAL: If the document has a multi-column layout, you MUST preserve the exact same number of columns in your HTML. Use CSS flexbox or grid to create the columns.\n"
|
||||||
"6. Focus on creating valid, accessible HTML that preserves the appearance and formatting of the original page as closely as possible.\n"
|
"6. Focus on creating valid, accessible HTML that preserves the appearance and formatting of the original page as closely as possible.\n"
|
||||||
f"7. The webpage will be viewed with a fixed viewport size of {png_width // 2} pixels wide by {png_height // 2} pixels tall.\n\n"
|
f"7. The webpage will be viewed with a fixed viewport size of {png_width // 2} pixels wide by {png_height // 2} pixels tall. You can add @page and @media print css styles to make the printed version match the original document.\n\n"
|
||||||
"8. For multi-column layouts, use explicit CSS. The most important aspect is preserving the column structure of the original document - this is critical.\n\n"
|
"8. For multi-column layouts, use explicit CSS. The most important aspect is preserving the column structure of the original document - this is critical.\n\n"
|
||||||
"Enclose your HTML in a ```html code block.",
|
"Enclose your HTML in a ```html code block.",
|
||||||
},
|
},
|
||||||
@@ -1226,6 +1226,7 @@ async def main():
|
|||||||
print(f"Found {len(pdf_paths)} PDF paths in input list")
|
print(f"Found {len(pdf_paths)} PDF paths in input list")
|
||||||
|
|
||||||
# Shuffle and limit to max_tests
|
# Shuffle and limit to max_tests
|
||||||
|
random.seed(42)
|
||||||
random.shuffle(pdf_paths)
|
random.shuffle(pdf_paths)
|
||||||
pdf_paths = pdf_paths[: args.max_tests]
|
pdf_paths = pdf_paths[: args.max_tests]
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user