mirror of
https://github.com/allenai/olmocr.git
synced 2025-06-27 04:00:02 +00:00
Small adjustments to synthetic data pipeline
This commit is contained in:
parent
678c000685
commit
fb8b23d506
2
.gitignore
vendored
2
.gitignore
vendored
@ -18,6 +18,8 @@ gpt4otestset_output/*
|
|||||||
pdfs/*
|
pdfs/*
|
||||||
olmOCR-bench/*
|
olmOCR-bench/*
|
||||||
table_data*/
|
table_data*/
|
||||||
|
synth*/
|
||||||
|
dolma_samples/*
|
||||||
/*.html
|
/*.html
|
||||||
scoreelo.csv
|
scoreelo.csv
|
||||||
debug.log
|
debug.log
|
||||||
|
@ -54,7 +54,7 @@ def generate_html_from_image(client, image_base64):
|
|||||||
"2. What are the main sections and content types (headings, paragraphs, lists, tables, images, etc.)?\n"
|
"2. What are the main sections and content types (headings, paragraphs, lists, tables, images, etc.)?\n"
|
||||||
"3. Does it have headers, footers, page numbers, or other special elements?\n"
|
"3. Does it have headers, footers, page numbers, or other special elements?\n"
|
||||||
"4. Is there any complex formatting that would be challenging to reproduce in HTML?\n\n"
|
"4. Is there any complex formatting that would be challenging to reproduce in HTML?\n\n"
|
||||||
"Please be very precise about the number of columns and how they're arranged."
|
"Please be very precise about the number of columns and how they're arranged.",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
@ -88,7 +88,7 @@ def generate_html_from_image(client, image_base64):
|
|||||||
"5. Focus on creating valid, accessible HTML that preserves the appearance and formatting of the original page as closely as possible.\n"
|
"5. Focus on creating valid, accessible HTML that preserves the appearance and formatting of the original page as closely as possible.\n"
|
||||||
f"6. The webpage will be viewed with a fixed viewport size of {png_width // 2} pixels wide by {png_height // 2} pixels tall.\n\n"
|
f"6. The webpage will be viewed with a fixed viewport size of {png_width // 2} pixels wide by {png_height // 2} pixels tall.\n\n"
|
||||||
"7. For multi-column layouts, use explicit CSS. The most important aspect is preserving the column structure of the original document - this is critical.\n\n"
|
"7. For multi-column layouts, use explicit CSS. The most important aspect is preserving the column structure of the original document - this is critical.\n\n"
|
||||||
"Enclose your HTML in a ```html code block."
|
"Enclose your HTML in a ```html code block.",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
@ -178,7 +178,7 @@ async def render_pdf_with_playwright(html_content, output_pdf_path, png_width, p
|
|||||||
Returns:
|
Returns:
|
||||||
bool: True if rendering was successful with exactly one page, False otherwise
|
bool: True if rendering was successful with exactly one page, False otherwise
|
||||||
"""
|
"""
|
||||||
scale_factors = [1.0, 0.9, 0.8, 0.7] # Try these scale factors in order
|
scale_factors = [1.0, 0.9, 0.8, 0.7, 0.6, 0.5] # Try these scale factors in order
|
||||||
|
|
||||||
for scale in scale_factors:
|
for scale in scale_factors:
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user