mirror of
https://github.com/allenai/olmocr.git
synced 2025-09-09 16:50:02 +00:00
Fix for anchor generation on PDFs with no text elements
This commit is contained in:
parent
af03358c47
commit
aea3f7f1fe
@@ -302,11 +302,12 @@ def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
|
|||||||
|
|
||||||
if report.text_elements:
|
if report.text_elements:
|
||||||
text_elements = [e for e in report.text_elements if len(e.text.strip()) > 0]
|
text_elements = [e for e in report.text_elements if len(e.text.strip()) > 0]
|
||||||
min_x_text = min(text_elements, key=lambda e: e.x)
|
if text_elements:
|
||||||
max_x_text = max(text_elements, key=lambda e: e.x)
|
min_x_text = min(text_elements, key=lambda e: e.x)
|
||||||
min_y_text = min(text_elements, key=lambda e: e.y)
|
max_x_text = max(text_elements, key=lambda e: e.x)
|
||||||
max_y_text = max(text_elements, key=lambda e: e.y)
|
min_y_text = min(text_elements, key=lambda e: e.y)
|
||||||
edge_elements.update([min_x_text, max_x_text, min_y_text, max_y_text])
|
max_y_text = max(text_elements, key=lambda e: e.y)
|
||||||
|
edge_elements.update([min_x_text, max_x_text, min_y_text, max_y_text])
|
||||||
|
|
||||||
# Keep track of element IDs to prevent duplication
|
# Keep track of element IDs to prevent duplication
|
||||||
selected_element_ids = set()
|
selected_element_ids = set()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user