mirror of
https://github.com/allenai/olmocr.git
synced 2025-08-15 04:11:59 +00:00
Cleaning up anchor text to deal with abnormally long lines
This commit is contained in:
parent
b6b74b7832
commit
dc6440d068
@ -9,7 +9,7 @@
|
|||||||
|
|
||||||
# coherency score best of these three
|
# coherency score best of these three
|
||||||
import subprocess
|
import subprocess
|
||||||
import math
|
import re
|
||||||
import ftfy
|
import ftfy
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Literal, List
|
from typing import Literal, List
|
||||||
@ -219,10 +219,38 @@ def _merge_image_elements(images: List[ImageElement], tolerance: float=0.5) -> L
|
|||||||
# Return the merged images along with other elements
|
# Return the merged images along with other elements
|
||||||
return merged_images
|
return merged_images
|
||||||
|
|
||||||
|
def _cap_split_string(text: str, max_length: int) -> str:
|
||||||
|
if len(text) <= max_length:
|
||||||
|
return text
|
||||||
|
|
||||||
|
head_length = max_length // 2 - 3
|
||||||
|
tail_length = head_length
|
||||||
|
|
||||||
|
head = text[:head_length].rsplit(' ', 1)[0] or text[:head_length]
|
||||||
|
tail = text[-tail_length:].split(' ', 1)[-1] or text[-tail_length:]
|
||||||
|
|
||||||
|
return f"{head} ... {tail}"
|
||||||
|
|
||||||
|
def _cleanup_element_text(element_text: str) -> str:
    """Normalize, escape and length-cap the text of a single element.

    The text is passed through ``ftfy.fix_text`` and stripped of surrounding
    whitespace. Square brackets are then backslash-escaped, and newline,
    carriage-return and tab characters are replaced by their two-character
    escape sequences. Finally the result is capped at 250 characters via
    ``_cap_split_string``.
    """
    max_chars = 250

    # Single-character -> replacement-string table, applied in one pass.
    escape_table = str.maketrans(
        {
            "[": "\\[",
            "]": "\\]",
            "\n": "\\n",
            "\r": "\\r",
            "\t": "\\t",
        }
    )

    repaired = ftfy.fix_text(element_text).strip()
    escaped = repaired.translate(escape_table)

    return _cap_split_string(escaped, max_chars)
|
||||||
|
|
||||||
def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
|
def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
|
||||||
result = ""
|
result = ""
|
||||||
|
|
||||||
result += f"Page dimensions: {report.mediabox.x1:.1f}x{report.mediabox.y1:.1f}\n"
|
result += f"Page dimensions: {report.mediabox.x1:.1f}x{report.mediabox.y1:.1f}\n"
|
||||||
|
|
||||||
images = _merge_image_elements(report.image_elements)
|
images = _merge_image_elements(report.image_elements)
|
||||||
@ -230,7 +258,7 @@ def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
|
|||||||
# Process image elements
|
# Process image elements
|
||||||
image_strings = []
|
image_strings = []
|
||||||
for element in images:
|
for element in images:
|
||||||
image_str = f"[Image {element.bbox.x0:.0f}x{element.bbox.y0:.0f} to {element.bbox.x1:.0f}x{element.bbox.y1:.0f}]"
|
image_str = f"[Image {element.bbox.x0:.0f}x{element.bbox.y0:.0f} to {element.bbox.x1:.0f}x{element.bbox.y1:.0f}]\n"
|
||||||
# Use element's unique identifier (e.g., id or position) for comparison
|
# Use element's unique identifier (e.g., id or position) for comparison
|
||||||
image_strings.append((element, image_str))
|
image_strings.append((element, image_str))
|
||||||
|
|
||||||
@ -239,12 +267,9 @@ def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
|
|||||||
for element in report.text_elements:
|
for element in report.text_elements:
|
||||||
if len(element.text.strip()) == 0:
|
if len(element.text.strip()) == 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
element_text = ftfy.fix_text(element.text)
|
element_text = _cleanup_element_text(element.text)
|
||||||
# Replace square brackets with escaped brackets
|
text_str = f"[{element.x:.0f}x{element.y:.0f}]{element_text}\n"
|
||||||
element_text = element_text.replace("[", "\\[").replace("]", "\\]")
|
|
||||||
|
|
||||||
text_str = f"[{element.x:.0f}x{element.y:.0f}]{element_text}"
|
|
||||||
text_strings.append((element, text_str))
|
text_strings.append((element, text_str))
|
||||||
|
|
||||||
# Combine all elements with their positions for sorting
|
# Combine all elements with their positions for sorting
|
||||||
|
BIN
tests/gnarly_pdfs/large_prompt_hint3.pdf
Normal file
BIN
tests/gnarly_pdfs/large_prompt_hint3.pdf
Normal file
Binary file not shown.
@ -84,6 +84,15 @@ class AnchorTest(unittest.TestCase):
|
|||||||
print(len(anchor_text))
|
print(len(anchor_text))
|
||||||
self.assertLess(len(anchor_text), 4000)
|
self.assertLess(len(anchor_text), 4000)
|
||||||
|
|
||||||
|
def testLargePromptHint3(self):
    """Check that the anchor text for large_prompt_hint3.pdf stays under 4000 characters.

    Calls ``get_anchor_text`` with ``2`` as its second argument (presumably
    the page number - confirm against its signature) and the "pdfreport"
    engine.
    """
    tests_dir = os.path.dirname(__file__)
    pdf_path = os.path.join(tests_dir, "gnarly_pdfs", "large_prompt_hint3.pdf")

    hint = get_anchor_text(pdf_path, 2, pdf_engine="pdfreport")
    hint_length = len(hint)

    # Printed so the output can be inspected when the assertion fails.
    print(hint)
    print(hint_length)
    self.assertLess(hint_length, 4000)
|
||||||
|
|
||||||
def testNewsPaperPromptHint(self):
|
def testNewsPaperPromptHint(self):
|
||||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "newspaper.pdf")
|
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "newspaper.pdf")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user