Cleaning up anchor text to deal with abnormally long lines

This commit is contained in:
Jake Poznanski 2024-10-09 16:29:20 +00:00
parent b6b74b7832
commit dc6440d068
3 changed files with 43 additions and 9 deletions

View File

@ -9,7 +9,7 @@
# coherency score best of these three # coherency score best of these three
import subprocess import subprocess
import math import re
import ftfy import ftfy
from dataclasses import dataclass from dataclasses import dataclass
from typing import Literal, List from typing import Literal, List
@ -219,10 +219,38 @@ def _merge_image_elements(images: List[ImageElement], tolerance: float=0.5) -> L
# Return the merged images along with other elements # Return the merged images along with other elements
return merged_images return merged_images
def _cap_split_string(text: str, max_length: int) -> str:
if len(text) <= max_length:
return text
head_length = max_length // 2 - 3
tail_length = head_length
head = text[:head_length].rsplit(' ', 1)[0] or text[:head_length]
tail = text[-tail_length:].split(' ', 1)[-1] or text[-tail_length:]
return f"{head} ... {tail}"
def _cleanup_element_text(element_text: str) -> str:
    """Normalize one text element's content for inclusion in anchor text.

    The text is passed through ``ftfy.fix_text``, stripped of surrounding
    whitespace, has square brackets and newline/carriage-return/tab
    characters replaced by backslash-escaped forms, and is finally capped
    at 250 characters via ``_cap_split_string``.

    Args:
        element_text: Raw text of the element.

    Returns:
        The cleaned, escaped and length-capped text.
    """
    max_text_element_length = 250

    # Single-character -> escaped-sequence table; translate() applies all
    # substitutions in one pass, so inserted backslashes are never re-escaped.
    escape_table = str.maketrans({
        "[": "\\[",
        "]": "\\]",
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    })

    fixed = ftfy.fix_text(element_text).strip()
    escaped = fixed.translate(escape_table)
    return _cap_split_string(escaped, max_text_element_length)
def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str: def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
result = "" result = ""
result += f"Page dimensions: {report.mediabox.x1:.1f}x{report.mediabox.y1:.1f}\n" result += f"Page dimensions: {report.mediabox.x1:.1f}x{report.mediabox.y1:.1f}\n"
images = _merge_image_elements(report.image_elements) images = _merge_image_elements(report.image_elements)
@ -230,7 +258,7 @@ def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
# Process image elements # Process image elements
image_strings = [] image_strings = []
for element in images: for element in images:
image_str = f"[Image {element.bbox.x0:.0f}x{element.bbox.y0:.0f} to {element.bbox.x1:.0f}x{element.bbox.y1:.0f}]" image_str = f"[Image {element.bbox.x0:.0f}x{element.bbox.y0:.0f} to {element.bbox.x1:.0f}x{element.bbox.y1:.0f}]\n"
# Use element's unique identifier (e.g., id or position) for comparison # Use element's unique identifier (e.g., id or position) for comparison
image_strings.append((element, image_str)) image_strings.append((element, image_str))
@ -239,12 +267,9 @@ def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
for element in report.text_elements: for element in report.text_elements:
if len(element.text.strip()) == 0: if len(element.text.strip()) == 0:
continue continue
element_text = ftfy.fix_text(element.text) element_text = _cleanup_element_text(element.text)
# Replace square brackets with escaped brackets text_str = f"[{element.x:.0f}x{element.y:.0f}]{element_text}\n"
element_text = element_text.replace("[", "\\[").replace("]", "\\]")
text_str = f"[{element.x:.0f}x{element.y:.0f}]{element_text}"
text_strings.append((element, text_str)) text_strings.append((element, text_str))
# Combine all elements with their positions for sorting # Combine all elements with their positions for sorting

Binary file not shown.

View File

@ -84,6 +84,15 @@ class AnchorTest(unittest.TestCase):
print(len(anchor_text)) print(len(anchor_text))
self.assertLess(len(anchor_text), 4000) self.assertLess(len(anchor_text), 4000)
def testLargePromptHint3(self):
    """Anchor text for large_prompt_hint3.pdf (page argument 2) stays under 4000 characters."""
    tests_dir = os.path.dirname(__file__)
    pdf_path = os.path.join(tests_dir, "gnarly_pdfs", "large_prompt_hint3.pdf")

    anchor_text = get_anchor_text(pdf_path, 2, pdf_engine="pdfreport")
    anchor_length = len(anchor_text)

    # Printed so a failing run shows what was actually generated.
    print(anchor_text)
    print(anchor_length)

    self.assertLess(anchor_length, 4000)
def testNewsPaperPromptHint(self): def testNewsPaperPromptHint(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "newspaper.pdf") local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "newspaper.pdf")