diff --git a/gantry-requirements.txt b/gantry-requirements.txt
index de5f63c..ba8967b 100644
--- a/gantry-requirements.txt
+++ b/gantry-requirements.txt
@@ -33,3 +33,4 @@ omegaconf
 s3fs
 transformers>=4.45.1
 bitsandbytes
+ftfy
diff --git a/pdelfin/prompts/anchor.py b/pdelfin/prompts/anchor.py
index 510fbe2..f0dbe68 100644
--- a/pdelfin/prompts/anchor.py
+++ b/pdelfin/prompts/anchor.py
@@ -11,6 +11,7 @@
 import subprocess
 import sys
 import json
+import ftfy
 from dataclasses import dataclass
 from typing import Literal, List
 
@@ -163,6 +164,8 @@ def _linearize_pdf_report(report: PageReport) -> str:
         if len(element.text.strip()) == 0:
             continue
 
-        result += f"[{element.x:.0f}x{element.y:.0f}]{element.text}"
+        # Run the text through ftfy: extracted text occasionally contains invalid surrogate pairs and other
+        # UTF issues that cause pyarrow to fail when it loads the resulting JSON later
+        result += f"[{element.x:.0f}x{element.y:.0f}]{ftfy.fix_text(element.text)}"
 
     return result
diff --git a/pyproject.toml b/pyproject.toml
index 7ab29ac..23ffcc4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
   "pypdfium2",
   "lingua-language-detector",
   "Pillow",
+  "ftfy",
 ]
 license = {file = "LICENSE"}
 
diff --git a/tests/gnarly_pdfs/badlines.pdf b/tests/gnarly_pdfs/badlines.pdf
new file mode 100644
index 0000000..7e9ea57
Binary files /dev/null and b/tests/gnarly_pdfs/badlines.pdf differ
diff --git a/tests/test_anchor.py b/tests/test_anchor.py
index 810eb54..4d660a2 100644
--- a/tests/test_anchor.py
+++ b/tests/test_anchor.py
@@ -1,11 +1,13 @@
 import unittest
 import os
 import json
+import io
 
 from pypdf import PdfReader
 
 from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text
 
+
 class AnchorTest(unittest.TestCase):
     def testExtractText(self):
         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
@@ -48,6 +50,24 @@ class AnchorTest(unittest.TestCase):
 
         print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))
 
+    def testBadUTFSurrogatePairsGeneration(self):
+        """Anchor text from badlines.pdf must serialize to JSON that pyarrow can load."""
+        # pyarrow is only used by this test, so it is imported locally
+        import pyarrow.json as paj
+
+        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "badlines.pdf")
+        anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport")
+
+        # Encode once so that block_size below is measured in bytes rather than in characters
+        payload = json.dumps({"text": anchor_text}).encode("utf-8")
+
+        # Read the whole document as one block; read_json raises if the bytes cannot be parsed
+        read_options = paj.ReadOptions(use_threads=False, block_size=len(payload))
+        table = paj.read_json(io.BytesIO(payload), read_options=read_options)
+
+        self.assertEqual(table.num_rows, 1)
+
+
 class BuildSilverTest(unittest.TestCase):
     def testSmallPage(self):
         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")