Fix for Unicode errors in big datasets going forward

2025-11-05 20:38:40 +00:00 · 2024-10-07 17:01:59 +00:00 · 2024-10-07 17:01:59 +00:00 · 5d35461dd2
commit 5d35461dd2
parent 44bcdc771b
5 changed files with 26 additions and 1 deletions
--- a/gantry-requirements.txt
+++ b/gantry-requirements.txt
@ -33,3 +33,4 @@ omegaconf
 s3fs
 transformers>=4.45.1
 bitsandbytes
+ftfy
--- a/pdelfin/prompts/anchor.py
+++ b/pdelfin/prompts/anchor.py
@ -11,6 +11,7 @@
 import subprocess
 import sys
 import json
+import ftfy
 from dataclasses import dataclass
 from typing import Literal, List

@ -163,6 +164,8 @@ def _linearize_pdf_report(report: PageReport) -> str:
            if len(element.text.strip()) == 0:
                continue

-            result += f"[{element.x:.0f}x{element.y:.0f}]{element.text}"
+            # Need to use ftfy to fix text, because occasionally there are invalid surrogate pairs and other UTF issues that cause
+            # pyarrow to fail to load the json later
+            result += f"[{element.x:.0f}x{element.y:.0f}]{ftfy.fix_text(element.text)}"

    return result
--- a/pyproject.toml
+++ b/pyproject.toml
@ -26,6 +26,7 @@ dependencies = [
  "pypdfium2",
  "lingua-language-detector",
  "Pillow",
+  "ftfy"
 ]
 license = {file = "LICENSE"}

--- a/tests/gnarly_pdfs/badlines.pdf
+++ b/tests/gnarly_pdfs/badlines.pdf
--- a/tests/test_anchor.py
+++ b/tests/test_anchor.py
@ -1,11 +1,13 @@
 import unittest
 import os
 import json
+import io

 from pypdf import PdfReader

 from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text

+
 class AnchorTest(unittest.TestCase):
    def testExtractText(self):
        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
@ -48,6 +50,24 @@ class AnchorTest(unittest.TestCase):

        print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))

+    def testBadUTFSurrogatePairsGeneration(self):
+        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "badlines.pdf")
+
+        anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport")
+
+        jsondata = json.dumps({
+            "text": anchor_text
+        })
+
+        import pyarrow as pa
+        import pyarrow.json as paj
+        import pyarrow.compute as pc
+
+        buffer = io.BytesIO(jsondata.encode('utf-8'))
+        paj.read_json(buffer, read_options=paj.ReadOptions(use_threads=False, block_size=len(jsondata)))
+
+
+
 class BuildSilverTest(unittest.TestCase):
    def testSmallPage(self):
        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")