Fix for unicode errors in big datasets for the future

2025-11-05 12:29:31 +00:00 · 2024-10-07 17:01:59 +00:00 · 2024-10-07 17:01:59 +00:00 · 5d35461dd2
commit 5d35461dd2
parent 44bcdc771b
5 changed files with 26 additions and 1 deletions
--- a/gantry-requirements.txt
+++ b/gantry-requirements.txt
@ -33,3 +33,4 @@ omegaconf
 s3fs
 transformers>=4.45.1
 bitsandbytes
 ftfy
--- a/pdelfin/prompts/anchor.py
+++ b/pdelfin/prompts/anchor.py
@ -11,6 +11,7 @@
 import subprocess
 import sys
 import json
 import ftfy
 from dataclasses import dataclass
 from typing import Literal, List
@ -163,6 +164,8 @@ def _linearize_pdf_report(report: PageReport) -> str:
            if len(element.text.strip()) == 0:
                continue
-            result += f"[{element.x:.0f}x{element.y:.0f}]{element.text}"
+            # Need to use ftfy to fix text, because occasionally there are invalid surrogate pairs and other UTF issues that cause
            # pyarrow to fail to load the json later
            result += f"[{element.x:.0f}x{element.y:.0f}]{ftfy.fix_text(element.text)}"
    return result
--- a/pyproject.toml
+++ b/pyproject.toml
@ -26,6 +26,7 @@ dependencies = [
  "pypdfium2",
  "lingua-language-detector",
  "Pillow",
  "ftfy"
 ]
 license = {file = "LICENSE"}
--- a/tests/gnarly_pdfs/badlines.pdf
+++ b/tests/gnarly_pdfs/badlines.pdf
--- a/tests/test_anchor.py
+++ b/tests/test_anchor.py
@ -1,11 +1,13 @@
 import unittest
 import os
 import json
 import io
 from pypdf import PdfReader
 from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text
 class AnchorTest(unittest.TestCase):
    def testExtractText(self):
        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
@ -48,6 +50,24 @@ class AnchorTest(unittest.TestCase):
        print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))
    def testBadUTFSurrogatePairsGeneration(self):
        """Check that anchor text from ``badlines.pdf`` can be loaded as JSON by pyarrow.

        The anchor text is generated with the ``pdfreport`` engine (page
        argument 4), wrapped in a one-field JSON object, and handed to
        ``pyarrow.json.read_json``.  The test has no explicit assertion: it
        passes when pyarrow parses the document without raising.
        """
        # pyarrow is only needed by this test, so it is imported locally.
        import pyarrow.json as paj

        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "badlines.pdf")
        anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport")

        payload = json.dumps({
            "text": anchor_text
        }).encode('utf-8')

        # block_size is expressed in bytes, so derive it from the encoded
        # payload rather than from the character count of the JSON string.
        paj.read_json(
            io.BytesIO(payload),
            read_options=paj.ReadOptions(use_threads=False, block_size=len(payload)),
        )
 class BuildSilverTest(unittest.TestCase):
    def testSmallPage(self):
        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")