Fix Unicode errors in large datasets going forward

This commit is contained in:
Jake Poznanski 2024-10-07 17:01:59 +00:00
parent 44bcdc771b
commit 5d35461dd2
5 changed files with 26 additions and 1 deletions

View File

@ -33,3 +33,4 @@ omegaconf
s3fs
transformers>=4.45.1
bitsandbytes
ftfy

View File

@ -11,6 +11,7 @@
import subprocess
import sys
import json
import ftfy
from dataclasses import dataclass
from typing import Literal, List
@ -163,6 +164,8 @@ def _linearize_pdf_report(report: PageReport) -> str:
if len(element.text.strip()) == 0:
continue
result += f"[{element.x:.0f}x{element.y:.0f}]{element.text}"
# Use ftfy to repair the text: it occasionally contains invalid surrogate pairs and other Unicode issues that cause
# pyarrow to fail when loading the JSON later
result += f"[{element.x:.0f}x{element.y:.0f}]{ftfy.fix_text(element.text)}"
return result

View File

@ -26,6 +26,7 @@ dependencies = [
"pypdfium2",
"lingua-language-detector",
"Pillow",
"ftfy"
]
license = {file = "LICENSE"}

Binary file not shown.

View File

@ -1,11 +1,13 @@
import unittest
import os
import json
import io
from pypdf import PdfReader
from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text
class AnchorTest(unittest.TestCase):
def testExtractText(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
@ -48,6 +50,24 @@ class AnchorTest(unittest.TestCase):
print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))
def testBadUTFSurrogatePairsGeneration(self):
    """Regression test: anchor text must serialize to JSON that pyarrow can parse.

    Extracts anchor text from ``gnarly_pdfs/badlines.pdf`` with the
    ``"pdfreport"`` engine (second argument ``4`` is presumably the page
    number -- confirm against ``get_anchor_text``), wraps it in a one-field
    JSON document, and feeds that document to ``pyarrow.json.read_json``.
    The test fails if pyarrow raises while parsing or does not yield
    exactly one row.
    """
    # Imported locally so pyarrow is only required when this test runs.
    import pyarrow.json as paj

    local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "badlines.pdf")
    anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport")

    payload = json.dumps({"text": anchor_text}).encode("utf-8")

    # Size the block from the encoded bytes (what pyarrow actually reads),
    # not from the str length, so the whole document fits in one block.
    read_options = paj.ReadOptions(use_threads=False, block_size=len(payload))
    table = paj.read_json(io.BytesIO(payload), read_options=read_options)

    # One JSON object in, one row out.
    self.assertEqual(table.num_rows, 1)
class BuildSilverTest(unittest.TestCase):
def testSmallPage(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")