Fix Unicode errors in big datasets going forward

This commit is contained in:
Jake Poznanski 2024-10-07 17:01:59 +00:00
parent 44bcdc771b
commit 5d35461dd2
5 changed files with 26 additions and 1 deletions

View File

@ -33,3 +33,4 @@ omegaconf
s3fs s3fs
transformers>=4.45.1 transformers>=4.45.1
bitsandbytes bitsandbytes
ftfy

View File

@ -11,6 +11,7 @@
import subprocess import subprocess
import sys import sys
import json import json
import ftfy
from dataclasses import dataclass from dataclasses import dataclass
from typing import Literal, List from typing import Literal, List
@ -163,6 +164,8 @@ def _linearize_pdf_report(report: PageReport) -> str:
if len(element.text.strip()) == 0: if len(element.text.strip()) == 0:
continue continue
result += f"[{element.x:.0f}x{element.y:.0f}]{element.text}" # Need to use ftfy to fix text, because occasionally there are invalid surrogate pairs and other UTF issues that cause
# pyarrow to fail to load the json later
result += f"[{element.x:.0f}x{element.y:.0f}]{ftfy.fix_text(element.text)}"
return result return result

View File

@ -26,6 +26,7 @@ dependencies = [
"pypdfium2", "pypdfium2",
"lingua-language-detector", "lingua-language-detector",
"Pillow", "Pillow",
"ftfy"
] ]
license = {file = "LICENSE"} license = {file = "LICENSE"}

Binary file not shown.

View File

@ -1,11 +1,13 @@
import unittest import unittest
import os import os
import json import json
import io
from pypdf import PdfReader from pypdf import PdfReader
from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text
class AnchorTest(unittest.TestCase): class AnchorTest(unittest.TestCase):
def testExtractText(self): def testExtractText(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf") local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
@ -48,6 +50,24 @@ class AnchorTest(unittest.TestCase):
print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport")) print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))
def testBadUTFSurrogatePairsGeneration(self):
    """Anchor text from a PDF with bad UTF sequences must load in pyarrow.

    Builds the anchor text for page 4 of ``gnarly_pdfs/badlines.pdf``,
    serializes it with ``json.dumps``, and feeds the encoded bytes to
    ``pyarrow.json.read_json``. The test fails if pyarrow raises while
    parsing, or if the parsed table is not a single row with one
    ``text`` column.
    """
    # Imported locally so the rest of the test module does not need pyarrow.
    import pyarrow.json as paj

    local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "badlines.pdf")
    anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport")

    # Encode once so block_size is measured on the bytes pyarrow actually reads.
    payload = json.dumps({"text": anchor_text}).encode("utf-8")

    table = paj.read_json(
        io.BytesIO(payload),
        read_options=paj.ReadOptions(use_threads=False, block_size=len(payload)),
    )

    # Not raising is the main check; also confirm the record round-tripped.
    self.assertEqual(table.num_rows, 1)
    self.assertEqual(table.column_names, ["text"])
class BuildSilverTest(unittest.TestCase): class BuildSilverTest(unittest.TestCase):
def testSmallPage(self): def testSmallPage(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf") local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")