mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-02 19:13:53 +00:00
Fix for Unicode errors in big datasets going forward
This commit is contained in:
parent
44bcdc771b
commit
5d35461dd2
@ -33,3 +33,4 @@ omegaconf
|
||||
s3fs
|
||||
transformers>=4.45.1
|
||||
bitsandbytes
|
||||
ftfy
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
import subprocess
|
||||
import sys
|
||||
import json
|
||||
import ftfy
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, List
|
||||
|
||||
@ -163,6 +164,8 @@ def _linearize_pdf_report(report: PageReport) -> str:
|
||||
if len(element.text.strip()) == 0:
|
||||
continue
|
||||
|
||||
result += f"[{element.x:.0f}x{element.y:.0f}]{element.text}"
|
||||
# Use ftfy to fix the text, because occasionally there are invalid surrogate pairs and other Unicode encoding issues that cause
|
||||
# pyarrow to fail to load the JSON later
|
||||
result += f"[{element.x:.0f}x{element.y:.0f}]{ftfy.fix_text(element.text)}"
|
||||
|
||||
return result
|
||||
|
||||
@ -26,6 +26,7 @@ dependencies = [
|
||||
"pypdfium2",
|
||||
"lingua-language-detector",
|
||||
"Pillow",
|
||||
"ftfy"
|
||||
]
|
||||
license = {file = "LICENSE"}
|
||||
|
||||
|
||||
BIN
tests/gnarly_pdfs/badlines.pdf
Normal file
BIN
tests/gnarly_pdfs/badlines.pdf
Normal file
Binary file not shown.
@ -1,11 +1,13 @@
|
||||
import unittest
|
||||
import os
|
||||
import json
|
||||
import io
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text
|
||||
|
||||
|
||||
class AnchorTest(unittest.TestCase):
|
||||
def testExtractText(self):
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
|
||||
@ -48,6 +50,24 @@ class AnchorTest(unittest.TestCase):
|
||||
|
||||
print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))
|
||||
|
||||
def testBadUTFSurrogatePairsGeneration(self):
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "badlines.pdf")
|
||||
|
||||
anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport")
|
||||
|
||||
jsondata = json.dumps({
|
||||
"text": anchor_text
|
||||
})
|
||||
|
||||
import pyarrow as pa
|
||||
import pyarrow.json as paj
|
||||
import pyarrow.compute as pc
|
||||
|
||||
buffer = io.BytesIO(jsondata.encode('utf-8'))
|
||||
paj.read_json(buffer, read_options=paj.ReadOptions(use_threads=False, block_size=len(jsondata)))
|
||||
|
||||
|
||||
|
||||
class BuildSilverTest(unittest.TestCase):
|
||||
def testSmallPage(self):
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user