mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-05 12:29:31 +00:00
Fix Unicode errors in big datasets going forward
This commit is contained in:
parent
44bcdc771b
commit
5d35461dd2
@ -33,3 +33,4 @@ omegaconf
|
|||||||
s3fs
|
s3fs
|
||||||
transformers>=4.45.1
|
transformers>=4.45.1
|
||||||
bitsandbytes
|
bitsandbytes
|
||||||
|
ftfy
|
||||||
|
|||||||
@ -11,6 +11,7 @@
|
|||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
|
import ftfy
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Literal, List
|
from typing import Literal, List
|
||||||
|
|
||||||
@ -163,6 +164,8 @@ def _linearize_pdf_report(report: PageReport) -> str:
|
|||||||
if len(element.text.strip()) == 0:
|
if len(element.text.strip()) == 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
result += f"[{element.x:.0f}x{element.y:.0f}]{element.text}"
|
# Use ftfy to repair the text, because occasionally there are invalid surrogate pairs and other Unicode encoding issues that cause
|
||||||
|
# pyarrow to fail when loading the JSON later
|
||||||
|
result += f"[{element.x:.0f}x{element.y:.0f}]{ftfy.fix_text(element.text)}"
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|||||||
@ -26,6 +26,7 @@ dependencies = [
|
|||||||
"pypdfium2",
|
"pypdfium2",
|
||||||
"lingua-language-detector",
|
"lingua-language-detector",
|
||||||
"Pillow",
|
"Pillow",
|
||||||
|
"ftfy"
|
||||||
]
|
]
|
||||||
license = {file = "LICENSE"}
|
license = {file = "LICENSE"}
|
||||||
|
|
||||||
|
|||||||
BIN
tests/gnarly_pdfs/badlines.pdf
Normal file
BIN
tests/gnarly_pdfs/badlines.pdf
Normal file
Binary file not shown.
@ -1,11 +1,13 @@
|
|||||||
import unittest
|
import unittest
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
|
import io
|
||||||
|
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader
|
||||||
|
|
||||||
from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text
|
from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text
|
||||||
|
|
||||||
|
|
||||||
class AnchorTest(unittest.TestCase):
|
class AnchorTest(unittest.TestCase):
|
||||||
def testExtractText(self):
|
def testExtractText(self):
|
||||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
|
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
|
||||||
@ -48,6 +50,24 @@ class AnchorTest(unittest.TestCase):
|
|||||||
|
|
||||||
print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))
|
print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))
|
||||||
|
|
||||||
|
def testBadUTFSurrogatePairsGeneration(self):
|
||||||
|
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "badlines.pdf")
|
||||||
|
|
||||||
|
anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport")
|
||||||
|
|
||||||
|
jsondata = json.dumps({
|
||||||
|
"text": anchor_text
|
||||||
|
})
|
||||||
|
|
||||||
|
import pyarrow as pa
|
||||||
|
import pyarrow.json as paj
|
||||||
|
import pyarrow.compute as pc
|
||||||
|
|
||||||
|
buffer = io.BytesIO(jsondata.encode('utf-8'))
|
||||||
|
paj.read_json(buffer, read_options=paj.ReadOptions(use_threads=False, block_size=len(jsondata)))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class BuildSilverTest(unittest.TestCase):
|
class BuildSilverTest(unittest.TestCase):
|
||||||
def testSmallPage(self):
|
def testSmallPage(self):
|
||||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")
|
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user