mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-02 19:13:53 +00:00
Fixing issues with pdf parsing
This commit is contained in:
parent
232c445a23
commit
85e0e2a61b
@ -21,7 +21,7 @@ requires-python = ">=3.8"
|
||||
dependencies = [
|
||||
"cached-path",
|
||||
"smart_open",
|
||||
"pypdf",
|
||||
"pypdf>=5.1.0",
|
||||
"pymupdf",
|
||||
"pypdfium2",
|
||||
"cryptography",
|
||||
|
||||
BIN
tests/gnarly_pdfs/failing_pdf_pg9.pdf
Normal file
BIN
tests/gnarly_pdfs/failing_pdf_pg9.pdf
Normal file
Binary file not shown.
BIN
tests/gnarly_pdfs/not_parsing.pdf
Normal file
BIN
tests/gnarly_pdfs/not_parsing.pdf
Normal file
Binary file not shown.
BIN
tests/gnarly_pdfs/not_parsing2.pdf
Normal file
BIN
tests/gnarly_pdfs/not_parsing2.pdf
Normal file
Binary file not shown.
@ -74,7 +74,7 @@ class AnchorTest(unittest.TestCase):
|
||||
|
||||
print(anchor_text)
|
||||
print(len(anchor_text))
|
||||
self.assertLess(len(anchor_text), 1000)
|
||||
self.assertLessEqual(len(anchor_text), 1000)
|
||||
|
||||
def testLargePromptHint2(self):
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "large_prompt_hint2.pdf")
|
||||
@ -83,7 +83,7 @@ class AnchorTest(unittest.TestCase):
|
||||
|
||||
print(anchor_text)
|
||||
print(len(anchor_text))
|
||||
self.assertLess(len(anchor_text), 4000)
|
||||
self.assertLessEqual(len(anchor_text), 4000)
|
||||
|
||||
def testLargePromptHint3(self):
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "large_prompt_hint3.pdf")
|
||||
@ -92,7 +92,7 @@ class AnchorTest(unittest.TestCase):
|
||||
|
||||
print(anchor_text)
|
||||
print(len(anchor_text))
|
||||
self.assertLess(len(anchor_text), 4000)
|
||||
self.assertLessEqual(len(anchor_text), 4000)
|
||||
|
||||
def testNewsPaperPromptHint(self):
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "newspaper.pdf")
|
||||
@ -101,7 +101,7 @@ class AnchorTest(unittest.TestCase):
|
||||
|
||||
print(anchor_text)
|
||||
print(len(anchor_text))
|
||||
self.assertLess(len(anchor_text), 4000)
|
||||
self.assertLessEqual(len(anchor_text), 4000)
|
||||
|
||||
def testTobaccoPaperMissingParagraphs(self):
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf")
|
||||
@ -110,7 +110,7 @@ class AnchorTest(unittest.TestCase):
|
||||
|
||||
print(anchor_text)
|
||||
print(len(anchor_text))
|
||||
self.assertLess(len(anchor_text), 4000)
|
||||
self.assertLessEqual(len(anchor_text), 4000)
|
||||
|
||||
def testAnchorOtherLengths(self):
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf")
|
||||
@ -119,13 +119,13 @@ class AnchorTest(unittest.TestCase):
|
||||
|
||||
print(anchor_text)
|
||||
print(len(anchor_text))
|
||||
self.assertLess(len(anchor_text), 2000)
|
||||
self.assertLessEqual(len(anchor_text), 2000)
|
||||
|
||||
anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=6000)
|
||||
|
||||
print(anchor_text)
|
||||
print(len(anchor_text))
|
||||
self.assertLess(len(anchor_text), 6000)
|
||||
self.assertLessEqual(len(anchor_text), 6000)
|
||||
|
||||
def testFailingAnchor(self):
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "failing_anchor_pg4.pdf")
|
||||
|
||||
@ -126,6 +126,42 @@ class TestBuildDolmaDoc(unittest.TestCase):
|
||||
|
||||
|
||||
class TestBuildPageQuery(unittest.TestCase):
|
||||
def testNotParsing(self):
|
||||
file = os.path.join(
|
||||
os.path.dirname(__file__),
|
||||
"gnarly_pdfs",
|
||||
"not_parsing.pdf"
|
||||
)
|
||||
|
||||
for page in range(1,9):
|
||||
query = build_page_query(file, "not_parsing.pdf", page, 1024, 6000)
|
||||
print(query)
|
||||
|
||||
def testNotParsing2(self):
|
||||
file = os.path.join(
|
||||
os.path.dirname(__file__),
|
||||
"gnarly_pdfs",
|
||||
"not_parsing2.pdf"
|
||||
)
|
||||
|
||||
for page in range(1,10):
|
||||
query = build_page_query(file, "not_parsing2.pdf", page, 1024, 6000)
|
||||
print(query)
|
||||
|
||||
def testNotParsingHugeMemoryUsage(self):
|
||||
file = os.path.join(
|
||||
os.path.dirname(__file__),
|
||||
"gnarly_pdfs",
|
||||
"failing_pdf_pg9.pdf"
|
||||
)
|
||||
|
||||
print("Starting to parse bad pdf")
|
||||
|
||||
query = build_page_query(file, "failing_pdf_pg9.pdf", 9, 1024, 6000)
|
||||
|
||||
print(query)
|
||||
|
||||
|
||||
def testRotation(self):
|
||||
# First, generate and save the non-rotated image
|
||||
query = build_page_query(os.path.join(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user