Fixing issues with PDF parsing

This commit is contained in:
Jake Poznanski 2024-10-30 16:26:02 +00:00
parent 232c445a23
commit 85e0e2a61b
6 changed files with 44 additions and 8 deletions

View File

@ -21,7 +21,7 @@ requires-python = ">=3.8"
dependencies = [
"cached-path",
"smart_open",
"pypdf",
"pypdf>=5.1.0",
"pymupdf",
"pypdfium2",
"cryptography",

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -74,7 +74,7 @@ class AnchorTest(unittest.TestCase):
print(anchor_text)
print(len(anchor_text))
self.assertLess(len(anchor_text), 1000)
self.assertLessEqual(len(anchor_text), 1000)
def testLargePromptHint2(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "large_prompt_hint2.pdf")
@ -83,7 +83,7 @@ class AnchorTest(unittest.TestCase):
print(anchor_text)
print(len(anchor_text))
self.assertLess(len(anchor_text), 4000)
self.assertLessEqual(len(anchor_text), 4000)
def testLargePromptHint3(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "large_prompt_hint3.pdf")
@ -92,7 +92,7 @@ class AnchorTest(unittest.TestCase):
print(anchor_text)
print(len(anchor_text))
self.assertLess(len(anchor_text), 4000)
self.assertLessEqual(len(anchor_text), 4000)
def testNewsPaperPromptHint(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "newspaper.pdf")
@ -101,7 +101,7 @@ class AnchorTest(unittest.TestCase):
print(anchor_text)
print(len(anchor_text))
self.assertLess(len(anchor_text), 4000)
self.assertLessEqual(len(anchor_text), 4000)
def testTobaccoPaperMissingParagraphs(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf")
@ -110,7 +110,7 @@ class AnchorTest(unittest.TestCase):
print(anchor_text)
print(len(anchor_text))
self.assertLess(len(anchor_text), 4000)
self.assertLessEqual(len(anchor_text), 4000)
def testAnchorOtherLengths(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf")
@ -119,13 +119,13 @@ class AnchorTest(unittest.TestCase):
print(anchor_text)
print(len(anchor_text))
self.assertLess(len(anchor_text), 2000)
self.assertLessEqual(len(anchor_text), 2000)
anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=6000)
print(anchor_text)
print(len(anchor_text))
self.assertLess(len(anchor_text), 6000)
self.assertLessEqual(len(anchor_text), 6000)
def testFailingAnchor(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "failing_anchor_pg4.pdf")

View File

@ -126,6 +126,42 @@ class TestBuildDolmaDoc(unittest.TestCase):
class TestBuildPageQuery(unittest.TestCase):
# Smoke test: build a page query for pages 1 through 8 of the
# "not_parsing.pdf" fixture. Nothing is asserted, so the test passes as long
# as build_page_query does not raise.
def testNotParsing(self):
# Path to the fixture inside the "gnarly_pdfs" directory that sits next to
# this test module.
file = os.path.join(
os.path.dirname(__file__),
"gnarly_pdfs",
"not_parsing.pdf"
)
# range(1,9) yields pages 1..8; page numbers passed here start at 1.
for page in range(1,9):
# NOTE(review): 1024 and 6000 are passed positionally; presumably an image
# size and a target anchor-text length -- confirm against the signature of
# build_page_query.
query = build_page_query(file, "not_parsing.pdf", page, 1024, 6000)
# NOTE(review): indentation was lost in this view, so it is unclear whether
# this print runs once per page or once after the loop -- confirm.
print(query)
# Smoke test: build a page query for pages 1 through 9 of the
# "not_parsing2.pdf" fixture. Nothing is asserted, so the test passes as long
# as build_page_query does not raise.
def testNotParsing2(self):
# Path to the fixture inside the "gnarly_pdfs" directory that sits next to
# this test module.
file = os.path.join(
os.path.dirname(__file__),
"gnarly_pdfs",
"not_parsing2.pdf"
)
# range(1,10) yields pages 1..9; page numbers passed here start at 1.
for page in range(1,10):
# NOTE(review): 1024 and 6000 are passed positionally; presumably an image
# size and a target anchor-text length -- confirm against the signature of
# build_page_query.
query = build_page_query(file, "not_parsing2.pdf", page, 1024, 6000)
# NOTE(review): indentation was lost in this view, so it is unclear whether
# this print runs once per page or once after the loop -- confirm.
print(query)
def testNotParsingHugeMemoryUsage(self):
    """Smoke-test building a page query for page 9 of ``failing_pdf_pg9.pdf``.

    The fixture is read from the ``gnarly_pdfs`` directory next to this
    test module. No assertions are made and memory is not measured here;
    the test passes as long as ``build_page_query`` returns without raising.
    """
    pdf_name = "failing_pdf_pg9.pdf"
    fixtures_dir = os.path.join(os.path.dirname(__file__), "gnarly_pdfs")
    pdf_path = os.path.join(fixtures_dir, pdf_name)

    # Progress marker printed before the call, so a hang or crash inside
    # build_page_query can be located in the test output.
    print("Starting to parse bad pdf")
    print(build_page_query(pdf_path, pdf_name, 9, 1024, 6000))
def testRotation(self):
# First, generate and save the non-rotated image
query = build_page_query(os.path.join(