Fixing issues with PDF parsing

2025-12-04 19:21:08 +00:00 · 2024-10-30 16:26:02 +00:00 · 2024-10-30 16:26:02 +00:00 · 85e0e2a61b
commit 85e0e2a61b
parent 232c445a23
6 changed files with 44 additions and 8 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ requires-python = ">=3.8"
 dependencies = [
  "cached-path",
  "smart_open",
-  "pypdf",
+  "pypdf>=5.1.0",
  "pymupdf",
  "pypdfium2",
  "cryptography",
--- a/tests/gnarly_pdfs/failing_pdf_pg9.pdf
+++ b/tests/gnarly_pdfs/failing_pdf_pg9.pdf
--- a/tests/gnarly_pdfs/not_parsing.pdf
+++ b/tests/gnarly_pdfs/not_parsing.pdf
--- a/tests/gnarly_pdfs/not_parsing2.pdf
+++ b/tests/gnarly_pdfs/not_parsing2.pdf
--- a/tests/test_anchor.py
+++ b/tests/test_anchor.py
@@ -74,7 +74,7 @@ class AnchorTest(unittest.TestCase):

        print(anchor_text)
        print(len(anchor_text))
-        self.assertLess(len(anchor_text), 1000)
+        self.assertLessEqual(len(anchor_text), 1000)

    def testLargePromptHint2(self):
        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "large_prompt_hint2.pdf")
@@ -83,7 +83,7 @@ class AnchorTest(unittest.TestCase):

        print(anchor_text)
        print(len(anchor_text))
-        self.assertLess(len(anchor_text), 4000)
+        self.assertLessEqual(len(anchor_text), 4000)

    def testLargePromptHint3(self):
        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "large_prompt_hint3.pdf")
@@ -92,7 +92,7 @@ class AnchorTest(unittest.TestCase):

        print(anchor_text)
        print(len(anchor_text))
-        self.assertLess(len(anchor_text), 4000)
+        self.assertLessEqual(len(anchor_text), 4000)

    def testNewsPaperPromptHint(self):
        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "newspaper.pdf")
@@ -101,7 +101,7 @@ class AnchorTest(unittest.TestCase):

        print(anchor_text)
        print(len(anchor_text))
-        self.assertLess(len(anchor_text), 4000)
+        self.assertLessEqual(len(anchor_text), 4000)

    def testTobaccoPaperMissingParagraphs(self):
        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf")
@@ -110,7 +110,7 @@ class AnchorTest(unittest.TestCase):

        print(anchor_text)
        print(len(anchor_text))
-        self.assertLess(len(anchor_text), 4000)
+        self.assertLessEqual(len(anchor_text), 4000)

    def testAnchorOtherLengths(self):
        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf")
@@ -119,13 +119,13 @@ class AnchorTest(unittest.TestCase):

        print(anchor_text)
        print(len(anchor_text))
-        self.assertLess(len(anchor_text), 2000)
+        self.assertLessEqual(len(anchor_text), 2000)

        anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=6000)

        print(anchor_text)
        print(len(anchor_text))
-        self.assertLess(len(anchor_text), 6000)
+        self.assertLessEqual(len(anchor_text), 6000)

    def testFailingAnchor(self):
        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "failing_anchor_pg4.pdf")
--- a/tests/test_birrpipeline.py
+++ b/tests/test_birrpipeline.py
@@ -126,6 +126,42 @@ class TestBuildDolmaDoc(unittest.TestCase):


 class TestBuildPageQuery(unittest.TestCase):
+    def testNotParsing(self):
+        file = os.path.join(
+            os.path.dirname(__file__),
+            "gnarly_pdfs",
+            "not_parsing.pdf"
+        )
+
+        for page in range(1,9):
+            query = build_page_query(file, "not_parsing.pdf", page, 1024, 6000)
+            print(query)
+
+    def testNotParsing2(self):
+        file = os.path.join(
+            os.path.dirname(__file__),
+            "gnarly_pdfs",
+            "not_parsing2.pdf"
+        )
+
+        for page in range(1,10):
+            query = build_page_query(file, "not_parsing2.pdf", page, 1024, 6000)
+            print(query)
+
+    def testNotParsingHugeMemoryUsage(self):
+        file = os.path.join(
+            os.path.dirname(__file__),
+            "gnarly_pdfs",
+            "failing_pdf_pg9.pdf"
+        )
+
+        print("Starting to parse bad pdf")
+
+        query = build_page_query(file, "failing_pdf_pg9.pdf", 9, 1024, 6000)
+
+        print(query)
+   
+
    def testRotation(self):
        # First, generate and save the non-rotated image
        query = build_page_query(os.path.join(