feat: Add page breaks to default PDF to Document converter (#6755)

* Speedup tests for PyPDFToDocument * Added unit test and removed skipping of empty pages * add release note * Add back some integration marks
2025-12-03 02:16:34 +00:00 · 2024-01-18 08:54:59 +01:00 · 2024-01-18 08:54:59 +01:00 · c0b67432e4
commit c0b67432e4
parent eaec5bfe4a
4 changed files with 44 additions and 27 deletions
--- a/haystack/components/converters/pypdf.py
+++ b/haystack/components/converters/pypdf.py
@ -31,7 +31,7 @@ class DefaultConverter:

    def convert(self, reader: "PdfReader") -> Document:
        """Extract text from the PDF and return a Document object with the text content."""
-        text = "".join(page.extract_text() for page in reader.pages if page.extract_text())
+        text = "\f".join(page.extract_text() for page in reader.pages)
        return Document(content=text)


--- a/haystack/components/converters/utils.py
+++ b/haystack/components/converters/utils.py
@ -28,7 +28,7 @@ def normalize_metadata(
    makes sure to return a list of dictionaries of the correct length for the converter to use.

    :param meta: the meta input of the converter, as-is
-    :sources_count: the number of sources the converter received
+    :param sources_count: the number of sources the converter received
    :returns: a list of dictionaries of the make length as the sources list
    """
    if meta is None:
--- a/releasenotes/notes/pypdf-page-breaks-b6842d93f4c69185.yaml
+++ b/releasenotes/notes/pypdf-page-breaks-b6842d93f4c69185.yaml
@ -0,0 +1,7 @@
+---
+upgrade:
+  - |
+    Upgraded the default converter in PyPDFToDocument to insert page breaks "\f"
+    between each extracted page.
+    This allows for downstream components and applications to better be able to
+    keep track of the original PDF page a portion of text comes from.
--- a/test/components/converters/test_pypdf_to_document.py
+++ b/test/components/converters/test_pypdf_to_document.py
@ -7,35 +7,45 @@ from haystack.components.converters.pypdf import PyPDFToDocument, CONVERTERS_REG
 from haystack.dataclasses import ByteStream


-@pytest.mark.integration
+@pytest.fixture
+def pypdf_converter():
+    return PyPDFToDocument()
+
+
 class TestPyPDFToDocument:
-    def test_init(self):
-        component = PyPDFToDocument()
-        assert component.converter_name == "default"
-        assert hasattr(component, "_converter")
+    def test_init(self, pypdf_converter):
+        assert pypdf_converter.converter_name == "default"
+        assert hasattr(pypdf_converter, "_converter")

    def test_init_fail_nonexisting_converter(self):
        with pytest.raises(ValueError):
            PyPDFToDocument(converter_name="non_existing_converter")

-    def test_run(self, test_files_path):
+    @pytest.mark.integration
+    def test_run(self, test_files_path, pypdf_converter):
        """
        Test if the component runs correctly.
        """
-        paths = [test_files_path / "pdf" / "react_paper.pdf"]
-        converter = PyPDFToDocument()
-        output = converter.run(sources=paths)
+        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
+        output = pypdf_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
-        assert "ReAct" in docs[0].content
+        assert "History" in docs[0].content

-    def test_run_with_meta(self, test_files_path):
+    @pytest.mark.integration
+    def test_page_breaks_added(self, test_files_path, pypdf_converter):
+        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
+        output = pypdf_converter.run(sources=paths)
+        docs = output["documents"]
+        assert len(docs) == 1
+        assert docs[0].content.count("\f") == 3
+
+    def test_run_with_meta(self, test_files_path, pypdf_converter):
        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})

-        converter = PyPDFToDocument()
        with patch("haystack.components.converters.pypdf.PdfReader"):
-            output = converter.run(
-                sources=[bytestream, test_files_path / "pdf" / "react_paper.pdf"], meta={"language": "it"}
+            output = pypdf_converter.run(
+                sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"}
            )

        # check that the metadata from the bytestream is merged with that from the meta parameter
@ -43,38 +53,38 @@ class TestPyPDFToDocument:
        assert output["documents"][0].meta["language"] == "it"
        assert output["documents"][1].meta["language"] == "it"

-    def test_run_error_handling(self, test_files_path, caplog):
+    def test_run_error_handling(self, test_files_path, pypdf_converter, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = ["non_existing_file.pdf"]
-        converter = PyPDFToDocument()
        with caplog.at_level(logging.WARNING):
-            converter.run(sources=paths)
+            pypdf_converter.run(sources=paths)
            assert "Could not read non_existing_file.pdf" in caplog.text

-    def test_mixed_sources_run(self, test_files_path):
+    @pytest.mark.integration
+    def test_mixed_sources_run(self, test_files_path, pypdf_converter):
        """
        Test if the component runs correctly when mixed sources are provided.
        """
-        paths = [test_files_path / "pdf" / "react_paper.pdf"]
-        with open(test_files_path / "pdf" / "react_paper.pdf", "rb") as f:
+        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
+        with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f:
            paths.append(ByteStream(f.read()))

-        converter = PyPDFToDocument()
-        output = converter.run(sources=paths)
+        output = pypdf_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 2
-        assert "ReAct" in docs[0].content
-        assert "ReAct" in docs[1].content
+        assert "History and standardization" in docs[0].content
+        assert "History and standardization" in docs[1].content

+    @pytest.mark.integration
    def test_custom_converter(self, test_files_path):
        """
        Test if the component correctly handles custom converters.
        """
        from pypdf import PdfReader

-        paths = [test_files_path / "pdf" / "react_paper.pdf"]
+        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]

        class MyCustomConverter:
            def convert(self, reader: PdfReader) -> Document: