feat: Add page breaks to default PDF to Document converter (#6755)

* Speedup tests for PyPDFToDocument * Added unit test and removed skipping of empty pages * add release note * Add back some integration marks
2025-12-03 02:16:34 +00:00 · 2024-01-18 08:54:59 +01:00 · 2024-01-18 08:54:59 +01:00 · c0b67432e4
commit c0b67432e4
parent eaec5bfe4a
4 changed files with 44 additions and 27 deletions
--- a/haystack/components/converters/pypdf.py
+++ b/haystack/components/converters/pypdf.py
@ -31,7 +31,7 @@ class DefaultConverter:
    def convert(self, reader: "PdfReader") -> Document:
        """Extract text from the PDF and return a Document object with the text content."""
-        text = "".join(page.extract_text() for page in reader.pages if page.extract_text())
+        text = "\f".join(page.extract_text() for page in reader.pages)
        return Document(content=text)
--- a/haystack/components/converters/utils.py
+++ b/haystack/components/converters/utils.py
@ -28,7 +28,7 @@ def normalize_metadata(
    makes sure to return a list of dictionaries of the correct length for the converter to use.
    :param meta: the meta input of the converter, as-is
-    :sources_count: the number of sources the converter received
+    :param sources_count: the number of sources the converter received
    :returns: a list of dictionaries of the make length as the sources list
    """
    if meta is None:
--- a/releasenotes/notes/pypdf-page-breaks-b6842d93f4c69185.yaml
+++ b/releasenotes/notes/pypdf-page-breaks-b6842d93f4c69185.yaml
@ -0,0 +1,7 @@
 ---
 upgrade:
  - |
    Upgraded the default converter in PyPDFToDocument to insert page breaks "\f"
    between each extracted page.
    This allows for downstream components and applications to better be able to
    keep track of the original PDF page a portion of text comes from.
--- a/test/components/converters/test_pypdf_to_document.py
+++ b/test/components/converters/test_pypdf_to_document.py
@ -7,35 +7,45 @@ from haystack.components.converters.pypdf import PyPDFToDocument, CONVERTERS_REG
 from haystack.dataclasses import ByteStream
-@pytest.mark.integration
+@pytest.fixture
 def pypdf_converter():
    return PyPDFToDocument()
 class TestPyPDFToDocument:
-    def test_init(self):
+    def test_init(self, pypdf_converter):
-        component = PyPDFToDocument()
+        assert pypdf_converter.converter_name == "default"
-        assert component.converter_name == "default"
+        assert hasattr(pypdf_converter, "_converter")
        assert hasattr(component, "_converter")
    def test_init_fail_nonexisting_converter(self):
        with pytest.raises(ValueError):
            PyPDFToDocument(converter_name="non_existing_converter")
-    def test_run(self, test_files_path):
+    @pytest.mark.integration
    def test_run(self, test_files_path, pypdf_converter):
        """
        Test if the component runs correctly.
        """
-        paths = [test_files_path / "pdf" / "react_paper.pdf"]
+        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
-        converter = PyPDFToDocument()
+        output = pypdf_converter.run(sources=paths)
        output = converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
-        assert "ReAct" in docs[0].content
+        assert "History" in docs[0].content
-    def test_run_with_meta(self, test_files_path):
+    @pytest.mark.integration
    def test_page_breaks_added(self, test_files_path, pypdf_converter):
        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
        output = pypdf_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert docs[0].content.count("\f") == 3
    def test_run_with_meta(self, test_files_path, pypdf_converter):
        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
        converter = PyPDFToDocument()
        with patch("haystack.components.converters.pypdf.PdfReader"):
-            output = converter.run(
+            output = pypdf_converter.run(
-                sources=[bytestream, test_files_path / "pdf" / "react_paper.pdf"], meta={"language": "it"}
+                sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"}
            )
        # check that the metadata from the bytestream is merged with that from the meta parameter
@ -43,38 +53,38 @@ class TestPyPDFToDocument:
        assert output["documents"][0].meta["language"] == "it"
        assert output["documents"][1].meta["language"] == "it"
-    def test_run_error_handling(self, test_files_path, caplog):
+    def test_run_error_handling(self, test_files_path, pypdf_converter, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = ["non_existing_file.pdf"]
        converter = PyPDFToDocument()
        with caplog.at_level(logging.WARNING):
-            converter.run(sources=paths)
+            pypdf_converter.run(sources=paths)
            assert "Could not read non_existing_file.pdf" in caplog.text
-    def test_mixed_sources_run(self, test_files_path):
+    @pytest.mark.integration
    def test_mixed_sources_run(self, test_files_path, pypdf_converter):
        """
        Test if the component runs correctly when mixed sources are provided.
        """
-        paths = [test_files_path / "pdf" / "react_paper.pdf"]
+        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
-        with open(test_files_path / "pdf" / "react_paper.pdf", "rb") as f:
+        with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f:
            paths.append(ByteStream(f.read()))
-        converter = PyPDFToDocument()
+        output = pypdf_converter.run(sources=paths)
        output = converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 2
-        assert "ReAct" in docs[0].content
+        assert "History and standardization" in docs[0].content
-        assert "ReAct" in docs[1].content
+        assert "History and standardization" in docs[1].content
    @pytest.mark.integration
    def test_custom_converter(self, test_files_path):
        """
        Test if the component correctly handles custom converters.
        """
        from pypdf import PdfReader
-        paths = [test_files_path / "pdf" / "react_paper.pdf"]
+        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
        class MyCustomConverter:
            def convert(self, reader: PdfReader) -> Document: