feat: PyPDFToDocument - add new customization parameters (#8574)

* deprecate converter in pypdf * fix linting of MetaFieldGroupingRanker * linting * pypdftodocument: add customization params * fix mypy * incorporate feedback
2025-06-26 22:00:13 +00:00 · 2024-11-26 16:37:59 +01:00 · 2024-11-26 16:37:59 +01:00 · fb42c035c5
commit fb42c035c5
parent 2440a5ee17
3 changed files with 227 additions and 28 deletions
--- a/haystack/components/converters/pypdf.py
+++ b/haystack/components/converters/pypdf.py
@@ -4,6 +4,7 @@

 import io
 import warnings
+from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Protocol, Union

@@ -39,6 +40,33 @@ class PyPDFConverter(Protocol):
        ...


+class PyPDFExtractionMode(Enum):
+    """
+    The mode to use for extracting text from a PDF.
+    """
+
+    PLAIN = "plain"
+    LAYOUT = "layout"
+
+    def __str__(self) -> str:
+        """
+        Convert a PyPDFExtractionMode enum to a string.
+        """
+        return self.value
+
+    @staticmethod
+    def from_str(string: str) -> "PyPDFExtractionMode":
+        """
+        Convert a string to a PyPDFExtractionMode enum.
+        """
+        enum_map = {e.value: e for e in PyPDFExtractionMode}
+        mode = enum_map.get(string)
+        if mode is None:
+            msg = f"Unknown extraction mode '{string}'. Supported modes are: {list(enum_map.keys())}"
+            raise ValueError(msg)
+        return mode
+
+
@component
 class PyPDFToDocument:
    """
@@ -60,13 +88,49 @@ class PyPDFToDocument:
    ```
    """

-    def __init__(self, converter: Optional[PyPDFConverter] = None):
+    def __init__(
+        self,
+        converter: Optional[PyPDFConverter] = None,
+        *,
+        extraction_mode: Union[str, PyPDFExtractionMode] = PyPDFExtractionMode.PLAIN,
+        plain_mode_orientations: tuple = (0, 90, 180, 270),
+        plain_mode_space_width: float = 200.0,
+        layout_mode_space_vertically: bool = True,
+        layout_mode_scale_weight: float = 1.25,
+        layout_mode_strip_rotated: bool = True,
+        layout_mode_font_height_weight: float = 1.0,
+    ):
        """
        Create an PyPDFToDocument component.

        :param converter:
            An instance of a PyPDFConverter compatible class. This is deprecated and will be removed in Haystack 2.9.0.
            For in-depth customization of the conversion process, consider implementing a custom component.
+
+        All the following parameters are applied only if `converter` is None.
+
+        :param extraction_mode:
+            The mode to use for extracting text from a PDF.
+            Layout mode is an experimental mode that adheres to the rendered layout of the PDF.
+        :param plain_mode_orientations:
+            Tuple of orientations to look for when extracting text from a PDF in plain mode.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
+        :param plain_mode_space_width:
+            Forces default space width if not extracted from font.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
+        :param layout_mode_space_vertically:
+            Whether to include blank lines inferred from y distance + font height.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_scale_weight:
+            Multiplier for string length when calculating weighted average character width.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_strip_rotated:
+            Layout mode does not support rotated text. Set to `False` to include rotated text anyway.
+            If rotated text is discovered, layout will be degraded and a warning will be logged.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param layout_mode_font_height_weight:
+            Multiplier for font height when calculating blank line height.
+            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
        """
        pypdf_import.check()

@@ -79,6 +143,16 @@ class PyPDFToDocument:

        self.converter = converter

+        if isinstance(extraction_mode, str):
+            extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)
+        self.extraction_mode = extraction_mode
+        self.plain_mode_orientations = plain_mode_orientations
+        self.plain_mode_space_width = plain_mode_space_width
+        self.layout_mode_space_vertically = layout_mode_space_vertically
+        self.layout_mode_scale_weight = layout_mode_scale_weight
+        self.layout_mode_strip_rotated = layout_mode_strip_rotated
+        self.layout_mode_font_height_weight = layout_mode_font_height_weight
+
    def to_dict(self):
        """
        Serializes the component to a dictionary.
@@ -87,7 +161,15 @@ class PyPDFToDocument:
            Dictionary with serialized data.
        """
        return default_to_dict(
-            self, converter=(serialize_class_instance(self.converter) if self.converter is not None else None)
+            self,
+            converter=(serialize_class_instance(self.converter) if self.converter else None),
+            extraction_mode=str(self.extraction_mode),
+            plain_mode_orientations=self.plain_mode_orientations,
+            plain_mode_space_width=self.plain_mode_space_width,
+            layout_mode_space_vertically=self.layout_mode_space_vertically,
+            layout_mode_scale_weight=self.layout_mode_scale_weight,
+            layout_mode_strip_rotated=self.layout_mode_strip_rotated,
+            layout_mode_font_height_weight=self.layout_mode_font_height_weight,
        )

    @classmethod
@@ -108,7 +190,20 @@ class PyPDFToDocument:
        return default_from_dict(cls, data)

    def _default_convert(self, reader: "PdfReader") -> Document:
-        text = "\f".join(page.extract_text() for page in reader.pages)
+        texts = []
+        for page in reader.pages:
+            texts.append(
+                page.extract_text(
+                    orientations=self.plain_mode_orientations,
+                    extraction_mode=self.extraction_mode.value,
+                    space_width=self.plain_mode_space_width,
+                    layout_mode_space_vertically=self.layout_mode_space_vertically,
+                    layout_mode_scale_weight=self.layout_mode_scale_weight,
+                    layout_mode_strip_rotated=self.layout_mode_strip_rotated,
+                    layout_mode_font_height_weight=self.layout_mode_font_height_weight,
+                )
+            )
+        text = "\f".join(texts)
        return Document(content=text)

    @component.output_types(documents=List[Document])
--- a/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml
+++ b/releasenotes/notes/pypdf-add-customization-params-3da578deff7f83a5.yaml
@@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    Added new initialization parameters to the `PyPDFToDocument` component to customize the text extraction process
+    from PDF files.
--- a/test/components/converters/test_pypdf_to_document.py
+++ b/test/components/converters/test_pypdf_to_document.py
@@ -2,17 +2,17 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 import logging
-from unittest.mock import patch
+from unittest.mock import patch, Mock

 import pytest

 from haystack import Document, default_from_dict, default_to_dict
-from haystack.components.converters.pypdf import PyPDFToDocument
+from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
 from haystack.dataclasses import ByteStream


@pytest.fixture
-def pypdf_converter():
+def pypdf_component():
    return PyPDFToDocument()


@@ -30,38 +30,104 @@ class CustomConverter:


 class TestPyPDFToDocument:
-    def test_init(self, pypdf_converter):
-        assert pypdf_converter.converter is None
+    def test_init(self, pypdf_component):
+        assert pypdf_component.converter is None
+        assert pypdf_component.extraction_mode == PyPDFExtractionMode.PLAIN
+        assert pypdf_component.plain_mode_orientations == (0, 90, 180, 270)
+        assert pypdf_component.plain_mode_space_width == 200.0
+        assert pypdf_component.layout_mode_space_vertically is True
+        assert pypdf_component.layout_mode_scale_weight == 1.25
+        assert pypdf_component.layout_mode_strip_rotated is True
+        assert pypdf_component.layout_mode_font_height_weight == 1.0

-    def test_init_params(self):
-        pypdf_converter = PyPDFToDocument(converter=CustomConverter())
-        assert isinstance(pypdf_converter.converter, CustomConverter)
+    def test_init_converter(self):
+        pypdf_component = PyPDFToDocument(converter=CustomConverter())
+        assert isinstance(pypdf_component.converter, CustomConverter)

-    def test_to_dict(self, pypdf_converter):
-        data = pypdf_converter.to_dict()
+    def test_init_custom_params(self):
+        pypdf_component = PyPDFToDocument(
+            extraction_mode="layout",
+            plain_mode_orientations=(0, 90),
+            plain_mode_space_width=150.0,
+            layout_mode_space_vertically=False,
+            layout_mode_scale_weight=2.0,
+            layout_mode_strip_rotated=False,
+            layout_mode_font_height_weight=0.5,
+        )
+
+        assert pypdf_component.extraction_mode == PyPDFExtractionMode.LAYOUT
+        assert pypdf_component.plain_mode_orientations == (0, 90)
+        assert pypdf_component.plain_mode_space_width == 150.0
+        assert pypdf_component.layout_mode_space_vertically is False
+        assert pypdf_component.layout_mode_scale_weight == 2.0
+        assert pypdf_component.layout_mode_strip_rotated is False
+        assert pypdf_component.layout_mode_font_height_weight == 0.5
+
+    def test_init_invalid_extraction_mode(self):
+        with pytest.raises(ValueError):
+            PyPDFToDocument(extraction_mode="invalid")
+
+    def test_to_dict(self, pypdf_component):
+        data = pypdf_component.to_dict()
        assert data == {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
-            "init_parameters": {"converter": None},
+            "init_parameters": {
+                "converter": None,
+                "extraction_mode": "plain",
+                "plain_mode_orientations": (0, 90, 180, 270),
+                "plain_mode_space_width": 200.0,
+                "layout_mode_space_vertically": True,
+                "layout_mode_scale_weight": 1.25,
+                "layout_mode_strip_rotated": True,
+                "layout_mode_font_height_weight": 1.0,
+            },
        }

    def test_to_dict_custom_converter(self):
-        pypdf_converter = PyPDFToDocument(converter=CustomConverter())
-        data = pypdf_converter.to_dict()
+        pypdf_component = PyPDFToDocument(converter=CustomConverter())
+        data = pypdf_component.to_dict()
        assert data == {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
            "init_parameters": {
                "converter": {
                    "data": {"key": "value", "more": False},
                    "type": "converters.test_pypdf_to_document.CustomConverter",
-                }
+                },
+                "extraction_mode": "plain",
+                "plain_mode_orientations": (0, 90, 180, 270),
+                "plain_mode_space_width": 200.0,
+                "layout_mode_space_vertically": True,
+                "layout_mode_scale_weight": 1.25,
+                "layout_mode_strip_rotated": True,
+                "layout_mode_font_height_weight": 1.0,
            },
        }

    def test_from_dict(self):
-        data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {"converter": None}}
+        data = {
+            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
+            "init_parameters": {
+                "converter": None,
+                "extraction_mode": "plain",
+                "plain_mode_orientations": (0, 90, 180, 270),
+                "plain_mode_space_width": 200.0,
+                "layout_mode_space_vertically": True,
+                "layout_mode_scale_weight": 1.25,
+                "layout_mode_strip_rotated": True,
+                "layout_mode_font_height_weight": 1.0,
+            },
+        }
+
        instance = PyPDFToDocument.from_dict(data)
        assert isinstance(instance, PyPDFToDocument)
        assert instance.converter is None
+        assert instance.extraction_mode == PyPDFExtractionMode.PLAIN
+        assert instance.plain_mode_orientations == (0, 90, 180, 270)
+        assert instance.plain_mode_space_width == 200.0
+        assert instance.layout_mode_space_vertically is True
+        assert instance.layout_mode_scale_weight == 1.25
+        assert instance.layout_mode_strip_rotated is True
+        assert instance.layout_mode_font_height_weight == 1.0

    def test_from_dict_defaults(self):
        data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}
@@ -83,30 +149,63 @@ class TestPyPDFToDocument:
        assert isinstance(instance, PyPDFToDocument)
        assert isinstance(instance.converter, CustomConverter)

+    def test_default_convert(self):
+        mock_page1 = Mock()
+        mock_page2 = Mock()
+        mock_page1.extract_text.return_value = "Page 1 content"
+        mock_page2.extract_text.return_value = "Page 2 content"
+        mock_reader = Mock()
+        mock_reader.pages = [mock_page1, mock_page2]
+
+        converter = PyPDFToDocument(
+            extraction_mode="layout",
+            plain_mode_orientations=(0, 90),
+            plain_mode_space_width=150.0,
+            layout_mode_space_vertically=False,
+            layout_mode_scale_weight=2.0,
+            layout_mode_strip_rotated=False,
+            layout_mode_font_height_weight=1.5,
+        )
+
+        doc = converter._default_convert(mock_reader)
+        assert doc.content == "Page 1 content\fPage 2 content"
+
+        expected_params = {
+            "extraction_mode": "layout",
+            "orientations": (0, 90),
+            "space_width": 150.0,
+            "layout_mode_space_vertically": False,
+            "layout_mode_scale_weight": 2.0,
+            "layout_mode_strip_rotated": False,
+            "layout_mode_font_height_weight": 1.5,
+        }
+        for mock_page in mock_reader.pages:
+            mock_page.extract_text.assert_called_once_with(**expected_params)
+
    @pytest.mark.integration
-    def test_run(self, test_files_path, pypdf_converter):
+    def test_run(self, test_files_path, pypdf_component):
        """
        Test if the component runs correctly.
        """
        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
-        output = pypdf_converter.run(sources=paths)
+        output = pypdf_component.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content

    @pytest.mark.integration
-    def test_page_breaks_added(self, test_files_path, pypdf_converter):
+    def test_page_breaks_added(self, test_files_path, pypdf_component):
        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
-        output = pypdf_converter.run(sources=paths)
+        output = pypdf_component.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert docs[0].content.count("\f") == 3

-    def test_run_with_meta(self, test_files_path, pypdf_converter):
+    def test_run_with_meta(self, test_files_path, pypdf_component):
        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})

        with patch("haystack.components.converters.pypdf.PdfReader"):
-            output = pypdf_converter.run(
+            output = pypdf_component.run(
                sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"}
            )

@@ -115,17 +214,17 @@ class TestPyPDFToDocument:
        assert output["documents"][0].meta["language"] == "it"
        assert output["documents"][1].meta["language"] == "it"

-    def test_run_error_handling(self, test_files_path, pypdf_converter, caplog):
+    def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = ["non_existing_file.pdf"]
        with caplog.at_level(logging.WARNING):
-            pypdf_converter.run(sources=paths)
+            pypdf_component.run(sources=paths)
            assert "Could not read non_existing_file.pdf" in caplog.text

    @pytest.mark.integration
-    def test_mixed_sources_run(self, test_files_path, pypdf_converter):
+    def test_mixed_sources_run(self, test_files_path, pypdf_component):
        """
        Test if the component runs correctly when mixed sources are provided.
        """
@@ -133,7 +232,7 @@ class TestPyPDFToDocument:
        with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f:
            paths.append(ByteStream(f.read()))

-        output = pypdf_converter.run(sources=paths)
+        output = pypdf_component.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 2
        assert "History and standardization" in docs[0].content