feat: Add store_full_path to converters (3/3) (#8585)

* Add store_full_path params
2025-12-29 07:59:27 +00:00 · 2024-12-03 13:48:56 +05:00 · 2024-12-03 13:48:56 +05:00 · 4c8eb54049
commit 4c8eb54049
parent de7099e560
5 changed files with 82 additions and 2 deletions
--- a/haystack/components/converters/azure.py
+++ b/haystack/components/converters/azure.py
@ -4,6 +4,8 @@

 import copy
 import hashlib
+import os
+import warnings
 from collections import defaultdict
 from pathlib import Path
 from typing import Any, Dict, List, Literal, Optional, Union
@ -49,7 +51,7 @@ class AzureOCRDocumentConverter:
    ```
    """

-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        endpoint: str,
        api_key: Secret = Secret.from_env_var("AZURE_AI_API_KEY"),
@ -59,6 +61,7 @@ class AzureOCRDocumentConverter:
        merge_multiple_column_headers: bool = True,
        page_layout: Literal["natural", "single_column"] = "natural",
        threshold_y: Optional[float] = 0.05,
+        store_full_path: bool = True,
    ):
        """
        Creates an AzureOCRDocumentConverter component.
@ -83,6 +86,9 @@ class AzureOCRDocumentConverter:
            The threshold, in inches, to determine if two recognized PDF elements are grouped into a
            single line. This is crucial for section headers or numbers which may be spatially separated
            from the remaining text on the horizontal axis.
+        :param store_full_path:
+            If True, the full path of the file is stored in the metadata of the document.
+            If False, only the file name is stored.
        """
        azure_import.check()

@ -97,6 +103,7 @@ class AzureOCRDocumentConverter:
        self.merge_multiple_column_headers = merge_multiple_column_headers
        self.page_layout = page_layout
        self.threshold_y = threshold_y
+        self.store_full_path = store_full_path
        if self.page_layout == "single_column" and self.threshold_y is None:
            self.threshold_y = 0.05

@ -136,6 +143,15 @@ class AzureOCRDocumentConverter:
            azure_output.append(result.to_dict())

            merged_metadata = {**bytestream.meta, **metadata}
+            warnings.warn(
+                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
+                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
+                "storing only file names to improve privacy.",
+                DeprecationWarning,
+            )
+
+            if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
+                merged_metadata["file_path"] = os.path.basename(file_path)
            docs = self._convert_tables_and_text(result=result, meta=merged_metadata)
            documents.extend(docs)

@ -158,6 +174,7 @@ class AzureOCRDocumentConverter:
            merge_multiple_column_headers=self.merge_multiple_column_headers,
            page_layout=self.page_layout,
            threshold_y=self.threshold_y,
+            store_full_path=self.store_full_path,
        )

    @classmethod
--- a/haystack/components/converters/pypdf.py
+++ b/haystack/components/converters/pypdf.py
@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import io
+import os
 import warnings
 from enum import Enum
 from pathlib import Path
@ -99,6 +100,7 @@ class PyPDFToDocument:
        layout_mode_scale_weight: float = 1.25,
        layout_mode_strip_rotated: bool = True,
        layout_mode_font_height_weight: float = 1.0,
+        store_full_path: bool = True,
    ):
        """
        Create an PyPDFToDocument component.
@ -131,6 +133,9 @@ class PyPDFToDocument:
        :param layout_mode_font_height_weight:
            Multiplier for font height when calculating blank line height.
            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
+        :param store_full_path:
+            If True, the full path of the file is stored in the metadata of the document.
+            If False, only the file name is stored.
        """
        pypdf_import.check()

@ -142,6 +147,7 @@ class PyPDFToDocument:
            warnings.warn(msg, DeprecationWarning)

        self.converter = converter
+        self.store_full_path = store_full_path

        if isinstance(extraction_mode, str):
            extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)
@ -170,6 +176,7 @@ class PyPDFToDocument:
            layout_mode_scale_weight=self.layout_mode_scale_weight,
            layout_mode_strip_rotated=self.layout_mode_strip_rotated,
            layout_mode_font_height_weight=self.layout_mode_font_height_weight,
+            store_full_path=self.store_full_path,
        )

    @classmethod
@ -255,6 +262,14 @@ class PyPDFToDocument:
                )

            merged_metadata = {**bytestream.meta, **metadata}
+            warnings.warn(
+                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
+                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
+                "storing only file names to improve privacy.",
+                DeprecationWarning,
+            )
+            if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
+                merged_metadata["file_path"] = os.path.basename(file_path)
            document.meta = merged_metadata
            documents.append(document)

--- a/releasenotes/notes/add-store-full-path-param-to-converters-5bd32a7561abfe78.yaml
+++ b/releasenotes/notes/add-store-full-path-param-to-converters-5bd32a7561abfe78.yaml
@ -0,0 +1,8 @@
+---
+features:
+  - |
+    Added a new `store_full_path` parameter to the `__init__` methods of `PyPDFToDocument` and `AzureOCRDocumentConverter`. The default value is `True`, which stores full file path in the metadata of the output documents. When set to `False`, only the file name is stored.
+
+deprecations:
+  - |
+    The default value of the `store_full_path` parameter in `PyPDFToDocument` and `AzureOCRDocumentConverter` will change to `False` in Haysatck 2.9.0 to enhance privacy.
--- a/test/components/converters/test_azure_ocr_doc_converter.py
+++ b/test/components/converters/test_azure_ocr_doc_converter.py
@ -105,6 +105,7 @@ class TestAzureOCRDocumentConverter:
                "page_layout": "natural",
                "preceding_context_len": 3,
                "threshold_y": 0.05,
+                "store_full_path": True,
            },
        }

@ -278,6 +279,9 @@ class TestAzureOCRDocumentConverter:
    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT", None), reason="Azure endpoint not available")
    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY", None), reason="Azure credentials not available")
    def test_run_with_docx_file(self, test_files_path):
+        """
+        Test if the component runs correctly with store_full_path=False
+        """
        component = AzureOCRDocumentConverter(
            endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=Secret.from_env_var("CORE_AZURE_CS_API_KEY")
        )
@ -288,6 +292,21 @@ class TestAzureOCRDocumentConverter:
        assert "Now we are in Page 2" in documents[0].content
        assert "Page 3 was empty this is page 4" in documents[0].content

+    @pytest.mark.integration
+    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT", None), reason="Azure endpoint not available")
+    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY", None), reason="Azure credentials not available")
+    def test_run_with_store_full_path_false(self, test_files_path):
+        component = AzureOCRDocumentConverter(
+            endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"],
+            api_key=Secret.from_env_var("CORE_AZURE_CS_API_KEY"),
+            store_full_path=False,
+        )
+        output = component.run(sources=[test_files_path / "docx" / "sample_docx.docx"])
+        documents = output["documents"]
+        assert len(documents) == 1
+        assert "Sample Docx File" in documents[0].content
+        assert documents[0].meta["file_path"] == "sample_docx.docx"
+
    @patch("haystack.utils.auth.EnvVarSecret.resolve_value")
    def test_hashing_dataframe(self, mock_resolve_value):
        mock_resolve_value.return_value = "test_api_key"
--- a/test/components/converters/test_pypdf_to_document.py
+++ b/test/components/converters/test_pypdf_to_document.py
@ -80,11 +80,12 @@ class TestPyPDFToDocument:
                "layout_mode_scale_weight": 1.25,
                "layout_mode_strip_rotated": True,
                "layout_mode_font_height_weight": 1.0,
+                "store_full_path": True,
            },
        }

    def test_to_dict_custom_converter(self):
-        pypdf_component = PyPDFToDocument(converter=CustomConverter())
+        pypdf_component = PyPDFToDocument(converter=CustomConverter(), store_full_path=False)
        data = pypdf_component.to_dict()
        assert data == {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
@ -100,6 +101,7 @@ class TestPyPDFToDocument:
                "layout_mode_scale_weight": 1.25,
                "layout_mode_strip_rotated": True,
                "layout_mode_font_height_weight": 1.0,
+                "store_full_path": False,
            },
        }

@ -214,6 +216,25 @@ class TestPyPDFToDocument:
        assert output["documents"][0].meta["language"] == "it"
        assert output["documents"][1].meta["language"] == "it"

+    def test_run_with_store_full_path_false(self, test_files_path):
+        """
+        Test if the component runs correctly with store_full_path=False
+        """
+        sources = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
+        converter = PyPDFToDocument(store_full_path=True)
+        results = converter.run(sources=sources)
+        docs = results["documents"]
+
+        assert len(docs) == 1
+        assert docs[0].meta["file_path"] == str(sources[0])
+
+        converter = PyPDFToDocument(store_full_path=False)
+        results = converter.run(sources=sources)
+        docs = results["documents"]
+
+        assert len(docs) == 1
+        assert docs[0].meta["file_path"] == "sample_pdf_1.pdf"
+
    def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
        """
        Test if the component correctly handles errors.