mirror of https://github.com/deepset-ai/haystack.git, synced 2025-12-27 15:08:43 +00:00

feat: Add store_full_path to converters (1/3) (#8566)

* Add store_full_path param to 3 converters

This commit is contained in: parent 0b2a299378, commit 21906d0558
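The diff adds a `store_full_path` flag to `CSVToDocument`, `DOCXToDocument`, and `HTMLToDocument`. A minimal usage sketch of the new parameter (the sample path is a placeholder; the `meta["file_path"]` values follow the behaviour asserted in the tests below):

```python
# Sketch only: "data/reports/q3.csv" is a hypothetical file, not part of this commit.
from haystack.components.converters import CSVToDocument

# Current default (until 2.9.0): the full source path is kept in document metadata.
docs = CSVToDocument().run(sources=["data/reports/q3.csv"])["documents"]
print(docs[0].meta["file_path"])  # "data/reports/q3.csv"

# Opting in to the future default: only the file name is stored.
docs = CSVToDocument(store_full_path=False).run(sources=["data/reports/q3.csv"])["documents"]
print(docs[0].meta["file_path"])  # "q3.csv"
```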
@@ -3,6 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

@@ -34,7 +36,7 @@ class CSVToDocument:
    ```
    """

    def __init__(self, encoding: str = "utf-8"):
    def __init__(self, encoding: str = "utf-8", store_full_path: bool = True):
        """
        Creates a CSVToDocument component.

@@ -42,8 +44,12 @@ class CSVToDocument:
            The encoding of the csv files to convert.
            If the encoding is specified in the metadata of a source ByteStream,
            it overrides this value.
        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        self.encoding = encoding
        self.store_full_path = store_full_path

    @component.output_types(documents=List[Document])
    def run(
@@ -87,6 +93,19 @@ class CSVToDocument:
                continue

            merged_metadata = {**bytestream.meta, **metadata}

            warnings.warn(
                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
                "storing only file names to improve privacy.",
                DeprecationWarning,
            )

            if not self.store_full_path and "file_path" in bytestream.meta:
                file_path = bytestream.meta.get("file_path")
                if file_path:  # Ensure the value is not None for pylint
                    merged_metadata["file_path"] = os.path.basename(file_path)

            document = Document(content=data, meta=merged_metadata)
            documents.append(document)
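The privacy-relevant part of the hunk above is the `os.path.basename` call: with `store_full_path=False`, only the final path component is kept in `meta["file_path"]`. A self-contained sketch of that logic (the path is a made-up example):

```python
import os

meta = {"file_path": "/home/alice/projects/data/sample_1.csv", "key": "value"}

store_full_path = False
if not store_full_path and "file_path" in meta:
    file_path = meta.get("file_path")
    if file_path:  # same None-guard as in the converter
        meta["file_path"] = os.path.basename(file_path)

print(meta["file_path"])  # "sample_1.csv" -- the directory layout is no longer exposed
```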
@@ -4,6 +4,8 @@

import csv
import io
import os
import warnings
from dataclasses import dataclass
from enum import Enum
from io import StringIO
@@ -107,15 +109,19 @@ class DOCXToDocument:
    ```
    """

    def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV):
    def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = True):
        """
        Create a DOCXToDocument component.

        :param table_format: The format for table output. Can be either DOCXTableFormat.MARKDOWN,
            DOCXTableFormat.CSV, "markdown", or "csv". Defaults to DOCXTableFormat.CSV.
        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        docx_import.check()
        self.table_format = DOCXTableFormat.from_str(table_format) if isinstance(table_format, str) else table_format
        self.store_full_path = store_full_path

    def to_dict(self) -> Dict[str, Any]:
        """
@@ -124,7 +130,7 @@ class DOCXToDocument:
        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(self, table_format=str(self.table_format))
        return default_to_dict(self, table_format=str(self.table_format), store_full_path=self.store_full_path)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
@@ -158,7 +164,6 @@ class DOCXToDocument:
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
@@ -184,8 +189,21 @@ class DOCXToDocument:
                )
                continue

            warnings.warn(
                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
                "storing only file names to improve privacy.",
                DeprecationWarning,
            )

            docx_metadata = self._get_docx_metadata(document=docx_document)
            merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}

            if not self.store_full_path and "file_path" in bytestream.meta:
                file_path = bytestream.meta.get("file_path")
                if file_path:  # Ensure the value is not None for pylint
                    merged_metadata["file_path"] = os.path.basename(file_path)

            document = Document(content=text, meta=merged_metadata)
            documents.append(document)
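Because `to_dict` now serializes `store_full_path`, the setting survives a serialization round trip. A sketch of that round trip, assuming `python-docx` is installed so the component can be constructed (the expected dictionary shape is taken from the updated tests further down):

```python
from haystack.components.converters.docx import DOCXToDocument

converter = DOCXToDocument(store_full_path=False)
data = converter.to_dict()
# Expected shape, per the updated tests:
# {
#     "type": "haystack.components.converters.docx.DOCXToDocument",
#     "init_parameters": {"store_full_path": False, "table_format": "csv"},
# }

restored = DOCXToDocument.from_dict(data)
assert restored.store_full_path is False
```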
@@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

@@ -33,17 +35,21 @@ class HTMLToDocument:
    ```
    """

    def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None):
    def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = True):
        """
        Create an HTMLToDocument component.

        :param extraction_kwargs: A dictionary containing keyword arguments to customize the extraction process. These
            are passed to the underlying Trafilatura `extract` function. For the full list of available arguments, see
            the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/corefunctions.html#extract).
        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        trafilatura_import.check()

        self.extraction_kwargs = extraction_kwargs or {}
        self.store_full_path = store_full_path

    def to_dict(self) -> Dict[str, Any]:
        """
@@ -52,7 +58,7 @@ class HTMLToDocument:
        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(self, extraction_kwargs=self.extraction_kwargs)
        return default_to_dict(self, extraction_kwargs=self.extraction_kwargs, store_full_path=self.store_full_path)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument":
@@ -115,7 +121,20 @@ class HTMLToDocument:
                )
                continue

            document = Document(content=text, meta={**bytestream.meta, **metadata})
            merged_metadata = {**bytestream.meta, **metadata}

            warnings.warn(
                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
                "storing only file names to improve privacy.",
                DeprecationWarning,
            )
            if not self.store_full_path and "file_path" in bytestream.meta:
                file_path = bytestream.meta.get("file_path")
                if file_path:  # Ensure the value is not None for pylint
                    merged_metadata["file_path"] = os.path.basename(file_path)

            document = Document(content=text, meta=merged_metadata)
            documents.append(document)

        return {"documents": documents}
@@ -0,0 +1,8 @@
---
features:
  - |
    Added a new `store_full_path` parameter to the `__init__` method of `CSVToDocument`, `DOCXToDocument`, and `HTMLToDocument`. The default value is `True`, which stores the full file path in the metadata of the output documents. When set to `False`, only the file name is stored.

deprecations:
  - |
    The default value of the `store_full_path` parameter will be changed to `False` in Haystack 2.9.0 to enhance privacy.
@@ -39,6 +39,25 @@ class TestCSVToDocument:
        assert docs[1].meta["file_path"] == str(files[1])
        assert docs[2].meta["file_path"] == str(files[2])

    def test_run_with_store_full_path_false(self, test_files_path):
        """
        Test if the component runs correctly with store_full_path=False
        """
        bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["file_path"] = str(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["key"] = "value"
        files = [bytestream, test_files_path / "csv" / "sample_2.csv", test_files_path / "csv" / "sample_3.csv"]
        converter = CSVToDocument(store_full_path=False)
        output = converter.run(sources=files)
        docs = output["documents"]
        assert len(docs) == 3
        assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content
        assert isinstance(docs[0].content, str)
        assert docs[0].meta["file_path"] == "sample_1.csv"
        assert docs[0].meta["key"] == "value"
        assert docs[1].meta["file_path"] == "sample_2.csv"
        assert docs[2].meta["file_path"] == "sample_3.csv"

    def test_run_error_handling(self, test_files_path, caplog):
        """
        Test if the component correctly handles errors.
@@ -32,7 +32,7 @@ class TestDOCXToDocument:
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "csv"},
            "init_parameters": {"store_full_path": True, "table_format": "csv"},
        }

    def test_to_dict_custom_parameters(self):
@@ -40,28 +40,28 @@ class TestDOCXToDocument:
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "markdown"},
            "init_parameters": {"store_full_path": True, "table_format": "markdown"},
        }

        converter = DOCXToDocument(table_format="csv")
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "csv"},
            "init_parameters": {"store_full_path": True, "table_format": "csv"},
        }

        converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "markdown"},
            "init_parameters": {"store_full_path": True, "table_format": "markdown"},
        }

        converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "csv"},
            "init_parameters": {"store_full_path": True, "table_format": "csv"},
        }

    def test_from_dict(self):
@@ -179,6 +179,38 @@ class TestDOCXToDocument:
            "Now we are in Page 2" in part for part in content_parts[table_index + 1 :]
        ), "Text after table not found"

    def test_run_with_store_full_path_false(self, test_files_path):
        """
        Test if the component runs correctly with store_full_path=False
        """
        docx_converter = DOCXToDocument(store_full_path=False)
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content
        assert docs[0].meta.keys() == {"file_path", "docx"}
        assert docs[0].meta == {
            "file_path": "sample_docx_1.docx",
            "docx": DOCXMetadata(
                author="Microsoft Office User",
                category="",
                comments="",
                content_status="",
                created="2024-06-09T21:17:00+00:00",
                identifier="",
                keywords="",
                language="",
                last_modified_by="Carlos Fernández Lorán",
                last_printed=None,
                modified="2024-06-09T21:27:00+00:00",
                revision=2,
                subject="",
                title="",
                version="",
            ),
        }

    @pytest.mark.parametrize("table_format", ["markdown", "csv"])
    def test_table_between_two_paragraphs(self, test_files_path, table_format):
        docx_converter = DOCXToDocument(table_format=table_format)
@@ -38,6 +38,28 @@ class TestHTMLToDocument:
        assert "Haystack" in docs[0].content
        assert docs[0].meta["file_name"] == "what_is_haystack.html"

    def test_run_with_store_full_path(self, test_files_path):
        """
        Test that the full file path is stored by default, and only the file name when store_full_path=False.
        """
        converter = HTMLToDocument()
        sources = [test_files_path / "html" / "what_is_haystack.html"]

        results = converter.run(sources=sources)  # store_full_path is True by default
        docs = results["documents"]

        assert len(docs) == 1
        assert "Haystack" in docs[0].content
        assert docs[0].meta["file_path"] == str(sources[0])

        converter_2 = HTMLToDocument(store_full_path=False)
        results = converter_2.run(sources=sources)
        docs = results["documents"]

        assert len(docs) == 1
        assert "Haystack" in docs[0].content
        assert docs[0].meta["file_path"] == "what_is_haystack.html"

    def test_incorrect_meta(self, test_files_path):
        """
        Test if the component raises an error when incorrect metadata is supplied by the user.