feat: Add store_full_path to converter (#8849)

* Add missing store_full_path to converter
* Add release note
* Fix pylint
2026-01-08 13:06:29 +00:00 · 2025-02-12 17:11:59 +01:00 · 2025-02-12 17:11:59 +01:00 · 71416c81bc
commit 71416c81bc
parent 043b88f181
3 changed files with 24 additions and 7 deletions
--- a/haystack/components/converters/xlsx.py
+++ b/haystack/components/converters/xlsx.py
@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import io
+import os
 from pathlib import Path
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union

@ -49,6 +50,8 @@ class XLSXToDocument:
        sheet_name: Union[str, int, List[Union[str, int]], None] = None,
        read_excel_kwargs: Optional[Dict[str, Any]] = None,
        table_format_kwargs: Optional[Dict[str, Any]] = None,
+        *,
+        store_full_path: bool = False,
    ):
        """
        Creates a XLSXToDocument component.
@ -62,6 +65,9 @@ class XLSXToDocument:
              See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html#pandas-dataframe-to-csv
            - If `table_format` is "markdown", these arguments are passed to `pandas.DataFrame.to_markdown`.
              See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_markdown.html#pandas-dataframe-to-markdown
+        :param store_full_path:
+            If True, the full path of the file is stored in the metadata of the document.
+            If False, only the file name is stored.
        """
        xlsx_import.check()
        self.table_format = table_format
@ -72,6 +78,7 @@ class XLSXToDocument:
        self.sheet_name = sheet_name
        self.read_excel_kwargs = read_excel_kwargs or {}
        self.table_format_kwargs = table_format_kwargs or {}
+        self.store_full_path = store_full_path

    @component.output_types(documents=List[Document])
    def run(
@ -119,6 +126,11 @@ class XLSXToDocument:
            # Loop over tables and create a Document for each table
            for table, excel_metadata in zip(tables, tables_metadata):
                merged_metadata = {**bytestream.meta, **metadata, **excel_metadata}
+
+                if not self.store_full_path and "file_path" in bytestream.meta:
+                    file_path = bytestream.meta["file_path"]
+                    merged_metadata["file_path"] = os.path.basename(file_path)
+
                document = Document(content=table, meta=merged_metadata)
                documents.append(document)

--- a/releasenotes/notes/add-store-full-path-xlsx-convter-535bcb48433f7717.yaml
+++ b/releasenotes/notes/add-store-full-path-xlsx-convter-535bcb48433f7717.yaml
@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    Added the `store_full_path` init parameter to `XLSXToDocument` to allow users to toggle whether the full path of the source file is stored in the meta of the Document.
+    This is set to `False` by default to increase privacy.
--- a/test/components/converters/test_xlsx_to_document.py
+++ b/test/components/converters/test_xlsx_to_document.py
@ -15,7 +15,7 @@ class TestXLSXToDocument:
        assert converter.table_format_kwargs == {}

    def test_run_basic_tables(self, test_files_path) -> None:
-        converter = XLSXToDocument()
+        converter = XLSXToDocument(store_full_path=True)
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
@ -34,7 +34,7 @@ class TestXLSXToDocument:
        }

    def test_run_table_empty_rows_and_columns(self, test_files_path) -> None:
-        converter = XLSXToDocument()
+        converter = XLSXToDocument(store_full_path=False)
        paths = [test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
@ -42,12 +42,12 @@ class TestXLSXToDocument:
        assert documents[0].content == ",A,B,C\n1,,,\n2,,,\n3,,,\n4,,col_a,col_b\n5,,1.5,test\n"
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
-            "file_path": str(test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"),
+            "file_path": "table_empty_rows_and_columns.xlsx",
            "xlsx": {"sheet_name": "Sheet1"},
        }

    def test_run_multiple_tables_in_one_sheet(self, test_files_path) -> None:
-        converter = XLSXToDocument()
+        converter = XLSXToDocument(store_full_path=True)
        paths = [test_files_path / "xlsx" / "multiple_tables.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
@ -63,7 +63,7 @@ class TestXLSXToDocument:
        }

    def test_run_markdown(self, test_files_path) -> None:
-        converter = XLSXToDocument(table_format="markdown")
+        converter = XLSXToDocument(table_format="markdown", store_full_path=True)
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
@ -99,7 +99,7 @@ class TestXLSXToDocument:
    def test_run_sheet_name(
        self, sheet_name: Union[int, str], expected_sheet_name: str, expected_content: str, test_files_path
    ) -> None:
-        converter = XLSXToDocument(sheet_name=sheet_name)
+        converter = XLSXToDocument(sheet_name=sheet_name, store_full_path=True)
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths)
        documents = results["documents"]
@ -111,7 +111,7 @@ class TestXLSXToDocument:
        }

    def test_run_with_read_excel_kwargs(self, test_files_path) -> None:
-        converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1})
+        converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1}, store_full_path=True)
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]