From 71416c81bcda4bbe68ea6f2147d49888e37c7316 Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee <sjrl@users.noreply.github.com>
Date: Wed, 12 Feb 2025 17:11:59 +0100
Subject: [PATCH] feat: Add store_full_path to converter (#8849)

* Add missing store_full_path to converter

* Add release note

* Fix pylint
---
 haystack/components/converters/xlsx.py             | 12 ++++++++++++
 ...re-full-path-xlsx-convter-535bcb48433f7717.yaml |  5 +++++
 .../components/converters/test_xlsx_to_document.py | 14 +++++++-------
 3 files changed, 24 insertions(+), 7 deletions(-)
 create mode 100644 releasenotes/notes/add-store-full-path-xlsx-convter-535bcb48433f7717.yaml

diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py
index db7dca8fe..e0c570c38 100644
--- a/haystack/components/converters/xlsx.py
+++ b/haystack/components/converters/xlsx.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import io
+import os
 from pathlib import Path
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 
@@ -49,6 +50,8 @@ class XLSXToDocument:
         sheet_name: Union[str, int, List[Union[str, int]], None] = None,
         read_excel_kwargs: Optional[Dict[str, Any]] = None,
         table_format_kwargs: Optional[Dict[str, Any]] = None,
+        *,
+        store_full_path: bool = False,
     ):
         """
         Creates a XLSXToDocument component.
@@ -62,6 +65,9 @@ class XLSXToDocument:
               See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html#pandas-dataframe-to-csv
             - If `table_format` is "markdown", these arguments are passed to `pandas.DataFrame.to_markdown`.
               See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_markdown.html#pandas-dataframe-to-markdown
+        :param store_full_path:
+            If True, the full path of the file is stored in the `file_path` metadata of the document.
+            If False, only the file name (the base name of that path) is stored.
         """
         xlsx_import.check()
         self.table_format = table_format
@@ -72,6 +78,7 @@ class XLSXToDocument:
         self.sheet_name = sheet_name
         self.read_excel_kwargs = read_excel_kwargs or {}
         self.table_format_kwargs = table_format_kwargs or {}
+        self.store_full_path = store_full_path
 
     @component.output_types(documents=List[Document])
     def run(
@@ -119,6 +126,11 @@ class XLSXToDocument:
             # Loop over tables and create a Document for each table
             for table, excel_metadata in zip(tables, tables_metadata):
                 merged_metadata = {**bytestream.meta, **metadata, **excel_metadata}
+
+                if not self.store_full_path and "file_path" in bytestream.meta:
+                    file_path = bytestream.meta["file_path"]
+                    merged_metadata["file_path"] = os.path.basename(file_path)
+
                 document = Document(content=table, meta=merged_metadata)
                 documents.append(document)
 
diff --git a/releasenotes/notes/add-store-full-path-xlsx-convter-535bcb48433f7717.yaml b/releasenotes/notes/add-store-full-path-xlsx-convter-535bcb48433f7717.yaml
new file mode 100644
index 000000000..a69454c52
--- /dev/null
+++ b/releasenotes/notes/add-store-full-path-xlsx-convter-535bcb48433f7717.yaml
@@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    Added the `store_full_path` init parameter to `XLSXToDocument`, which lets users choose whether the full path of the source file is stored in the meta of the Document.
+    It defaults to `False`, so only the file name is stored, to increase privacy.
diff --git a/test/components/converters/test_xlsx_to_document.py b/test/components/converters/test_xlsx_to_document.py
index 72964381b..bbe24cbc8 100644
--- a/test/components/converters/test_xlsx_to_document.py
+++ b/test/components/converters/test_xlsx_to_document.py
@@ -15,7 +15,7 @@ class TestXLSXToDocument:
         assert converter.table_format_kwargs == {}
 
     def test_run_basic_tables(self, test_files_path) -> None:
-        converter = XLSXToDocument()
+        converter = XLSXToDocument(store_full_path=True)
         paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
         results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
         documents = results["documents"]
@@ -34,7 +34,7 @@ class TestXLSXToDocument:
         }
 
     def test_run_table_empty_rows_and_columns(self, test_files_path) -> None:
-        converter = XLSXToDocument()
+        converter = XLSXToDocument(store_full_path=False)
         paths = [test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"]
         results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
         documents = results["documents"]
@@ -42,12 +42,12 @@ class TestXLSXToDocument:
         assert documents[0].content == ",A,B,C\n1,,,\n2,,,\n3,,,\n4,,col_a,col_b\n5,,1.5,test\n"
         assert documents[0].meta == {
             "date_added": "2022-01-01T00:00:00",
-            "file_path": str(test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"),
+            "file_path": "table_empty_rows_and_columns.xlsx",
             "xlsx": {"sheet_name": "Sheet1"},
         }
 
     def test_run_multiple_tables_in_one_sheet(self, test_files_path) -> None:
-        converter = XLSXToDocument()
+        converter = XLSXToDocument(store_full_path=True)
         paths = [test_files_path / "xlsx" / "multiple_tables.xlsx"]
         results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
         documents = results["documents"]
@@ -63,7 +63,7 @@ class TestXLSXToDocument:
         }
 
     def test_run_markdown(self, test_files_path) -> None:
-        converter = XLSXToDocument(table_format="markdown")
+        converter = XLSXToDocument(table_format="markdown", store_full_path=True)
         paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
         results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
         documents = results["documents"]
@@ -99,7 +99,7 @@ class TestXLSXToDocument:
     def test_run_sheet_name(
         self, sheet_name: Union[int, str], expected_sheet_name: str, expected_content: str, test_files_path
     ) -> None:
-        converter = XLSXToDocument(sheet_name=sheet_name)
+        converter = XLSXToDocument(sheet_name=sheet_name, store_full_path=True)
         paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
         results = converter.run(sources=paths)
         documents = results["documents"]
@@ -111,7 +111,7 @@ class TestXLSXToDocument:
         }
 
     def test_run_with_read_excel_kwargs(self, test_files_path) -> None:
-        converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1})
+        converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1}, store_full_path=True)
         paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
         results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
         documents = results["documents"]