From 71416c81bcda4bbe68ea6f2147d49888e37c7316 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Wed, 12 Feb 2025 17:11:59 +0100 Subject: [PATCH] feat: Add store_full_path to converter (#8849) * Add missing store_full_path to converter * Add release note * Fix pylint --- haystack/components/converters/xlsx.py | 12 ++++++++++++ ...re-full-path-xlsx-convter-535bcb48433f7717.yaml | 5 +++++ .../components/converters/test_xlsx_to_document.py | 14 +++++++------- 3 files changed, 24 insertions(+), 7 deletions(-) create mode 100644 releasenotes/notes/add-store-full-path-xlsx-convter-535bcb48433f7717.yaml diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py index db7dca8fe..e0c570c38 100644 --- a/haystack/components/converters/xlsx.py +++ b/haystack/components/converters/xlsx.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import io +import os from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Tuple, Union @@ -49,6 +50,8 @@ class XLSXToDocument: sheet_name: Union[str, int, List[Union[str, int]], None] = None, read_excel_kwargs: Optional[Dict[str, Any]] = None, table_format_kwargs: Optional[Dict[str, Any]] = None, + *, + store_full_path: bool = False, ): """ Creates a XLSXToDocument component. @@ -62,6 +65,9 @@ class XLSXToDocument: See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html#pandas-dataframe-to-csv - If `table_format` is "markdown", these arguments are passed to `pandas.DataFrame.to_markdown`. See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_markdown.html#pandas-dataframe-to-markdown + :param store_full_path: + If True, the full path of the file is stored in the metadata of the document. + If False, only the file name is stored. """ xlsx_import.check() self.table_format = table_format @@ -72,6 +78,7 @@ class XLSXToDocument: self.sheet_name = sheet_name self.read_excel_kwargs = read_excel_kwargs or {} self.table_format_kwargs = table_format_kwargs or {} + self.store_full_path = store_full_path @component.output_types(documents=List[Document]) def run( @@ -119,6 +126,11 @@ class XLSXToDocument: # Loop over tables and create a Document for each table for table, excel_metadata in zip(tables, tables_metadata): merged_metadata = {**bytestream.meta, **metadata, **excel_metadata} + + if not self.store_full_path and "file_path" in bytestream.meta: + file_path = bytestream.meta["file_path"] + merged_metadata["file_path"] = os.path.basename(file_path) + document = Document(content=table, meta=merged_metadata) documents.append(document) diff --git a/releasenotes/notes/add-store-full-path-xlsx-convter-535bcb48433f7717.yaml b/releasenotes/notes/add-store-full-path-xlsx-convter-535bcb48433f7717.yaml new file mode 100644 index 000000000..a69454c52 --- /dev/null +++ b/releasenotes/notes/add-store-full-path-xlsx-convter-535bcb48433f7717.yaml @@ -0,0 +1,5 @@ +--- +enhancements: + - | + Added the store_full_path init variable to XLSXToDocument to allow users to toggle whether to store the full path of the source file in the meta of the Document. + This is set to False by default to increase privacy. diff --git a/test/components/converters/test_xlsx_to_document.py b/test/components/converters/test_xlsx_to_document.py index 72964381b..bbe24cbc8 100644 --- a/test/components/converters/test_xlsx_to_document.py +++ b/test/components/converters/test_xlsx_to_document.py @@ -15,7 +15,7 @@ class TestXLSXToDocument: assert converter.table_format_kwargs == {} def test_run_basic_tables(self, test_files_path) -> None: - converter = XLSXToDocument() + converter = XLSXToDocument(store_full_path=True) paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"] results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"}) documents = results["documents"] @@ -34,7 +34,7 @@ class TestXLSXToDocument: } def test_run_table_empty_rows_and_columns(self, test_files_path) -> None: - converter = XLSXToDocument() + converter = XLSXToDocument(store_full_path=False) paths = [test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"] results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"}) documents = results["documents"] @@ -42,12 +42,12 @@ class TestXLSXToDocument: assert documents[0].content == ",A,B,C\n1,,,\n2,,,\n3,,,\n4,,col_a,col_b\n5,,1.5,test\n" assert documents[0].meta == { "date_added": "2022-01-01T00:00:00", - "file_path": str(test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"), + "file_path": "table_empty_rows_and_columns.xlsx", "xlsx": {"sheet_name": "Sheet1"}, } def test_run_multiple_tables_in_one_sheet(self, test_files_path) -> None: - converter = XLSXToDocument() + converter = XLSXToDocument(store_full_path=True) paths = [test_files_path / "xlsx" / "multiple_tables.xlsx"] results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"}) documents = results["documents"] @@ -63,7 +63,7 @@ class TestXLSXToDocument: } def test_run_markdown(self, test_files_path) -> None: - converter = XLSXToDocument(table_format="markdown") + converter = XLSXToDocument(table_format="markdown", store_full_path=True) paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"] results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"}) documents = results["documents"] @@ -99,7 +99,7 @@ class TestXLSXToDocument: def test_run_sheet_name( self, sheet_name: Union[int, str], expected_sheet_name: str, expected_content: str, test_files_path ) -> None: - converter = XLSXToDocument(sheet_name=sheet_name) + converter = XLSXToDocument(sheet_name=sheet_name, store_full_path=True) paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"] results = converter.run(sources=paths) documents = results["documents"] @@ -111,7 +111,7 @@ class TestXLSXToDocument: } def test_run_with_read_excel_kwargs(self, test_files_path) -> None: - converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1}) + converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1}, store_full_path=True) paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"] results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"}) documents = results["documents"]