feat: Add store_full_path to converter (#8849)

* Add missing store_full_path to converter

* Add release note

* Fix pylint
This commit is contained in:
Sebastian Husch Lee 2025-02-12 17:11:59 +01:00 committed by GitHub
parent 043b88f181
commit 71416c81bc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 24 additions and 7 deletions

View File

@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
import io
import os
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
@ -49,6 +50,8 @@ class XLSXToDocument:
sheet_name: Union[str, int, List[Union[str, int]], None] = None,
read_excel_kwargs: Optional[Dict[str, Any]] = None,
table_format_kwargs: Optional[Dict[str, Any]] = None,
*,
store_full_path: bool = False,
):
"""
Creates a XLSXToDocument component.
@ -62,6 +65,9 @@ class XLSXToDocument:
See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html#pandas-dataframe-to-csv
- If `table_format` is "markdown", these arguments are passed to `pandas.DataFrame.to_markdown`.
See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_markdown.html#pandas-dataframe-to-markdown
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""
xlsx_import.check()
self.table_format = table_format
@ -72,6 +78,7 @@ class XLSXToDocument:
self.sheet_name = sheet_name
self.read_excel_kwargs = read_excel_kwargs or {}
self.table_format_kwargs = table_format_kwargs or {}
self.store_full_path = store_full_path
@component.output_types(documents=List[Document])
def run(
@ -119,6 +126,11 @@ class XLSXToDocument:
# Loop over tables and create a Document for each table
for table, excel_metadata in zip(tables, tables_metadata):
merged_metadata = {**bytestream.meta, **metadata, **excel_metadata}
if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta["file_path"]
merged_metadata["file_path"] = os.path.basename(file_path)
document = Document(content=table, meta=merged_metadata)
documents.append(document)

View File

@ -0,0 +1,5 @@
---
enhancements:
- |
Added the `store_full_path` init parameter to `XLSXToDocument`. It lets users choose whether the full path of the source file or only its file name is stored in the `meta` of the Document.
It defaults to `False`, so only the file name is stored, to increase privacy.

View File

@ -15,7 +15,7 @@ class TestXLSXToDocument:
assert converter.table_format_kwargs == {}
def test_run_basic_tables(self, test_files_path) -> None:
converter = XLSXToDocument()
converter = XLSXToDocument(store_full_path=True)
paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
documents = results["documents"]
@ -34,7 +34,7 @@ class TestXLSXToDocument:
}
def test_run_table_empty_rows_and_columns(self, test_files_path) -> None:
converter = XLSXToDocument()
converter = XLSXToDocument(store_full_path=False)
paths = [test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"]
results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
documents = results["documents"]
@ -42,12 +42,12 @@ class TestXLSXToDocument:
assert documents[0].content == ",A,B,C\n1,,,\n2,,,\n3,,,\n4,,col_a,col_b\n5,,1.5,test\n"
assert documents[0].meta == {
"date_added": "2022-01-01T00:00:00",
"file_path": str(test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"),
"file_path": "table_empty_rows_and_columns.xlsx",
"xlsx": {"sheet_name": "Sheet1"},
}
def test_run_multiple_tables_in_one_sheet(self, test_files_path) -> None:
converter = XLSXToDocument()
converter = XLSXToDocument(store_full_path=True)
paths = [test_files_path / "xlsx" / "multiple_tables.xlsx"]
results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
documents = results["documents"]
@ -63,7 +63,7 @@ class TestXLSXToDocument:
}
def test_run_markdown(self, test_files_path) -> None:
converter = XLSXToDocument(table_format="markdown")
converter = XLSXToDocument(table_format="markdown", store_full_path=True)
paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
documents = results["documents"]
@ -99,7 +99,7 @@ class TestXLSXToDocument:
def test_run_sheet_name(
self, sheet_name: Union[int, str], expected_sheet_name: str, expected_content: str, test_files_path
) -> None:
converter = XLSXToDocument(sheet_name=sheet_name)
converter = XLSXToDocument(sheet_name=sheet_name, store_full_path=True)
paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
results = converter.run(sources=paths)
documents = results["documents"]
@ -111,7 +111,7 @@ class TestXLSXToDocument:
}
def test_run_with_read_excel_kwargs(self, test_files_path) -> None:
converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1})
converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1}, store_full_path=True)
paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
documents = results["documents"]