mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-08 13:06:29 +00:00
feat: Add store_full_path to converter (#8849)
* Add missing store_full_path to converter * Add release note * Fix pylint
This commit is contained in:
parent
043b88f181
commit
71416c81bc
@ -3,6 +3,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import io
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
|
||||
|
||||
@ -49,6 +50,8 @@ class XLSXToDocument:
|
||||
sheet_name: Union[str, int, List[Union[str, int]], None] = None,
|
||||
read_excel_kwargs: Optional[Dict[str, Any]] = None,
|
||||
table_format_kwargs: Optional[Dict[str, Any]] = None,
|
||||
*,
|
||||
store_full_path: bool = False,
|
||||
):
|
||||
"""
|
||||
Creates a XLSXToDocument component.
|
||||
@ -62,6 +65,9 @@ class XLSXToDocument:
|
||||
See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html#pandas-dataframe-to-csv
|
||||
- If `table_format` is "markdown", these arguments are passed to `pandas.DataFrame.to_markdown`.
|
||||
See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_markdown.html#pandas-dataframe-to-markdown
|
||||
:param store_full_path:
|
||||
If True, the full path of the file is stored in the metadata of the document.
|
||||
If False, only the file name is stored.
|
||||
"""
|
||||
xlsx_import.check()
|
||||
self.table_format = table_format
|
||||
@ -72,6 +78,7 @@ class XLSXToDocument:
|
||||
self.sheet_name = sheet_name
|
||||
self.read_excel_kwargs = read_excel_kwargs or {}
|
||||
self.table_format_kwargs = table_format_kwargs or {}
|
||||
self.store_full_path = store_full_path
|
||||
|
||||
@component.output_types(documents=List[Document])
|
||||
def run(
|
||||
@ -119,6 +126,11 @@ class XLSXToDocument:
|
||||
# Loop over tables and create a Document for each table
|
||||
for table, excel_metadata in zip(tables, tables_metadata):
|
||||
merged_metadata = {**bytestream.meta, **metadata, **excel_metadata}
|
||||
|
||||
if not self.store_full_path and "file_path" in bytestream.meta:
|
||||
file_path = bytestream.meta["file_path"]
|
||||
merged_metadata["file_path"] = os.path.basename(file_path)
|
||||
|
||||
document = Document(content=table, meta=merged_metadata)
|
||||
documents.append(document)
|
||||
|
||||
|
||||
@ -0,0 +1,5 @@
|
||||
---
|
||||
enhancements:
|
||||
- |
|
||||
Added the store_full_path init parameter to XLSXToDocument, which lets users choose whether the full path of the source file is stored in the meta of the Document.
|
||||
It defaults to False, in which case only the file name is stored, to increase privacy.
|
||||
@ -15,7 +15,7 @@ class TestXLSXToDocument:
|
||||
assert converter.table_format_kwargs == {}
|
||||
|
||||
def test_run_basic_tables(self, test_files_path) -> None:
|
||||
converter = XLSXToDocument()
|
||||
converter = XLSXToDocument(store_full_path=True)
|
||||
paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
|
||||
results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
|
||||
documents = results["documents"]
|
||||
@ -34,7 +34,7 @@ class TestXLSXToDocument:
|
||||
}
|
||||
|
||||
def test_run_table_empty_rows_and_columns(self, test_files_path) -> None:
|
||||
converter = XLSXToDocument()
|
||||
converter = XLSXToDocument(store_full_path=False)
|
||||
paths = [test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"]
|
||||
results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
|
||||
documents = results["documents"]
|
||||
@ -42,12 +42,12 @@ class TestXLSXToDocument:
|
||||
assert documents[0].content == ",A,B,C\n1,,,\n2,,,\n3,,,\n4,,col_a,col_b\n5,,1.5,test\n"
|
||||
assert documents[0].meta == {
|
||||
"date_added": "2022-01-01T00:00:00",
|
||||
"file_path": str(test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"),
|
||||
"file_path": "table_empty_rows_and_columns.xlsx",
|
||||
"xlsx": {"sheet_name": "Sheet1"},
|
||||
}
|
||||
|
||||
def test_run_multiple_tables_in_one_sheet(self, test_files_path) -> None:
|
||||
converter = XLSXToDocument()
|
||||
converter = XLSXToDocument(store_full_path=True)
|
||||
paths = [test_files_path / "xlsx" / "multiple_tables.xlsx"]
|
||||
results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
|
||||
documents = results["documents"]
|
||||
@ -63,7 +63,7 @@ class TestXLSXToDocument:
|
||||
}
|
||||
|
||||
def test_run_markdown(self, test_files_path) -> None:
|
||||
converter = XLSXToDocument(table_format="markdown")
|
||||
converter = XLSXToDocument(table_format="markdown", store_full_path=True)
|
||||
paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
|
||||
results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
|
||||
documents = results["documents"]
|
||||
@ -99,7 +99,7 @@ class TestXLSXToDocument:
|
||||
def test_run_sheet_name(
|
||||
self, sheet_name: Union[int, str], expected_sheet_name: str, expected_content: str, test_files_path
|
||||
) -> None:
|
||||
converter = XLSXToDocument(sheet_name=sheet_name)
|
||||
converter = XLSXToDocument(sheet_name=sheet_name, store_full_path=True)
|
||||
paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
|
||||
results = converter.run(sources=paths)
|
||||
documents = results["documents"]
|
||||
@ -111,7 +111,7 @@ class TestXLSXToDocument:
|
||||
}
|
||||
|
||||
def test_run_with_read_excel_kwargs(self, test_files_path) -> None:
|
||||
converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1})
|
||||
converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1}, store_full_path=True)
|
||||
paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
|
||||
results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
|
||||
documents = results["documents"]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user