mirror of https://github.com/deepset-ai/haystack.git, synced 2025-12-27 15:08:43 +00:00

feat: Add store_full_path to converters (1/3) (#8566)

* Add store_full_path param to 3 converters

This commit is contained in: parent 0b2a299378, commit 21906d0558
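The diff adds a `store_full_path` flag to `CSVToDocument`, `DOCXToDocument`, and `HTMLToDocument`. A minimal usage sketch of the new parameter (the sample path is a placeholder; the `meta["file_path"]` values follow the behaviour asserted in the tests below):

```python
# Sketch only: "data/reports/q3.csv" is a hypothetical file, not part of this commit.
from haystack.components.converters import CSVToDocument

# Current default (until 2.9.0): the full source path is kept in document metadata.
docs = CSVToDocument().run(sources=["data/reports/q3.csv"])["documents"]
print(docs[0].meta["file_path"])  # "data/reports/q3.csv"

# Opting in to the future default: only the file name is stored.
docs = CSVToDocument(store_full_path=False).run(sources=["data/reports/q3.csv"])["documents"]
print(docs[0].meta["file_path"])  # "q3.csv"
```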
@@ -3,6 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

@@ -34,7 +36,7 @@ class CSVToDocument:
    ```
    """

    def __init__(self, encoding: str = "utf-8"):
    def __init__(self, encoding: str = "utf-8", store_full_path: bool = True):
        """
        Creates a CSVToDocument component.

@@ -42,8 +44,12 @@ class CSVToDocument:
            The encoding of the csv files to convert.
            If the encoding is specified in the metadata of a source ByteStream,
            it overrides this value.
        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        self.encoding = encoding
        self.store_full_path = store_full_path

    @component.output_types(documents=List[Document])
    def run(
@@ -87,6 +93,19 @@ class CSVToDocument:
                continue

            merged_metadata = {**bytestream.meta, **metadata}

            warnings.warn(
                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
                "storing only file names to improve privacy.",
                DeprecationWarning,
            )

            if not self.store_full_path and "file_path" in bytestream.meta:
                file_path = bytestream.meta.get("file_path")
                if file_path:  # Ensure the value is not None for pylint
                    merged_metadata["file_path"] = os.path.basename(file_path)

            document = Document(content=data, meta=merged_metadata)
            documents.append(document)
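The privacy-relevant part of the hunk above is the `os.path.basename` call: with `store_full_path=False`, only the final path component is kept in `meta["file_path"]`. A self-contained sketch of that logic (the path is a made-up example):

```python
import os

meta = {"file_path": "/home/alice/projects/data/sample_1.csv", "key": "value"}

store_full_path = False
if not store_full_path and "file_path" in meta:
    file_path = meta.get("file_path")
    if file_path:  # same None-guard as in the converter
        meta["file_path"] = os.path.basename(file_path)

print(meta["file_path"])  # "sample_1.csv" -- the directory layout is no longer exposed
```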
@@ -4,6 +4,8 @@

import csv
import io
import os
import warnings
from dataclasses import dataclass
from enum import Enum
from io import StringIO
@@ -107,15 +109,19 @@ class DOCXToDocument:
    ```
    """

    def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV):
    def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = True):
        """
        Create a DOCXToDocument component.

        :param table_format: The format for table output. Can be either DOCXTableFormat.MARKDOWN,
            DOCXTableFormat.CSV, "markdown", or "csv". Defaults to DOCXTableFormat.CSV.
        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        docx_import.check()
        self.table_format = DOCXTableFormat.from_str(table_format) if isinstance(table_format, str) else table_format
        self.store_full_path = store_full_path

    def to_dict(self) -> Dict[str, Any]:
        """
@@ -124,7 +130,7 @@ class DOCXToDocument:
        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(self, table_format=str(self.table_format))
        return default_to_dict(self, table_format=str(self.table_format), store_full_path=self.store_full_path)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
@@ -158,7 +164,6 @@ class DOCXToDocument:
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
@@ -184,8 +189,21 @@ class DOCXToDocument:
                )
                continue

            warnings.warn(
                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
                "storing only file names to improve privacy.",
                DeprecationWarning,
            )

            docx_metadata = self._get_docx_metadata(document=docx_document)
            merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}

            if not self.store_full_path and "file_path" in bytestream.meta:
                file_path = bytestream.meta.get("file_path")
                if file_path:  # Ensure the value is not None for pylint
                    merged_metadata["file_path"] = os.path.basename(file_path)

            document = Document(content=text, meta=merged_metadata)
            documents.append(document)
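Because `to_dict` now serializes `store_full_path`, the setting survives a serialization round trip. A sketch of that round trip, assuming `python-docx` is installed so the component can be constructed (the expected dictionary shape is taken from the updated tests further down):

```python
from haystack.components.converters.docx import DOCXToDocument

converter = DOCXToDocument(store_full_path=False)
data = converter.to_dict()
# Expected shape, per the updated tests:
# {
#     "type": "haystack.components.converters.docx.DOCXToDocument",
#     "init_parameters": {"store_full_path": False, "table_format": "csv"},
# }

restored = DOCXToDocument.from_dict(data)
assert restored.store_full_path is False
```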
@@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

@@ -33,17 +35,21 @@ class HTMLToDocument:
    ```
    """

    def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None):
    def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = True):
        """
        Create an HTMLToDocument component.

        :param extraction_kwargs: A dictionary containing keyword arguments to customize the extraction process. These
            are passed to the underlying Trafilatura `extract` function. For the full list of available arguments, see
            the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/corefunctions.html#extract).
        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        trafilatura_import.check()

        self.extraction_kwargs = extraction_kwargs or {}
        self.store_full_path = store_full_path

    def to_dict(self) -> Dict[str, Any]:
        """
@@ -52,7 +58,7 @@ class HTMLToDocument:
        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(self, extraction_kwargs=self.extraction_kwargs)
        return default_to_dict(self, extraction_kwargs=self.extraction_kwargs, store_full_path=self.store_full_path)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument":
@@ -115,7 +121,20 @@ class HTMLToDocument:
                )
                continue

            document = Document(content=text, meta={**bytestream.meta, **metadata})
            merged_metadata = {**bytestream.meta, **metadata}

            warnings.warn(
                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
                "storing only file names to improve privacy.",
                DeprecationWarning,
            )
            if not self.store_full_path and "file_path" in bytestream.meta:
                file_path = bytestream.meta.get("file_path")
                if file_path:  # Ensure the value is not None for pylint
                    merged_metadata["file_path"] = os.path.basename(file_path)

            document = Document(content=text, meta=merged_metadata)
            documents.append(document)

        return {"documents": documents}
@@ -0,0 +1,8 @@
---
features:
  - |
    Added a new `store_full_path` parameter to the `__init__` method of `CSVToDocument`, `DOCXToDocument`, and `HTMLToDocument`. The default value is `True`, which stores the full file path in the metadata of the output documents. When set to `False`, only the file name is stored.

deprecations:
  - |
    The default value of the `store_full_path` parameter will be changed to `False` in Haystack 2.9.0 to enhance privacy.
@@ -39,6 +39,25 @@ class TestCSVToDocument:
        assert docs[1].meta["file_path"] == str(files[1])
        assert docs[2].meta["file_path"] == str(files[2])

    def test_run_with_store_full_path_false(self, test_files_path):
        """
        Test if the component runs correctly with store_full_path=False
        """
        bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["file_path"] = str(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["key"] = "value"
        files = [bytestream, test_files_path / "csv" / "sample_2.csv", test_files_path / "csv" / "sample_3.csv"]
        converter = CSVToDocument(store_full_path=False)
        output = converter.run(sources=files)
        docs = output["documents"]
        assert len(docs) == 3
        assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content
        assert isinstance(docs[0].content, str)
        assert docs[0].meta["file_path"] == "sample_1.csv"
        assert docs[0].meta["key"] == "value"
        assert docs[1].meta["file_path"] == "sample_2.csv"
        assert docs[2].meta["file_path"] == "sample_3.csv"

    def test_run_error_handling(self, test_files_path, caplog):
        """
        Test if the component correctly handles errors.
@@ -32,7 +32,7 @@ class TestDOCXToDocument:
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "csv"},
            "init_parameters": {"store_full_path": True, "table_format": "csv"},
        }

    def test_to_dict_custom_parameters(self):
@@ -40,28 +40,28 @@ class TestDOCXToDocument:
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "markdown"},
            "init_parameters": {"store_full_path": True, "table_format": "markdown"},
        }

        converter = DOCXToDocument(table_format="csv")
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "csv"},
            "init_parameters": {"store_full_path": True, "table_format": "csv"},
        }

        converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "markdown"},
            "init_parameters": {"store_full_path": True, "table_format": "markdown"},
        }

        converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "csv"},
            "init_parameters": {"store_full_path": True, "table_format": "csv"},
        }

    def test_from_dict(self):
@@ -179,6 +179,38 @@ class TestDOCXToDocument:
            "Now we are in Page 2" in part for part in content_parts[table_index + 1 :]
        ), "Text after table not found"

    def test_run_with_store_full_path_false(self, test_files_path):
        """
        Test if the component runs correctly with store_full_path=False
        """
        docx_converter = DOCXToDocument(store_full_path=False)
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content
        assert docs[0].meta.keys() == {"file_path", "docx"}
        assert docs[0].meta == {
            "file_path": "sample_docx_1.docx",
            "docx": DOCXMetadata(
                author="Microsoft Office User",
                category="",
                comments="",
                content_status="",
                created="2024-06-09T21:17:00+00:00",
                identifier="",
                keywords="",
                language="",
                last_modified_by="Carlos Fernández Lorán",
                last_printed=None,
                modified="2024-06-09T21:27:00+00:00",
                revision=2,
                subject="",
                title="",
                version="",
            ),
        }

    @pytest.mark.parametrize("table_format", ["markdown", "csv"])
    def test_table_between_two_paragraphs(self, test_files_path, table_format):
        docx_converter = DOCXToDocument(table_format=table_format)
@@ -38,6 +38,28 @@ class TestHTMLToDocument:
        assert "Haystack" in docs[0].content
        assert docs[0].meta["file_name"] == "what_is_haystack.html"

    def test_run_with_store_full_path(self, test_files_path):
        """
        Test that the full file path is stored by default, and only the file name when store_full_path=False.
        """
        converter = HTMLToDocument()
        sources = [test_files_path / "html" / "what_is_haystack.html"]

        results = converter.run(sources=sources)  # store_full_path is True by default
        docs = results["documents"]

        assert len(docs) == 1
        assert "Haystack" in docs[0].content
        assert docs[0].meta["file_path"] == str(sources[0])

        converter_2 = HTMLToDocument(store_full_path=False)
        results = converter_2.run(sources=sources)
        docs = results["documents"]

        assert len(docs) == 1
        assert "Haystack" in docs[0].content
        assert docs[0].meta["file_path"] == "what_is_haystack.html"

    def test_incorrect_meta(self, test_files_path):
        """
        Test if the component raises an error when incorrect metadata is supplied by the user.