Mirror of https://github.com/deepset-ai/haystack.git, synced 2026-01-06 03:57:19 +00:00
feat: add support for single meta dict in TextFileToDocument (#6606)
* add support for single meta dict
* reno
* reno
* mypy
* extract to function
* docstring
* mypy

This commit is contained in:
parent 7cc6080dfa
commit cf79aa1485
TextFileToDocument converter:

@@ -4,7 +4,7 @@ from typing import List, Union, Dict, Any, Optional
 
 from haystack import Document, component
 from haystack.dataclasses import ByteStream
-from haystack.components.converters.utils import get_bytestream_from_source
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
 
 
 logger = logging.getLogger(__name__)
@@ -38,25 +38,29 @@ class TextFileToDocument:
         self.encoding = encoding
 
     @component.output_types(documents=List[Document])
-    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ):
         """
         Convert text files to Documents.
 
         :param sources: A list of paths to text files or ByteStream objects.
             Note that if an encoding is specified in the metadata of a ByteStream,
             it will override the component's default.
-        :param meta: Optional list of metadata to attach to the Documents.
-            The length of the list must match the number of sources. Defaults to `None`.
+        :param meta: Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            Defaults to `None`.
         :return: A dictionary containing a list of Document objects under the 'documents' key.
         """
         documents = []
 
-        if meta is None:
-            meta = [{}] * len(sources)
-        elif len(sources) != len(meta):
-            raise ValueError("The length of the metadata list must match the number of sources.")
+        meta_list = normalize_metadata(meta, sources_count=len(sources))
 
-        for source, metadata in zip(sources, meta):
+        for source, metadata in zip(sources, meta_list):
             try:
                 bytestream = get_bytestream_from_source(source)
             except Exception as e:
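For context, a minimal usage sketch of the new behavior (the import path, file names, and metadata values below are illustrative assumptions, not part of this diff): a single dict is broadcast to every produced Document, while a list is still zipped one-to-one with the sources.

# Sketch only: the import path and file names are assumptions for illustration.
from haystack.components.converters import TextFileToDocument

converter = TextFileToDocument()

# Single dict: the same metadata is attached to every Document.
result = converter.run(
    sources=["notes/a.txt", "notes/b.txt"],  # hypothetical paths
    meta={"language": "en"},
)

# List of dicts: must have the same length as sources; the two lists are zipped.
result = converter.run(
    sources=["notes/a.txt", "notes/b.txt"],
    meta=[{"topic": "billing"}, {"topic": "shipping"}],
)

for doc in result["documents"]:
    print(doc.meta)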
haystack/components/converters/utils.py:

@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Union
+from typing import List, Union, Dict, Any, Optional
 
 from haystack.dataclasses import ByteStream
@@ -18,3 +18,25 @@ def get_bytestream_from_source(source: Union[str, Path, ByteStream]) -> ByteStream:
         bs.meta["file_path"] = str(source)
         return bs
     raise ValueError(f"Unsupported source type {type(source)}")
+
+
+def normalize_metadata(
+    meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], sources_count: int
+) -> List[Dict[str, Any]]:
+    """
+    Given all the possible values of the meta input for a converter (None, dictionary or list of dicts),
+    makes sure to return a list of dictionaries of the correct length for the converter to use.
+
+    :param meta: the meta input of the converter, as-is
+    :param sources_count: the number of sources the converter received
+    :returns: a list of dictionaries of the same length as the sources list
+    """
+    if meta is None:
+        return [{}] * sources_count
+    if isinstance(meta, dict):
+        return [meta] * sources_count
+    if isinstance(meta, list):
+        if sources_count != len(meta):
+            raise ValueError("The length of the metadata list must match the number of sources.")
+        return meta
+    raise ValueError("meta must be either None, a dictionary or a list of dictionaries.")
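The commit message's "extract to function" refers to moving this meta handling into utils so other converters can share it. A minimal sketch of that reuse, assuming a hypothetical custom component (the class name and its uppercasing behavior are invented for illustration; only the two utils calls come from this diff):

from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from haystack import Document, component
from haystack.dataclasses import ByteStream
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata


@component
class UppercaseTextConverter:
    """Hypothetical converter that uppercases text files; for illustration only."""

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        # Normalize None / single dict / list of dicts into one dict per source.
        meta_list = normalize_metadata(meta, sources_count=len(sources))
        documents = []
        for source, metadata in zip(sources, meta_list):
            bytestream = get_bytestream_from_source(source)
            text = bytestream.data.decode("utf-8").upper()
            documents.append(Document(content=text, meta={**bytestream.meta, **metadata}))
        return {"documents": documents}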
Release note (reno), new file:

@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Adds support for single metadata dictionary input in `TextFileToDocument`.
test/components/converters/test_utils.py (new file, 29 lines):

@@ -0,0 +1,29 @@
+import pytest
+from haystack.components.converters.utils import normalize_metadata
+
+
+def test_normalize_metadata_None():
+    assert normalize_metadata(None, sources_count=1) == [{}]
+    assert normalize_metadata(None, sources_count=3) == [{}, {}, {}]
+
+
+def test_normalize_metadata_single_dict():
+    assert normalize_metadata({"a": 1}, sources_count=1) == [{"a": 1}]
+    assert normalize_metadata({"a": 1}, sources_count=3) == [{"a": 1}, {"a": 1}, {"a": 1}]
+
+
+def test_normalize_metadata_list_of_right_size():
+    assert normalize_metadata([{"a": 1}], sources_count=1) == [{"a": 1}]
+    assert normalize_metadata([{"a": 1}, {"b": 2}, {"c": 3}], sources_count=3) == [{"a": 1}, {"b": 2}, {"c": 3}]
+
+
+def test_normalize_metadata_list_of_wrong_size():
+    with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
+        normalize_metadata([{"a": 1}], sources_count=3)
+    with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
+        assert normalize_metadata([{"a": 1}, {"b": 2}, {"c": 3}], sources_count=1)
+
+
+def test_normalize_metadata_other_type():
+    with pytest.raises(ValueError, match="meta must be either None, a dictionary or a list of dictionaries."):
+        normalize_metadata(({"a": 1},), sources_count=1)
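The new tests can be run on their own from the repository root with, for example, `pytest test/components/converters/test_utils.py`.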