mirror of https://github.com/deepset-ai/haystack.git (synced 2025-08-27 01:46:33 +00:00)
feat: support single metadata dictionary in TikaDocumentConverter (#6698)

* reno
* converter
* test
* comment

This commit is contained in:
parent 93b2aaee09
commit 974d65f30a
@@ -6,7 +6,7 @@ import io
 from haystack.lazy_imports import LazyImport
 from haystack import component, Document
 from haystack.dataclasses import ByteStream
-from haystack.components.converters.utils import get_bytestream_from_source
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
 
 
 with LazyImport("Run 'pip install tika'") as tika_import:
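The substance of this change is the new `normalize_metadata` import: it converts whatever shape of `meta` the caller passes into exactly one dict per source, which is what lets `run()` below simply zip sources with metadata. The helper itself is not shown in this diff; the following is only a sketch of the contract implied here, assuming the actual implementation in `haystack.components.converters.utils` behaves this way:

```python
from typing import Any, Dict, List, Optional, Union


def normalize_metadata(
    meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], sources_count: int
) -> List[Dict[str, Any]]:
    """Return one metadata dict per source, whatever shape `meta` arrives in."""
    if meta is None:
        # No metadata was given: attach an empty dict to every source.
        return [{}] * sources_count
    if isinstance(meta, dict):
        # A single dict is shared by all produced Documents (the new behavior).
        return [meta] * sources_count
    if len(meta) != sources_count:
        raise ValueError("The length of the metadata list must match the number of sources.")
    # Already a list of the right length: it will be zipped 1:1 with sources.
    return meta
```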
@@ -31,7 +31,10 @@ class TikaDocumentConverter:
     from haystack.components.converters.tika import TikaDocumentConverter
 
     converter = TikaDocumentConverter()
-    results = converter.run(sources=["sample.docx", "my_document.rtf", "archive.zip"])
+    results = converter.run(
+        sources=["sample.docx", "my_document.rtf", "archive.zip"],
+        meta={"date_added": datetime.now().isoformat()}
+    )
     documents = results["documents"]
     print(documents[0].content)
     # 'This is a text from the docx file.'
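One small caveat in the updated docstring example: it calls `datetime.now()` without showing the import, so running the snippet as written presumably also needs the standard-library line:

```python
from datetime import datetime

meta = {"date_added": datetime.now().isoformat()}  # e.g. '2024-01-09T12:34:56.789012'
```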
@@ -48,24 +51,26 @@ class TikaDocumentConverter:
         self.tika_url = tika_url
 
     @component.output_types(documents=List[Document])
-    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ):
         """
         Convert files to Documents.
 
         :param sources: List of file paths or ByteStream objects.
-        :param meta: Optional list of metadata to attach to the Documents.
-            The length of the list must match the number of sources. Defaults to `None`.
+        :param meta: Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            Defaults to `None`.
         :return: A dictionary containing a list of Document objects under the 'documents' key.
         """
 
         documents = []
+        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))
 
-        if meta is None:
-            meta = [{}] * len(sources)
-        elif len(sources) != len(meta):
-            raise ValueError("The length of the metadata list must match the number of sources.")
-
-        for source, metadata in zip(sources, meta):
+        for source, metadata in zip(sources, meta_list):
             try:
                 bytestream = get_bytestream_from_source(source)
             except Exception as e:
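Taken together, the widened signature accepts either shape of `meta`. A minimal usage sketch (the file names and metadata keys are arbitrary placeholders, and a reachable Tika server is assumed, as the component requires):

```python
from haystack.components.converters.tika import TikaDocumentConverter

converter = TikaDocumentConverter()

# Single dict: merged into the metadata of every produced Document.
shared = converter.run(
    sources=["sample.docx", "my_document.rtf"],
    meta={"date_added": "2024-01-09"},
)

# List of dicts: its length must match the number of sources, one dict per Document.
per_source = converter.run(
    sources=["sample.docx", "my_document.rtf"],
    meta=[{"source_id": "a"}, {"source_id": "b"}],
)
```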
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Adds support for single metadata dictionary input in `TikaDocumentConverter`.
@@ -18,16 +18,19 @@ class TestTikaDocumentConverter:
         assert len(documents) == 1
         assert documents[0].content == "Content of mock source"
 
-    def test_run_with_meta(self):
+    def test_run_with_meta(self, test_files_path):
         bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
+
         converter = TikaDocumentConverter()
         with patch("haystack.components.converters.tika.tika_parser.from_buffer"):
-            output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
-        document = output["documents"][0]
-        # check that the metadata from the bytestream is merged with that from the meta parameter
-        assert document.meta == {"author": "test_author", "language": "it"}
+            output = converter.run(
+                sources=[bytestream, test_files_path / "markdown" / "sample.md"], meta={"language": "it"}
+            )
+
+        # check that the metadata from the sources is merged with that from the meta parameter
+        assert output["documents"][0].meta["author"] == "test_author"
+        assert output["documents"][0].meta["language"] == "it"
+        assert output["documents"][1].meta["language"] == "it"
 
     def test_run_nonexistent_file(self, caplog):
         component = TikaDocumentConverter()
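The rewritten test also pins down the merge order: metadata carried by the source (here the `ByteStream`'s own `meta`) is combined with the `meta` argument, and the argument wins on conflicting keys. The same dict merge in isolation, using the test's values:

```python
source_meta = {"author": "test_author", "language": "en"}  # carried by the ByteStream
run_meta = {"language": "it"}                              # passed as run(meta=...)

# Later entries win, so "language" is overridden while "author" survives.
merged = {**source_meta, **run_meta}
assert merged == {"author": "test_author", "language": "it"}
```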