feat: support single metadata dictionary in TikaDocumentConverter (#6698)

* reno
* converter
* test
* comment
2025-08-27 01:46:33 +00:00 · 2024-01-09 09:49:47 +01:00 · 2024-01-09 09:49:47 +01:00 · 974d65f30a
commit 974d65f30a
parent 93b2aaee09
3 changed files with 29 additions and 17 deletions
--- a/haystack/components/converters/tika.py
+++ b/haystack/components/converters/tika.py
@ -6,7 +6,7 @@ import io
 from haystack.lazy_imports import LazyImport
 from haystack import component, Document
 from haystack.dataclasses import ByteStream
-from haystack.components.converters.utils import get_bytestream_from_source
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata


 with LazyImport("Run 'pip install tika'") as tika_import:
@ -31,7 +31,10 @@ class TikaDocumentConverter:
    from haystack.components.converters.tika import TikaDocumentConverter

    converter = TikaDocumentConverter()
-    results = converter.run(sources=["sample.docx", "my_document.rtf", "archive.zip"])
+    results = converter.run(
+        sources=["sample.docx", "my_document.rtf", "archive.zip"],
+        meta={"date_added": datetime.now().isoformat()}
+    )
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the docx file.'
@ -48,24 +51,26 @@ class TikaDocumentConverter:
        self.tika_url = tika_url

    @component.output_types(documents=List[Document])
-    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ):
        """
        Convert files to Documents.

        :param sources: List of file paths or ByteStream objects.
-        :param meta: Optional list of metadata to attach to the Documents.
-          The length of the list must match the number of sources. Defaults to `None`.
+        :param meta: Optional metadata to attach to the Documents.
+          This value can be either a list of dictionaries or a single dictionary.
+          If it's a single dictionary, its content is added to the metadata of all produced Documents.
+          If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+          Defaults to `None`.
        :return: A dictionary containing a list of Document objects under the 'documents' key.
        """
-
        documents = []
+        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

-        if meta is None:
-            meta = [{}] * len(sources)
-        elif len(sources) != len(meta):
-            raise ValueError("The length of the metadata list must match the number of sources.")
-
-        for source, metadata in zip(sources, meta):
+        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
--- a/releasenotes/notes/single-meta-in-tikaconverter-89b454c451a2ed93.yaml
+++ b/releasenotes/notes/single-meta-in-tikaconverter-89b454c451a2ed93.yaml
@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Adds support for passing a single metadata dictionary to `TikaDocumentConverter.run()`; its content is added to the metadata of all produced Documents.
--- a/test/components/converters/test_tika_doc_converter.py
+++ b/test/components/converters/test_tika_doc_converter.py
@ -18,16 +18,19 @@ class TestTikaDocumentConverter:
        assert len(documents) == 1
        assert documents[0].content == "Content of mock source"

-    def test_run_with_meta(self):
+    def test_run_with_meta(self, test_files_path):
        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})

        converter = TikaDocumentConverter()
        with patch("haystack.components.converters.tika.tika_parser.from_buffer"):
-            output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
-        document = output["documents"][0]
+            output = converter.run(
+                sources=[bytestream, test_files_path / "markdown" / "sample.md"], meta={"language": "it"}
+            )

-        # check that the metadata from the bytestream is merged with that from the meta parameter
-        assert document.meta == {"author": "test_author", "language": "it"}
+        # check that the metadata from the sources is merged with that from the meta parameter
+        assert output["documents"][0].meta["author"] == "test_author"
+        assert output["documents"][0].meta["language"] == "it"
+        assert output["documents"][1].meta["language"] == "it"

    def test_run_nonexistent_file(self, caplog):
        component = TikaDocumentConverter()