diff --git a/haystack/components/converters/html.py b/haystack/components/converters/html.py index 83f0acd43..88230ce64 100644 --- a/haystack/components/converters/html.py +++ b/haystack/components/converters/html.py @@ -5,7 +5,7 @@ from boilerpy3 import extractors from haystack import Document, component from haystack.dataclasses import ByteStream -from haystack.components.converters.utils import get_bytestream_from_source +from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata logger = logging.getLogger(__name__) @@ -55,22 +55,21 @@ class HTMLToDocument: Converts a list of HTML files to Documents. :param sources: List of HTML file paths or ByteStream objects. - :param meta: Optional list of metadata to attach to the Documents. - The length of the list must match the number of sources. Defaults to `None`. + :param meta: Optional metadata to attach to the Documents. + This value can be either a list of dictionaries or a single dictionary. + If it's a single dictionary, its content is added to the metadata of all produced Documents. + If it's a list, the length of the list must match the number of sources, because the two lists will be zipped. + Defaults to `None`. :return: A dictionary containing a list of Document objects under the 'documents' key. """ documents = [] - - if meta is None: - meta = [{}] * len(sources) - elif len(sources) != len(meta): - raise ValueError("The length of the metadata list must match the number of sources.") + meta_list = normalize_metadata(meta=meta, sources_count=len(sources)) extractor_class = getattr(extractors, self.extractor_type) extractor = extractor_class(raise_on_failure=False) - for source, metadata in zip(sources, meta): + for source, metadata in zip(sources, meta_list): try: bytestream = get_bytestream_from_source(source=source) except Exception as e: diff --git a/releasenotes/notes/single-meta-in-htm2document-199ea44a4ae5c02b.yaml b/releasenotes/notes/single-meta-in-htm2document-199ea44a4ae5c02b.yaml new file mode 100644 index 000000000..c4067ec60 --- /dev/null +++ b/releasenotes/notes/single-meta-in-htm2document-199ea44a4ae5c02b.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Adds support for single metadata dictionary input in `HTMLToDocument`. diff --git a/test/components/converters/test_html_to_document.py b/test/components/converters/test_html_to_document.py index dfd3cfd96..aa8df5119 100644 --- a/test/components/converters/test_html_to_document.py +++ b/test/components/converters/test_html_to_document.py @@ -13,10 +13,11 @@ class TestHTMLToDocument: """ sources = [test_files_path / "html" / "what_is_haystack.html"] converter = HTMLToDocument() - results = converter.run(sources=sources) + results = converter.run(sources=sources, meta={"test": "TEST"}) docs = results["documents"] assert len(docs) == 1 assert "Haystack" in docs[0].content + assert docs[0].meta["test"] == "TEST" def test_run_different_extractors(self, test_files_path): """