feat: support single metadata dictionary in HTMLToDocument (#6613)

* support single metadata in HTMLToDocument

* reno

* docstring
This commit is contained in:
ZanSara 2023-12-21 15:45:31 +00:00 committed by GitHub
parent 4d08be0c2a
commit ff55985e2d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 14 additions and 10 deletions

View File

@ -5,7 +5,7 @@ from boilerpy3 import extractors
from haystack import Document, component
from haystack.dataclasses import ByteStream
from haystack.components.converters.utils import get_bytestream_from_source
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
logger = logging.getLogger(__name__)
@ -55,22 +55,21 @@ class HTMLToDocument:
Converts a list of HTML files to Documents.
:param sources: List of HTML file paths or ByteStream objects.
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
:param meta: Optional metadata to attach to the Documents.
This value can be either a list of dictionaries or a single dictionary.
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, its length must match the number of sources, because the two lists will be zipped.
Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""
documents = []
if meta is None:
meta = [{}] * len(sources)
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")
meta_list = normalize_metadata(meta=meta, sources_count=len(sources))
extractor_class = getattr(extractors, self.extractor_type)
extractor = extractor_class(raise_on_failure=False)
for source, metadata in zip(sources, meta):
for source, metadata in zip(sources, meta_list):
try:
bytestream = get_bytestream_from_source(source=source)
except Exception as e:

View File

@ -0,0 +1,4 @@
---
enhancements:
- |
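Adds support for passing a single metadata dictionary to `HTMLToDocument`. When a single dictionary is given, its content is added to the metadata of all produced Documents.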

View File

@ -13,10 +13,11 @@ class TestHTMLToDocument:
"""
sources = [test_files_path / "html" / "what_is_haystack.html"]
converter = HTMLToDocument()
results = converter.run(sources=sources)
results = converter.run(sources=sources, meta={"test": "TEST"})
docs = results["documents"]
assert len(docs) == 1
assert "Haystack" in docs[0].content
assert docs[0].meta["test"] == "TEST"
def test_run_different_extractors(self, test_files_path):
"""