mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-09-01 20:33:48 +00:00
feat: support single metadata dictionary in HTMLToDocument (#6613)
* support single metadata in HTMLToDocument
* reno
* docstring
This commit is contained in:
parent
4d08be0c2a
commit
ff55985e2d
@ -5,7 +5,7 @@ from boilerpy3 import extractors
|
|||||||
|
|
||||||
from haystack import Document, component
|
from haystack import Document, component
|
||||||
from haystack.dataclasses import ByteStream
|
from haystack.dataclasses import ByteStream
|
||||||
from haystack.components.converters.utils import get_bytestream_from_source
|
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -55,22 +55,21 @@ class HTMLToDocument:
|
|||||||
Converts a list of HTML files to Documents.
|
Converts a list of HTML files to Documents.
|
||||||
|
|
||||||
:param sources: List of HTML file paths or ByteStream objects.
|
:param sources: List of HTML file paths or ByteStream objects.
|
||||||
:param meta: Optional list of metadata to attach to the Documents.
|
:param meta: Optional metadata to attach to the Documents.
|
||||||
The length of the list must match the number of sources. Defaults to `None`.
|
This value can be either a list of dictionaries or a single dictionary.
|
||||||
|
If it's a single dictionary, its content is added to the metadata of all produced Documents.
|
||||||
|
If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
|
||||||
|
Defaults to `None`.
|
||||||
:return: A dictionary containing a list of Document objects under the 'documents' key.
|
:return: A dictionary containing a list of Document objects under the 'documents' key.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
documents = []
|
documents = []
|
||||||
|
meta_list = normalize_metadata(meta=meta, sources_count=len(sources))
|
||||||
if meta is None:
|
|
||||||
meta = [{}] * len(sources)
|
|
||||||
elif len(sources) != len(meta):
|
|
||||||
raise ValueError("The length of the metadata list must match the number of sources.")
|
|
||||||
|
|
||||||
extractor_class = getattr(extractors, self.extractor_type)
|
extractor_class = getattr(extractors, self.extractor_type)
|
||||||
extractor = extractor_class(raise_on_failure=False)
|
extractor = extractor_class(raise_on_failure=False)
|
||||||
|
|
||||||
for source, metadata in zip(sources, meta):
|
for source, metadata in zip(sources, meta_list):
|
||||||
try:
|
try:
|
||||||
bytestream = get_bytestream_from_source(source=source)
|
bytestream = get_bytestream_from_source(source=source)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
enhancements:
|
||||||
|
- |
|
||||||
|
Adds support for single metadata dictionary input in `HTMLToDocument`.
|
@ -13,10 +13,11 @@ class TestHTMLToDocument:
|
|||||||
"""
|
"""
|
||||||
sources = [test_files_path / "html" / "what_is_haystack.html"]
|
sources = [test_files_path / "html" / "what_is_haystack.html"]
|
||||||
converter = HTMLToDocument()
|
converter = HTMLToDocument()
|
||||||
results = converter.run(sources=sources)
|
results = converter.run(sources=sources, meta={"test": "TEST"})
|
||||||
docs = results["documents"]
|
docs = results["documents"]
|
||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
assert "Haystack" in docs[0].content
|
assert "Haystack" in docs[0].content
|
||||||
|
assert docs[0].meta["test"] == "TEST"
|
||||||
|
|
||||||
def test_run_different_extractors(self, test_files_path):
|
def test_run_different_extractors(self, test_files_path):
|
||||||
"""
|
"""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user