feat: Update HTMLToDocument to handle ByteStream inputs (#6020)

* Update HTML converter
* Add mixed source unit test
* Update haystack/preview/components/file_converters/html.py

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>

---------

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
2026-01-06 03:57:19 +00:00 · 2023-10-11 10:15:58 +02:00 · 2023-10-11 10:15:58 +02:00 · 1a6a8863e8
commit 1a6a8863e8
parent 12fe0364dc
2 changed files with 54 additions and 28 deletions
--- a/haystack/preview/components/file_converters/html.py
+++ b/haystack/preview/components/file_converters/html.py
@@ -2,69 +2,77 @@ import logging
 from typing import List, Optional, Dict, Any, Union
 from pathlib import Path

-from haystack.preview.lazy_imports import LazyImport
 from haystack.preview import Document, component, default_to_dict, default_from_dict
+from haystack.preview.dataclasses import ByteStream
+from haystack.preview.lazy_imports import LazyImport
+
+logger = logging.getLogger(__name__)

 with LazyImport("Run 'pip install boilerpy3'") as boilerpy3_import:
    from boilerpy3 import extractors


-logger = logging.getLogger(__name__)
-
-
 @component
 class HTMLToDocument:
    """
-    A component for converting an HTML file to a Document.
+    Converts an HTML file to a Document.
    """

    def __init__(self, id_hash_keys: Optional[List[str]] = None):
        """
-        Create a HTMLToDocument component.
+        Initializes the HTMLToDocument component.

-        :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
-            attributes. Default: `None`
+        :param id_hash_keys: List of strings referencing the Document's attributes to generate its ID. Default: `None`
        """
        boilerpy3_import.check()
        self.id_hash_keys = id_hash_keys or []

    def to_dict(self) -> Dict[str, Any]:
-        """
-        Serialize this component to a dictionary.
-        """
+        """Serialize the component to a dictionary."""
        return default_to_dict(self, id_hash_keys=self.id_hash_keys)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument":
-        """
-        Deserialize this component from a dictionary.
-        """
+        """Deserialize the component from a dictionary."""
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
-    def run(self, paths: List[Union[str, Path]]):
+    def run(self, sources: List[Union[str, Path, ByteStream]]):
        """
-        Convert HTML files to Documents.
+        Converts a list of HTML files to Documents.

-        :param paths: A list of paths to HTML files.
-        :return: A list of Documents.
+        :param sources: Paths to HTML files, or ByteStream objects containing HTML.
+        :return: List of converted Documents.
        """
        documents = []
        extractor = extractors.ArticleExtractor(raise_on_failure=False)
-        for path in paths:
+        for source in sources:
            try:
-                file_content = extractor.read_from_file(path)
+                file_content = self._extract_content(source)
            except Exception as e:
-                logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e)
+                logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
                continue
-            # although raise_on_failure is set to False, the extractor can still raise an exception
            try:
                text = extractor.get_content(file_content)
-            except Exception as conversion_e:
-                logger.warning("Could not extract raw txt from %s. Skipping it. Error message: %s", path, conversion_e)
+            except Exception as conversion_e:  # Consider specifying the expected exception type(s) here
+                logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
                continue

            document = Document(text=text, id_hash_keys=self.id_hash_keys)
            documents.append(document)

        return {"documents": documents}
+
+    def _extract_content(self, source: Union[str, Path, ByteStream]) -> str:
+        """
+        Reads the raw HTML text from a file path or a ByteStream, always decoding it as UTF-8.
+        :param source: Path to an HTML file (str or Path), or a ByteStream whose data holds the HTML.
+        :return: The decoded text. Raises ValueError for any other source type.
+        """
+        if isinstance(source, (str, Path)):
+            with open(source, encoding="utf-8") as text_file:
+                return text_file.read()
+        if isinstance(source, ByteStream):
+            return source.data.decode("utf-8")
+
+        raise ValueError(f"Unsupported source type: {type(source)}")
--- a/test/preview/components/file_converters/test_html_to_document.py
+++ b/test/preview/components/file_converters/test_html_to_document.py
@@ -3,6 +3,7 @@ import logging
 import pytest

 from haystack.preview.components.file_converters import HTMLToDocument
+from haystack.preview.dataclasses import ByteStream


 class TestHTMLToDocument:
@@ -31,7 +32,7 @@ class TestHTMLToDocument:
        """
        paths = [preview_samples_path / "html" / "what_is_haystack.html"]
        converter = HTMLToDocument()
-        output = converter.run(paths=paths)
+        output = converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "Haystack" in docs[0].text
@@ -44,7 +45,7 @@ class TestHTMLToDocument:
        paths = [preview_samples_path / "audio" / "answer.wav"]
        converter = HTMLToDocument()
        with caplog.at_level(logging.WARNING):
-            output = converter.run(paths=paths)
+            output = converter.run(sources=paths)
            assert "codec can't decode byte" in caplog.text

        docs = output["documents"]
@@ -58,6 +59,23 @@ class TestHTMLToDocument:
        paths = ["non_existing_file.html"]
        converter = HTMLToDocument()
        with caplog.at_level(logging.WARNING):
-            result = converter.run(paths=paths)
-            assert "Could not read file non_existing_file.html" in caplog.text
+            result = converter.run(sources=paths)
+            assert "Could not read non_existing_file.html" in caplog.text
            assert result["documents"] == []
+
+    @pytest.mark.unit
+    def test_mixed_sources_run(self, preview_samples_path):
+        """
+        Test if the component runs correctly if the input is a mix of paths and ByteStreams
+        """
+        paths = [preview_samples_path / "html" / "what_is_haystack.html"]
+        with open(preview_samples_path / "html" / "what_is_haystack.html", "rb") as f:
+            byte_stream = f.read()
+            paths.append(ByteStream(byte_stream))
+
+        converter = HTMLToDocument()
+        output = converter.run(sources=paths)
+        docs = output["documents"]
+        assert len(docs) == 2
+        for doc in docs:
+            assert "Haystack" in doc.text