feat: Add HTMLToDocument component (v2) (#5907)

2026-01-05 19:47:45 +00:00 · 2023-09-28 17:22:28 +02:00 · 2023-09-28 17:22:28 +02:00 · e882a7d5c8
commit e882a7d5c8
parent dfa48eece9
5 changed files with 1780 additions and 1 deletions
--- a/haystack/preview/components/file_converters/__init__.py
+++ b/haystack/preview/components/file_converters/__init__.py
@@ -1,5 +1,13 @@
 from haystack.preview.components.file_converters.txt import TextFileToDocument
 from haystack.preview.components.file_converters.tika import TikaDocumentConverter
 from haystack.preview.components.file_converters.azure import AzureOCRDocumentConverter
+from haystack.preview.components.file_converters.pypdf import PyPDFToDocument
+from haystack.preview.components.file_converters.html import HTMLToDocument

-__all__ = ["TextFileToDocument", "TikaDocumentConverter", "AzureOCRDocumentConverter"]
+__all__ = [  # converter components re-exported by this package
+    "TextFileToDocument",
+    "TikaDocumentConverter",
+    "AzureOCRDocumentConverter",
+    "PyPDFToDocument",
+    "HTMLToDocument",
+]
--- a/haystack/preview/components/file_converters/html.py
+++ b/haystack/preview/components/file_converters/html.py
@@ -0,0 +1,70 @@
+import logging
+from typing import List, Optional, Dict, Any, Union
+from pathlib import Path
+
+from haystack.preview.lazy_imports import LazyImport
+from haystack.preview import Document, component, default_to_dict, default_from_dict
+
+with LazyImport("Run 'pip install boilerpy3'") as boilerpy3_import:
+    from boilerpy3 import extractors
+
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class HTMLToDocument:
+    """
+    Converts HTML files to Documents by extracting their text with boilerpy3's `ArticleExtractor`.
+    """
+
+    def __init__(self, id_hash_keys: Optional[List[str]] = None):
+        """
+        Create a HTMLToDocument component. Requires boilerpy3 (verified via `boilerpy3_import.check()`).
+
+        :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
+            attributes. Default: `None`, which is stored as an empty list.
+        """
+        boilerpy3_import.check()
+        self.id_hash_keys = id_hash_keys or []
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary, including its `id_hash_keys`.
+        """
+        return default_to_dict(self, id_hash_keys=self.id_hash_keys)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument":
+        """
+        Deserialize this component from a dictionary using `default_from_dict`.
+        """
+        return default_from_dict(cls, data)
+
+    @component.output_types(documents=List[Document])
+    def run(self, paths: List[Union[str, Path]]):
+        """
+        Convert HTML files to Documents. Files that cannot be read or converted are skipped with a warning.
+
+        :param paths: A list of paths to HTML files.
+        :return: A dictionary with key `documents` holding one Document per successfully converted file.
+        """
+        documents = []
+        extractor = extractors.ArticleExtractor(raise_on_failure=False)
+        for path in paths:
+            try:
+                file_content = extractor.read_from_file(path)
+            except Exception as e:
+                logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e)
+                continue
+            # although raise_on_failure is set to False, the extractor can still raise an exception
+            try:
+                text = extractor.get_content(file_content)
+            except Exception as conversion_e:
+                logger.warning("Could not extract raw txt from %s. Skipping it. Error message: %s", path, conversion_e)
+                continue
+
+            document = Document(text=text, id_hash_keys=self.id_hash_keys)
+            documents.append(document)
+
+        return {"documents": documents}
--- a/releasenotes/notes/add-html-to-document-21fe38b244388f4d.yaml
+++ b/releasenotes/notes/add-html-to-document-21fe38b244388f4d.yaml
@@ -0,0 +1,4 @@
+---
+preview:
+  - |
+    Adds HTMLToDocument component to convert HTML to a Document.
--- a/test/preview/components/file_converters/test_html_to_document.py
+++ b/test/preview/components/file_converters/test_html_to_document.py
@@ -0,0 +1,63 @@
+import logging
+
+import pytest
+
+from haystack.preview.components.file_converters import HTMLToDocument
+
+
+class TestHTMLToDocument:
+    @pytest.mark.unit
+    def test_to_dict(self):  # the default id_hash_keys=None must serialize as an empty list
+        component = HTMLToDocument()
+        data = component.to_dict()
+        assert data == {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": []}}
+
+    @pytest.mark.unit
+    def test_to_dict_with_custom_init_parameters(self):  # custom id_hash_keys must be serialized unchanged
+        component = HTMLToDocument(id_hash_keys=["name"])
+        data = component.to_dict()
+        assert data == {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": ["name"]}}
+
+    @pytest.mark.unit
+    def test_from_dict(self):  # init parameters in the dictionary must be restored on the instance
+        data = {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": ["name"]}}
+        component = HTMLToDocument.from_dict(data)
+        assert component.id_hash_keys == ["name"]
+
+    @pytest.mark.unit
+    def test_run(self, preview_samples_path):
+        """
+        Test that a sample HTML file is converted to exactly one Document whose text mentions 'Haystack'.
+        """
+        paths = [preview_samples_path / "html" / "what_is_haystack.html"]
+        converter = HTMLToDocument()
+        output = converter.run(paths=paths)
+        docs = output["documents"]
+        assert len(docs) == 1
+        assert "Haystack" in docs[0].text
+
+    @pytest.mark.unit
+    def test_run_wrong_file_type(self, preview_samples_path, caplog):
+        """
+        Test that a non-HTML input (a WAV file) is skipped with a decoding warning and produces no Documents.
+        """
+        paths = [preview_samples_path / "audio" / "answer.wav"]
+        converter = HTMLToDocument()
+        with caplog.at_level(logging.WARNING):
+            output = converter.run(paths=paths)
+            assert "codec can't decode byte" in caplog.text
+
+        docs = output["documents"]
+        assert docs == []
+
+    @pytest.mark.unit
+    def test_run_error_handling(self, preview_samples_path, caplog):
+        """
+        Test that a non-existing path is skipped with a warning and produces no Documents.
+        """
+        paths = ["non_existing_file.html"]
+        converter = HTMLToDocument()
+        with caplog.at_level(logging.WARNING):
+            result = converter.run(paths=paths)
+            assert "Could not read file non_existing_file.html" in caplog.text
+            assert result["documents"] == []
--- a/test/preview/test_files/html/what_is_haystack.html
+++ b/test/preview/test_files/html/what_is_haystack.html