feat!: HTMLToDocument - allow choosing the boilerpy3 extractor (#6582)

* allow extractor customizability * release note * typo
2025-12-18 10:38:36 +00:00 · 2023-12-19 10:52:12 +01:00 · 2023-12-19 10:52:12 +01:00 · 94cfe5d9ae
commit 94cfe5d9ae
parent dcf37c5173
3 changed files with 45 additions and 2 deletions
--- a/haystack/components/converters/html.py
+++ b/haystack/components/converters/html.py
@@ -1,6 +1,6 @@
 import logging
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, Literal
 from boilerpy3 import extractors

 from haystack import Document, component
@@ -28,6 +28,27 @@ class HTMLToDocument:

    """

+    def __init__(
+        self,
+        extractor_type: Literal[
+            "DefaultExtractor",
+            "ArticleExtractor",
+            "ArticleSentencesExtractor",
+            "LargestContentExtractor",
+            "CanolaExtractor",
+            "KeepEverythingExtractor",
+            "NumWordsRulesExtractor",
+        ] = "DefaultExtractor",
+    ):
+        """
+        Create an HTMLToDocument component.
+
+        :param extractor_type: The type of boilerpy3 extractor to use. Defaults to `DefaultExtractor`.
+          For more information on the different types of extractors,
+          see [boilerpy3 documentation](https://github.com/jmriebold/BoilerPy3?tab=readme-ov-file#extractors).
+        """
+        self.extractor_type = extractor_type
+
    @component.output_types(documents=List[Document])
    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
        """
@@ -46,7 +67,8 @@ class HTMLToDocument:
        elif len(sources) != len(meta):
            raise ValueError("The length of the metadata list must match the number of sources.")

-        extractor = extractors.ArticleExtractor(raise_on_failure=False)
+        extractor_class = getattr(extractors, self.extractor_type)
+        extractor = extractor_class(raise_on_failure=False)

        for source, metadata in zip(sources, meta):
            try:
--- a/releasenotes/notes/htmlconverter-allow-extractor-customizability-730ae129db17327a.yaml
+++ b/releasenotes/notes/htmlconverter-allow-extractor-customizability-730ae129db17327a.yaml
@@ -0,0 +1,7 @@
+---
+enhancements:
+  - |
+    The `HTMLToDocument` converter now allows choosing the boilerpy3 extractor
+    to extract the content from the HTML document.
+    The default extractor has been changed from `ArticleExtractor` to `DefaultExtractor`, which is better
+    for generic use cases; pass `extractor_type="ArticleExtractor"` to keep the previous behavior.
--- a/test/components/converters/test_html_to_document.py
+++ b/test/components/converters/test_html_to_document.py
@@ -18,6 +18,20 @@ class TestHTMLToDocument:
        assert len(docs) == 1
        assert "Haystack" in docs[0].content

+    def test_run_different_extractors(self, test_files_path):
+        """
+        Test if the component runs correctly with different boilerpy3 extractors.
+        """
+        sources = [test_files_path / "html" / "what_is_haystack.html"]
+
+        converter_article = HTMLToDocument(extractor_type="ArticleExtractor")
+        converter_keep_everything = HTMLToDocument(extractor_type="KeepEverythingExtractor")
+
+        doc_article = converter_article.run(sources=sources)["documents"][0]
+        doc_keep_everything = converter_keep_everything.run(sources=sources)["documents"][0]
+
+        assert len(doc_keep_everything.content) > len(doc_article.content)
+
    def test_run_doc_metadata(self, test_files_path):
        """
        Test if the component runs correctly when metadata is supplied by the user.