diff --git a/haystack/components/converters/html.py b/haystack/components/converters/html.py
index 71475aea5..859f62fed 100644
--- a/haystack/components/converters/html.py
+++ b/haystack/components/converters/html.py
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, Literal
from boilerpy3 import extractors
from haystack import Document, component
@@ -28,6 +28,27 @@ class HTMLToDocument:
"""
+ def __init__(
+ self,
+ extractor_type: Literal[
+ "DefaultExtractor",
+ "ArticleExtractor",
+ "ArticleSentencesExtractor",
+ "LargestContentExtractor",
+ "CanolaExtractor",
+ "KeepEverythingExtractor",
+ "NumWordsRulesExtractor",
+ ] = "DefaultExtractor",
+ ):
+ """
+ Create an HTMLToDocument component.
+
+ :param extractor_type: The type of boilerpy3 extractor to use. Defaults to `DefaultExtractor`.
+ For more information on the different types of extractors,
+ see [boilerpy3 documentation](https://github.com/jmriebold/BoilerPy3?tab=readme-ov-file#extractors).
+ """
+ self.extractor_type = extractor_type
+
@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
"""
@@ -46,7 +67,8 @@ class HTMLToDocument:
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")
- extractor = extractors.ArticleExtractor(raise_on_failure=False)
+ extractor_class = getattr(extractors, self.extractor_type)
+ extractor = extractor_class(raise_on_failure=False)
for source, metadata in zip(sources, meta):
try:
diff --git a/releasenotes/notes/htmlconverter-allow-extractor-customizability-730ae129db17327a.yaml b/releasenotes/notes/htmlconverter-allow-extractor-customizability-730ae129db17327a.yaml
new file mode 100644
index 000000000..a77b94750
--- /dev/null
+++ b/releasenotes/notes/htmlconverter-allow-extractor-customizability-730ae129db17327a.yaml
@@ -0,0 +1,7 @@
+---
+enhancements:
+ - |
+ The `HTMLToDocument` converter now accepts an `extractor_type` parameter that selects
+ which boilerpy3 extractor is used to extract the content from the HTML document.
+ The default extractor has been changed to `DefaultExtractor`, which is better
+ for generic use cases than the previous default (`ArticleExtractor`).
diff --git a/test/components/converters/test_html_to_document.py b/test/components/converters/test_html_to_document.py
index 437ab9e61..1cdb47812 100644
--- a/test/components/converters/test_html_to_document.py
+++ b/test/components/converters/test_html_to_document.py
@@ -18,6 +18,20 @@ class TestHTMLToDocument:
assert len(docs) == 1
assert "Haystack" in docs[0].content
+ def test_run_different_extractors(self, test_files_path):
+ """
+ Test if the component runs correctly with different boilerpy3 extractors.
+ """
+ sources = [test_files_path / "html" / "what_is_haystack.html"]
+
+ converter_article = HTMLToDocument(extractor_type="ArticleExtractor")
+ converter_keep_everything = HTMLToDocument(extractor_type="KeepEverythingExtractor")
+
+ doc_article = converter_article.run(sources=sources)["documents"][0]
+ doc_keep_everything = converter_keep_everything.run(sources=sources)["documents"][0]
+
+ assert len(doc_keep_everything.content) > len(doc_article.content)
+
def test_run_doc_metadata(self, test_files_path):
"""
Test if the component runs correctly when metadata is supplied by the user.