diff --git a/haystack/components/converters/html.py b/haystack/components/converters/html.py index 71475aea5..859f62fed 100644 --- a/haystack/components/converters/html.py +++ b/haystack/components/converters/html.py @@ -1,6 +1,6 @@ import logging from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Literal from boilerpy3 import extractors from haystack import Document, component @@ -28,6 +28,27 @@ class HTMLToDocument: """ + def __init__( + self, + extractor_type: Literal[ + "DefaultExtractor", + "ArticleExtractor", + "ArticleSentencesExtractor", + "LargestContentExtractor", + "CanolaExtractor", + "KeepEverythingExtractor", + "NumWordsRulesExtractor", + ] = "DefaultExtractor", + ): + """ + Create an HTMLToDocument component. + + :param extractor_type: The type of boilerpy3 extractor to use. Defaults to `DefaultExtractor`. + For more information on the different types of extractors, + see [boilerpy3 documentation](https://github.com/jmriebold/BoilerPy3?tab=readme-ov-file#extractors). 
+ """ + self.extractor_type = extractor_type + @component.output_types(documents=List[Document]) def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None): """ @@ -46,7 +67,8 @@ class HTMLToDocument: elif len(sources) != len(meta): raise ValueError("The length of the metadata list must match the number of sources.") - extractor = extractors.ArticleExtractor(raise_on_failure=False) + extractor_class = getattr(extractors, self.extractor_type) + extractor = extractor_class(raise_on_failure=False) for source, metadata in zip(sources, meta): try: diff --git a/releasenotes/notes/htmlconverter-allow-extractor-customizability-730ae129db17327a.yaml b/releasenotes/notes/htmlconverter-allow-extractor-customizability-730ae129db17327a.yaml new file mode 100644 index 000000000..a77b94750 --- /dev/null +++ b/releasenotes/notes/htmlconverter-allow-extractor-customizability-730ae129db17327a.yaml @@ -0,0 +1,7 @@ +--- +enhancements: + - | + The `HTMLToDocument` converter now allows choosing the boilerpy3 extractor + to extract the content from the HTML document. + The default extractor has been changed to `DefaultExtractor`, which is better + for generic use cases than the previous default (`ArticleExtractor`). diff --git a/test/components/converters/test_html_to_document.py b/test/components/converters/test_html_to_document.py index 437ab9e61..1cdb47812 100644 --- a/test/components/converters/test_html_to_document.py +++ b/test/components/converters/test_html_to_document.py @@ -18,6 +18,20 @@ class TestHTMLToDocument: assert len(docs) == 1 assert "Haystack" in docs[0].content + def test_run_different_extractors(self, test_files_path): + """ + Test if the component runs correctly with different boilerpy3 extractors. 
+ """ + sources = [test_files_path / "html" / "what_is_haystack.html"] + + converter_article = HTMLToDocument(extractor_type="ArticleExtractor") + converter_keep_everything = HTMLToDocument(extractor_type="KeepEverythingExtractor") + + doc_article = converter_article.run(sources=sources)["documents"][0] + doc_keep_everything = converter_keep_everything.run(sources=sources)["documents"][0] + + assert len(doc_keep_everything.content) > len(doc_article.content) + def test_run_doc_metadata(self, test_files_path): """ Test if the component runs correctly when metadata is supplied by the user.