feat!: HTMLToDocument - allow choosing the boilerpy3 extractor (#6582)

* allow extractor customizability

* release note

* typo
This commit is contained in:
Stefano Fiorucci 2023-12-19 10:52:12 +01:00 committed by GitHub
parent dcf37c5173
commit 94cfe5d9ae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 2 deletions

View File

@ -1,6 +1,6 @@
import logging import logging
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union, Literal
from boilerpy3 import extractors from boilerpy3 import extractors
from haystack import Document, component from haystack import Document, component
@ -28,6 +28,27 @@ class HTMLToDocument:
""" """
def __init__(
    self,
    extractor_type: Literal[
        "DefaultExtractor",
        "ArticleExtractor",
        "ArticleSentencesExtractor",
        "LargestContentExtractor",
        "CanolaExtractor",
        "KeepEverythingExtractor",
        "NumWordsRulesExtractor",
    ] = "DefaultExtractor",
):
    """
    Create an HTMLToDocument component.

    :param extractor_type: Name of the boilerpy3 extractor class to use. Defaults to `DefaultExtractor`.
        For more information on the different types of extractors,
        see [boilerpy3 documentation](https://github.com/jmriebold/BoilerPy3?tab=readme-ov-file#extractors).
    :raises ValueError: If `extractor_type` is not one of the supported extractor names.
    """
    # The name is looked up with `getattr` on the boilerpy3 `extractors` module when the
    # component runs, so reject unknown names here instead of failing later with an
    # AttributeError (or silently resolving some unrelated module attribute).
    supported_extractors = (
        "DefaultExtractor",
        "ArticleExtractor",
        "ArticleSentencesExtractor",
        "LargestContentExtractor",
        "CanolaExtractor",
        "KeepEverythingExtractor",
        "NumWordsRulesExtractor",
    )
    if extractor_type not in supported_extractors:
        raise ValueError(
            f"Unknown extractor_type '{extractor_type}'. "
            f"Supported values are: {', '.join(supported_extractors)}."
        )
    self.extractor_type = extractor_type
@component.output_types(documents=List[Document]) @component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None): def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
""" """
@ -46,7 +67,8 @@ class HTMLToDocument:
elif len(sources) != len(meta): elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.") raise ValueError("The length of the metadata list must match the number of sources.")
extractor = extractors.ArticleExtractor(raise_on_failure=False) extractor_class = getattr(extractors, self.extractor_type)
extractor = extractor_class(raise_on_failure=False)
for source, metadata in zip(sources, meta): for source, metadata in zip(sources, meta):
try: try:

View File

@ -0,0 +1,7 @@
---
enhancements:
- |
The `HTMLToDocument` converter now allows choosing the boilerpy3 extractor
to extract the content from the HTML document.
The default extractor has been changed to `DefaultExtractor`, which is better
for generic use cases than the previous default (`ArticleExtractor`).

View File

@ -18,6 +18,20 @@ class TestHTMLToDocument:
assert len(docs) == 1 assert len(docs) == 1
assert "Haystack" in docs[0].content assert "Haystack" in docs[0].content
def test_run_different_extractors(self, test_files_path):
    """
    Test that the selected boilerpy3 extractor affects the converted content.

    The same HTML file is converted with `ArticleExtractor` and with
    `KeepEverythingExtractor`; the latter is expected to yield longer content.
    """
    sources = [test_files_path / "html" / "what_is_haystack.html"]

    def first_document(extractor_type):
        # Build a converter for the given extractor and return its first output document.
        converter = HTMLToDocument(extractor_type=extractor_type)
        return converter.run(sources=sources)["documents"][0]

    article_doc = first_document("ArticleExtractor")
    everything_doc = first_document("KeepEverythingExtractor")

    assert len(everything_doc.content) > len(article_doc.content)
def test_run_doc_metadata(self, test_files_path): def test_run_doc_metadata(self, test_files_path):
""" """
Test if the component runs correctly when metadata is supplied by the user. Test if the component runs correctly when metadata is supplied by the user.