mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-27 18:06:17 +00:00
feat!: HTMLToDocument
- allow choosing the boilerpy3 extractor (#6582)
* allow extractor customizability * release note * typo
This commit is contained in:
parent
dcf37c5173
commit
94cfe5d9ae
@ -1,6 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Dict, List, Optional, Union, Literal
|
||||||
from boilerpy3 import extractors
|
from boilerpy3 import extractors
|
||||||
|
|
||||||
from haystack import Document, component
|
from haystack import Document, component
|
||||||
@ -28,6 +28,27 @@ class HTMLToDocument:
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
extractor_type: Literal[
|
||||||
|
"DefaultExtractor",
|
||||||
|
"ArticleExtractor",
|
||||||
|
"ArticleSentencesExtractor",
|
||||||
|
"LargestContentExtractor",
|
||||||
|
"CanolaExtractor",
|
||||||
|
"KeepEverythingExtractor",
|
||||||
|
"NumWordsRulesExtractor",
|
||||||
|
] = "DefaultExtractor",
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Create an HTMLToDocument component.
|
||||||
|
|
||||||
|
:param extractor_type: The type of boilerpy3 extractor to use. Defaults to `DefaultExtractor`.
|
||||||
|
For more information on the different types of extractors,
|
||||||
|
see [boilerpy3 documentation](https://github.com/jmriebold/BoilerPy3?tab=readme-ov-file#extractors).
|
||||||
|
"""
|
||||||
|
self.extractor_type = extractor_type
|
||||||
|
|
||||||
@component.output_types(documents=List[Document])
|
@component.output_types(documents=List[Document])
|
||||||
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
|
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
|
||||||
"""
|
"""
|
||||||
@ -46,7 +67,8 @@ class HTMLToDocument:
|
|||||||
elif len(sources) != len(meta):
|
elif len(sources) != len(meta):
|
||||||
raise ValueError("The length of the metadata list must match the number of sources.")
|
raise ValueError("The length of the metadata list must match the number of sources.")
|
||||||
|
|
||||||
extractor = extractors.ArticleExtractor(raise_on_failure=False)
|
extractor_class = getattr(extractors, self.extractor_type)
|
||||||
|
extractor = extractor_class(raise_on_failure=False)
|
||||||
|
|
||||||
for source, metadata in zip(sources, meta):
|
for source, metadata in zip(sources, meta):
|
||||||
try:
|
try:
|
||||||
|
@ -0,0 +1,7 @@
|
|||||||
|
---
|
||||||
|
enhancements:
|
||||||
|
- |
|
||||||
|
The `HTMLToDocument` converter now allows choosing the boilerpy3 extractor
|
||||||
|
to extract the content from the HTML document.
|
||||||
|
The default extractor has been changed to `DefaultExtractor`, which is better
|
||||||
|
for generic use cases than the previous default (`ArticleExtractor`).
|
@ -18,6 +18,20 @@ class TestHTMLToDocument:
|
|||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
assert "Haystack" in docs[0].content
|
assert "Haystack" in docs[0].content
|
||||||
|
|
||||||
|
def test_run_different_extractors(self, test_files_path):
|
||||||
|
"""
|
||||||
|
Test if the component runs correctly with different boilerpy3 extractors.
|
||||||
|
"""
|
||||||
|
sources = [test_files_path / "html" / "what_is_haystack.html"]
|
||||||
|
|
||||||
|
converter_article = HTMLToDocument(extractor_type="ArticleExtractor")
|
||||||
|
converter_keep_everything = HTMLToDocument(extractor_type="KeepEverythingExtractor")
|
||||||
|
|
||||||
|
doc_article = converter_article.run(sources=sources)["documents"][0]
|
||||||
|
doc_keep_everything = converter_keep_everything.run(sources=sources)["documents"][0]
|
||||||
|
|
||||||
|
assert len(doc_keep_everything.content) > len(doc_article.content)
|
||||||
|
|
||||||
def test_run_doc_metadata(self, test_files_path):
|
def test_run_doc_metadata(self, test_files_path):
|
||||||
"""
|
"""
|
||||||
Test if the component runs correctly when metadata is supplied by the user.
|
Test if the component runs correctly when metadata is supplied by the user.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user