feat: Add HTMLToDocument component (v2) (#5907)

This commit is contained in:
Vladimir Blagojevic 2023-09-28 17:22:28 +02:00 committed by GitHub
parent dfa48eece9
commit e882a7d5c8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 1780 additions and 1 deletions

View File

@ -1,5 +1,13 @@
from haystack.preview.components.file_converters.txt import TextFileToDocument
from haystack.preview.components.file_converters.tika import TikaDocumentConverter
from haystack.preview.components.file_converters.azure import AzureOCRDocumentConverter
from haystack.preview.components.file_converters.pypdf import PyPDFToDocument
from haystack.preview.components.file_converters.html import HTMLToDocument
__all__ = ["TextFileToDocument", "TikaDocumentConverter", "AzureOCRDocumentConverter"]
# Names exported by this package on `from ... import *`; each entry matches one of the
# converter classes imported above.
__all__ = [
"TextFileToDocument",
"TikaDocumentConverter",
"AzureOCRDocumentConverter",
"PyPDFToDocument",
"HTMLToDocument",
]

View File

@ -0,0 +1,70 @@
import logging
from typing import List, Optional, Dict, Any, Union
from pathlib import Path
from haystack.preview.lazy_imports import LazyImport
from haystack.preview import Document, component, default_to_dict, default_from_dict
with LazyImport("Run 'pip install boilerpy3'") as boilerpy3_import:
from boilerpy3 import extractors
logger = logging.getLogger(__name__)
@component
class HTMLToDocument:
    """
    Converts HTML files into Documents.

    The textual content of every file is extracted with boilerpy3's `ArticleExtractor`, and one Document
    is created for each file that could be both read and processed.
    """

    def __init__(self, id_hash_keys: Optional[List[str]] = None):
        """
        Initialize the converter.

        :param id_hash_keys: Names of the Document attributes from which the Document ID is generated.
            Default: `None`, which is stored as an empty list.
        """
        # Check the lazily imported boilerpy3 dependency before anything else is set up.
        boilerpy3_import.check()
        self.id_hash_keys = id_hash_keys if id_hash_keys else []

    def to_dict(self) -> Dict[str, Any]:
        """
        Return the dictionary representation of this component.
        """
        init_parameters = {"id_hash_keys": self.id_hash_keys}
        return default_to_dict(self, **init_parameters)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument":
        """
        Build a component instance from its dictionary representation.
        """
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
    def run(self, paths: List[Union[str, Path]]):
        """
        Turn each HTML file into a Document.

        A file that cannot be read, or whose content cannot be extracted, is skipped and a warning is logged.

        :param paths: Paths of the HTML files to convert.
        :return: A dictionary whose `documents` key holds the list of created Documents.
        """
        article_extractor = extractors.ArticleExtractor(raise_on_failure=False)
        converted = []

        for html_path in paths:
            try:
                raw_html = article_extractor.read_from_file(html_path)
            except Exception as read_error:
                logger.warning("Could not read file %s. Skipping it. Error message: %s", html_path, read_error)
                continue

            # Even with raise_on_failure=False the extractor may still raise, so guard the extraction too.
            try:
                extracted_text = article_extractor.get_content(raw_html)
            except Exception as extraction_error:
                logger.warning(
                    "Could not extract raw txt from %s. Skipping it. Error message: %s", html_path, extraction_error
                )
            else:
                converted.append(Document(text=extracted_text, id_hash_keys=self.id_hash_keys))

        return {"documents": converted}

View File

@ -0,0 +1,4 @@
---
preview:
- |
Adds HTMLToDocument component to convert HTML to a Document.

View File

@ -0,0 +1,63 @@
import logging
import pytest
from haystack.preview.components.file_converters import HTMLToDocument
class TestHTMLToDocument:
    @pytest.mark.unit
    def test_to_dict(self):
        expected = {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": []}}
        serialized = HTMLToDocument().to_dict()
        assert serialized == expected

    @pytest.mark.unit
    def test_to_dict_with_custom_init_parameters(self):
        expected = {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": ["name"]}}
        serialized = HTMLToDocument(id_hash_keys=["name"]).to_dict()
        assert serialized == expected

    @pytest.mark.unit
    def test_from_dict(self):
        serialized = {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": ["name"]}}
        restored = HTMLToDocument.from_dict(serialized)
        assert restored.id_hash_keys == ["name"]

    @pytest.mark.unit
    def test_run(self, preview_samples_path):
        """
        A valid HTML sample is converted into exactly one Document containing the expected text.
        """
        html_file = preview_samples_path / "html" / "what_is_haystack.html"
        result = HTMLToDocument().run(paths=[html_file])
        documents = result["documents"]
        assert len(documents) == 1
        assert "Haystack" in documents[0].text

    @pytest.mark.unit
    def test_run_wrong_file_type(self, preview_samples_path, caplog):
        """
        A non-HTML input (an audio file) yields no Documents and logs a decoding warning.
        """
        audio_file = preview_samples_path / "audio" / "answer.wav"
        html_converter = HTMLToDocument()
        with caplog.at_level(logging.WARNING):
            result = html_converter.run(paths=[audio_file])
            assert "codec can't decode byte" in caplog.text

        assert result["documents"] == []

    @pytest.mark.unit
    def test_run_error_handling(self, preview_samples_path, caplog):
        """
        A path that does not exist yields no Documents and logs a read warning.
        """
        missing_files = ["non_existing_file.html"]
        html_converter = HTMLToDocument()
        with caplog.at_level(logging.WARNING):
            outcome = html_converter.run(paths=missing_files)
            assert "Could not read file non_existing_file.html" in caplog.text
            assert outcome["documents"] == []

File diff suppressed because it is too large Load Diff