mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-06 03:57:19 +00:00
feat: Update HTMLToDocument to handle ByteStream inputs (#6020)
* Update HTML converter * Add mixed source unit test * Update haystack/preview/components/file_converters/html.py Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
This commit is contained in:
parent
12fe0364dc
commit
1a6a8863e8
@ -2,69 +2,77 @@ import logging
|
||||
from typing import List, Optional, Dict, Any, Union
|
||||
from pathlib import Path
|
||||
|
||||
from haystack.preview.lazy_imports import LazyImport
|
||||
from haystack.preview import Document, component, default_to_dict, default_from_dict
|
||||
from haystack.preview.dataclasses import ByteStream
|
||||
from haystack.preview.lazy_imports import LazyImport
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
with LazyImport("Run 'pip install boilerpy3'") as boilerpy3_import:
|
||||
from boilerpy3 import extractors
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@component
|
||||
class HTMLToDocument:
|
||||
"""
|
||||
A component for converting an HTML file to a Document.
|
||||
Converts an HTML file to a Document.
|
||||
"""
|
||||
|
||||
def __init__(self, id_hash_keys: Optional[List[str]] = None):
|
||||
"""
|
||||
Create a HTMLToDocument component.
|
||||
Initializes the HTMLToDocument component.
|
||||
|
||||
:param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
|
||||
attributes. Default: `None`
|
||||
:param id_hash_keys: List of strings referencing the Document's attributes to generate its ID. Default: `None`
|
||||
"""
|
||||
boilerpy3_import.check()
|
||||
self.id_hash_keys = id_hash_keys or []
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Serialize this component to a dictionary.
|
||||
"""
|
||||
"""Serialize the component to a dictionary."""
|
||||
return default_to_dict(self, id_hash_keys=self.id_hash_keys)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument":
|
||||
"""
|
||||
Deserialize this component from a dictionary.
|
||||
"""
|
||||
"""Deserialize the component from a dictionary."""
|
||||
return default_from_dict(cls, data)
|
||||
|
||||
@component.output_types(documents=List[Document])
|
||||
def run(self, paths: List[Union[str, Path]]):
|
||||
def run(self, sources: List[Union[str, Path, ByteStream]]):
|
||||
"""
|
||||
Convert HTML files to Documents.
|
||||
Converts a list of HTML files to Documents.
|
||||
|
||||
:param paths: A list of paths to HTML files.
|
||||
:return: A list of Documents.
|
||||
:param sources: Paths to HTML files, or ByteStream objects containing HTML content.
|
||||
:return: List of converted Documents.
|
||||
"""
|
||||
documents = []
|
||||
extractor = extractors.ArticleExtractor(raise_on_failure=False)
|
||||
for path in paths:
|
||||
for source in sources:
|
||||
try:
|
||||
file_content = extractor.read_from_file(path)
|
||||
file_content = self._extract_content(source)
|
||||
except Exception as e:
|
||||
logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e)
|
||||
logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
|
||||
continue
|
||||
# although raise_on_failure is set to False, the extractor can still raise an exception
|
||||
try:
|
||||
text = extractor.get_content(file_content)
|
||||
except Exception as conversion_e:
|
||||
logger.warning("Could not extract raw txt from %s. Skipping it. Error message: %s", path, conversion_e)
|
||||
except Exception as conversion_e: # Consider specifying the expected exception type(s) here
|
||||
logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
|
||||
continue
|
||||
|
||||
document = Document(text=text, id_hash_keys=self.id_hash_keys)
|
||||
documents.append(document)
|
||||
|
||||
return {"documents": documents}
|
||||
|
||||
def _extract_content(self, source: Union[str, Path, ByteStream]) -> str:
    """
    Read the raw text content of a single source.

    Errors raised while opening, reading or decoding the source are not caught here.

    :param source: A path to a file (`str` or `Path`) or a `ByteStream` whose `data` holds the bytes to decode.
    :return: The content of the source decoded as UTF-8 text.
    :raises ValueError: If `source` is not a `str`, `Path` or `ByteStream`.
    """
    if isinstance(source, (str, Path)):
        # Decode explicitly as UTF-8: without `encoding`, open() uses the platform's locale
        # encoding, so the same file could decode differently from the ByteStream branch below.
        with open(source, encoding="utf-8") as text_file:
            return text_file.read()
    if isinstance(source, ByteStream):
        return source.data.decode("utf-8")

    raise ValueError(f"Unsupported source type: {type(source)}")
|
||||
|
||||
@ -3,6 +3,7 @@ import logging
|
||||
import pytest
|
||||
|
||||
from haystack.preview.components.file_converters import HTMLToDocument
|
||||
from haystack.preview.dataclasses import ByteStream
|
||||
|
||||
|
||||
class TestHTMLToDocument:
|
||||
@ -31,7 +32,7 @@ class TestHTMLToDocument:
|
||||
"""
|
||||
paths = [preview_samples_path / "html" / "what_is_haystack.html"]
|
||||
converter = HTMLToDocument()
|
||||
output = converter.run(paths=paths)
|
||||
output = converter.run(sources=paths)
|
||||
docs = output["documents"]
|
||||
assert len(docs) == 1
|
||||
assert "Haystack" in docs[0].text
|
||||
@ -44,7 +45,7 @@ class TestHTMLToDocument:
|
||||
paths = [preview_samples_path / "audio" / "answer.wav"]
|
||||
converter = HTMLToDocument()
|
||||
with caplog.at_level(logging.WARNING):
|
||||
output = converter.run(paths=paths)
|
||||
output = converter.run(sources=paths)
|
||||
assert "codec can't decode byte" in caplog.text
|
||||
|
||||
docs = output["documents"]
|
||||
@ -58,6 +59,23 @@ class TestHTMLToDocument:
|
||||
paths = ["non_existing_file.html"]
|
||||
converter = HTMLToDocument()
|
||||
with caplog.at_level(logging.WARNING):
|
||||
result = converter.run(paths=paths)
|
||||
assert "Could not read file non_existing_file.html" in caplog.text
|
||||
result = converter.run(sources=paths)
|
||||
assert "Could not read non_existing_file.html" in caplog.text
|
||||
assert result["documents"] == []
|
||||
|
||||
@pytest.mark.unit
def test_mixed_sources_run(self, preview_samples_path):
    """
    Check that one run accepts a file path and a ByteStream side by side and converts both.
    """
    html_path = preview_samples_path / "html" / "what_is_haystack.html"
    with open(html_path, "rb") as html_file:
        raw_html = html_file.read()
    # Same HTML sample twice: once as a path, once as in-memory bytes.
    sources = [html_path, ByteStream(raw_html)]

    result = HTMLToDocument().run(sources=sources)
    converted = result["documents"]

    assert len(converted) == 2
    for document in converted:
        assert "Haystack" in document.text
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user