2024-05-09 15:40:36 +02:00
|
|
|
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
|
|
#
|
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
2025-05-26 17:22:51 +01:00
|
|
|
|
2023-09-28 17:22:28 +02:00
|
|
|
import logging
|
2024-04-05 16:02:34 +02:00
|
|
|
from pathlib import Path
|
2023-09-28 17:22:28 +02:00
|
|
|
|
|
|
|
import pytest
|
2024-07-31 10:59:53 +02:00
|
|
|
from unittest.mock import patch
|
2023-09-28 17:22:28 +02:00
|
|
|
|
2023-11-24 14:48:43 +01:00
|
|
|
from haystack.components.converters import HTMLToDocument
|
|
|
|
from haystack.dataclasses import ByteStream
|
2023-09-28 17:22:28 +02:00
|
|
|
|
|
|
|
|
|
|
|
class TestHTMLToDocument:
    """Tests for the ``HTMLToDocument`` converter component."""

    def test_run(self, test_files_path):
        """
        Test if the component runs correctly.

        A single HTML file must produce exactly one document whose content mentions "Haystack" and whose
        metadata contains the dictionary passed through ``meta``.
        """
        sources = [test_files_path / "html" / "what_is_haystack.html"]
        converter = HTMLToDocument()
        results = converter.run(sources=sources, meta={"test": "TEST"})
        docs = results["documents"]
        assert len(docs) == 1
        assert "Haystack" in docs[0].content
        assert docs[0].meta["test"] == "TEST"

    def test_run_doc_metadata(self, test_files_path):
        """
        Test if the component runs correctly when metadata is supplied by the user.

        Here ``meta`` is a list holding one dictionary for the single source.
        """
        converter = HTMLToDocument()
        sources = [test_files_path / "html" / "what_is_haystack.html"]
        metadata = [{"file_name": "what_is_haystack.html"}]
        results = converter.run(sources=sources, meta=metadata)
        docs = results["documents"]

        assert len(docs) == 1
        assert "Haystack" in docs[0].content
        assert docs[0].meta["file_name"] == "what_is_haystack.html"

    def test_run_with_store_full_path(self, test_files_path):
        """
        Test how the ``store_full_path`` init parameter affects the ``file_path`` metadata entry.

        With ``store_full_path=True`` the entry must equal the source path as a string; with
        ``store_full_path=False`` it must be only the file name.
        """
        converter = HTMLToDocument(store_full_path=True)
        sources = [test_files_path / "html" / "what_is_haystack.html"]

        results = converter.run(sources=sources)  # converter was constructed with store_full_path=True
        docs = results["documents"]

        assert len(docs) == 1
        assert "Haystack" in docs[0].content
        assert docs[0].meta["file_path"] == str(sources[0])

        converter_2 = HTMLToDocument(store_full_path=False)
        results = converter_2.run(sources=sources)
        docs = results["documents"]

        assert len(docs) == 1
        assert "Haystack" in docs[0].content
        assert docs[0].meta["file_path"] == "what_is_haystack.html"

    def test_incorrect_meta(self, test_files_path):
        """
        Test if the component raises an error when incorrect metadata is supplied by the user.

        Two metadata dictionaries are passed for one source, which must raise a ``ValueError``.
        """
        converter = HTMLToDocument()
        sources = [test_files_path / "html" / "what_is_haystack.html"]
        metadata = [{"file_name": "what_is_haystack.html"}, {"file_name": "haystack.html"}]
        with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
            converter.run(sources=sources, meta=metadata)

    def test_run_bytestream_metadata(self, test_files_path):
        """
        Test if the component runs correctly when metadata is read from the ByteStream object.

        No ``meta`` is passed to ``run``; the document metadata must equal the ByteStream's own metadata.
        """
        converter = HTMLToDocument()
        with open(test_files_path / "html" / "what_is_haystack.html", "rb") as file:
            byte_stream = file.read()
            stream = ByteStream(byte_stream, meta={"content_type": "text/html", "url": "test_url"})

        results = converter.run(sources=[stream])
        docs = results["documents"]

        assert len(docs) == 1
        assert "Haystack" in docs[0].content
        assert docs[0].meta == {"content_type": "text/html", "url": "test_url"}

    def test_run_bytestream_and_doc_metadata(self, test_files_path):
        """
        Test if the component runs correctly when metadata is read from the ByteStream object and supplied by the user.

        There is no overlap between the metadata received, so the document must carry the union of both.
        """
        converter = HTMLToDocument()
        with open(test_files_path / "html" / "what_is_haystack.html", "rb") as file:
            byte_stream = file.read()
            stream = ByteStream(byte_stream, meta={"content_type": "text/html", "url": "test_url"})

        metadata = [{"file_name": "what_is_haystack.html"}]
        results = converter.run(sources=[stream], meta=metadata)
        docs = results["documents"]

        assert len(docs) == 1
        assert "Haystack" in docs[0].content
        assert docs[0].meta == {"file_name": "what_is_haystack.html", "content_type": "text/html", "url": "test_url"}

    def test_run_bytestream_doc_overlapping_metadata(self, test_files_path):
        """
        Test if the component runs correctly when metadata is read from the ByteStream object and supplied by the user.

        There is an overlap between the metadata received.

        The component should use the supplied metadata to overwrite the values if there is an overlap between the keys.
        """
        converter = HTMLToDocument()
        with open(test_files_path / "html" / "what_is_haystack.html", "rb") as file:
            byte_stream = file.read()
            # ByteStream has "url" present in metadata
            stream = ByteStream(byte_stream, meta={"content_type": "text/html", "url": "test_url_correct"})

        # "url" supplied by the user overwrites value present in metadata
        metadata = [{"file_name": "what_is_haystack.html", "url": "test_url_new"}]
        results = converter.run(sources=[stream], meta=metadata)
        docs = results["documents"]

        assert len(docs) == 1
        assert "Haystack" in docs[0].content
        assert docs[0].meta == {
            "file_name": "what_is_haystack.html",
            "content_type": "text/html",
            "url": "test_url_new",
        }

    def test_run_wrong_file_type(self, test_files_path, caplog):
        """
        Test if the component runs correctly when an input file is not of the expected type.

        A WAV file is passed as the source; a warning must be logged and no documents returned.
        """
        sources = [test_files_path / "audio" / "answer.wav"]
        converter = HTMLToDocument()
        with caplog.at_level(logging.WARNING):
            results = converter.run(sources=sources)
            assert "Failed to extract text from" in caplog.text

        assert results["documents"] == []

    def test_run_error_handling(self, caplog):
        """
        Test if the component correctly handles errors.

        A path to a file that does not exist must log a warning and yield no documents instead of raising.
        """
        sources = ["non_existing_file.html"]
        converter = HTMLToDocument()
        with caplog.at_level(logging.WARNING):
            results = converter.run(sources=sources)
            assert "Could not read non_existing_file.html" in caplog.text
            assert results["documents"] == []

    def test_mixed_sources_run(self, test_files_path):
        """
        Test if the component runs correctly if the input is a mix of paths and ByteStreams.

        The same HTML file is supplied as a path object, as an absolute path string, and as a ByteStream;
        each of the three sources must yield a document.
        """
        sources = [
            test_files_path / "html" / "what_is_haystack.html",
            str((test_files_path / "html" / "what_is_haystack.html").absolute()),
        ]
        with open(test_files_path / "html" / "what_is_haystack.html", "rb") as f:
            byte_stream = f.read()
            sources.append(ByteStream(byte_stream))

        converter = HTMLToDocument()
        results = converter.run(sources=sources)
        docs = results["documents"]
        assert len(docs) == 3
        for doc in docs:
            assert "Haystack" in doc.content

    def test_serde(self):
        """
        Test if the component is serialized and deserialized correctly.

        The round trip through ``to_dict`` / ``from_dict`` must preserve ``extraction_kwargs``.
        """
        converter = HTMLToDocument()
        serde_data = converter.to_dict()
        new_converter = HTMLToDocument.from_dict(serde_data)
        assert new_converter.extraction_kwargs == converter.extraction_kwargs

    def test_run_difficult_html(self, test_files_path):
        """
        Test if the component extracts text from the ``paul_graham_superlinear.html`` fixture.

        Exactly one document must be produced and its content must mention "Superlinear".
        """
        converter = HTMLToDocument()
        result = converter.run(sources=[Path(test_files_path / "html" / "paul_graham_superlinear.html")])

        assert len(result["documents"]) == 1
        assert "Superlinear" in result["documents"][0].content

    @patch("haystack.components.converters.html.extract", return_value="test")
    def test_run_with_extraction_kwargs(self, mock_extract, test_files_path):
        """
        Test if ``extraction_kwargs`` are forwarded as keyword arguments to the patched ``extract`` function.

        Without ``extraction_kwargs`` the call must not contain ``favor_precision``; when the converter is
        created with ``{"favor_precision": True}`` the call must receive ``favor_precision=True``.
        """
        sources = [test_files_path / "html" / "what_is_haystack.html"]

        converter = HTMLToDocument()
        converter.run(sources=sources)
        assert mock_extract.call_count == 1
        # call_args[1] is the dict of keyword arguments of the most recent call
        assert "favor_precision" not in mock_extract.call_args[1]

        precise_converter = HTMLToDocument(extraction_kwargs={"favor_precision": True})
        # Clear the recorded call so the count below reflects only the second converter
        mock_extract.reset_mock()
        precise_converter.run(sources=sources)
        assert mock_extract.call_count == 1
        assert mock_extract.call_args[1]["favor_precision"] is True
|