# Source: haystack/test/components/converters/test_markdown_to_document.py
# (Python; 91 lines, 3.4 KiB in the original repository listing)

import logging
from unittest.mock import patch
import pytest
# (blame timestamp from page export: 2023-11-24 14:48:43 +01:00)
from haystack.components.converters.markdown import MarkdownToDocument
from haystack.dataclasses import ByteStream
# (blame timestamp from page export: 2023-11-29 19:24:25 +01:00)
@pytest.mark.integration
class TestMarkdownToDocument:
    """Tests for the ``MarkdownToDocument`` converter component.

    The class-level ``integration`` mark already applies to every test method
    below, so the individual methods do not repeat it.
    """

    def test_init_params_default(self):
        """A converter built without arguments exposes the default settings."""
        converter = MarkdownToDocument()
        assert converter.table_to_single_line is False
        assert converter.progress_bar is True

    def test_init_params_custom(self):
        """Values passed to the constructor are stored on the instance."""
        converter = MarkdownToDocument(table_to_single_line=True, progress_bar=False)
        assert converter.table_to_single_line is True
        assert converter.progress_bar is False

    def test_run(self, test_files_path):
        """Converting the sample markdown file yields one document with the expected text."""
        converter = MarkdownToDocument()
        sources = [test_files_path / "markdown" / "sample.md"]

        results = converter.run(sources=sources)
        docs = results["documents"]

        assert len(docs) == 1
        for doc in docs:
            assert "What to build with Haystack" in doc.content
            assert "# git clone https://github.com/deepset-ai/haystack.git" in doc.content

    def test_run_with_meta(self):
        """The ``meta`` argument is merged over the metadata carried by the ByteStream."""
        bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
        converter = MarkdownToDocument()

        # MarkdownIt is patched out: only metadata handling is under test here.
        with patch("haystack.components.converters.markdown.MarkdownIt"):
            output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
            document = output["documents"][0]

        # check that the metadata from the bytestream is merged with that from the meta parameter
        assert document.meta == {"author": "test_author", "language": "it"}

    def test_run_wrong_file_type(self, test_files_path, caplog):
        """
        Test if the component runs correctly when an input file is not of the expected type.
        """
        sources = [test_files_path / "audio" / "answer.wav"]
        converter = MarkdownToDocument()

        with caplog.at_level(logging.WARNING):
            output = converter.run(sources=sources)
            assert "codec can't decode byte" in caplog.text

        docs = output["documents"]
        assert not docs

    def test_run_error_handling(self, caplog):
        """
        Test if the component correctly handles errors.
        """
        sources = ["non_existing_file.md"]
        converter = MarkdownToDocument()

        with caplog.at_level(logging.WARNING):
            result = converter.run(sources=sources)
            assert "Could not read non_existing_file.md" in caplog.text
            assert not result["documents"]

    def test_mixed_sources_run(self, test_files_path):
        """
        Test if the component runs correctly if the input is a mix of strings, paths and ByteStreams.
        """
        sample_path = test_files_path / "markdown" / "sample.md"
        # The same file is supplied three ways: path object, absolute path string, in-memory bytes.
        sources = [
            sample_path,
            str(sample_path.absolute()),
            ByteStream(sample_path.read_bytes()),
        ]

        converter = MarkdownToDocument()
        output = converter.run(sources=sources)
        docs = output["documents"]

        assert len(docs) == 3
        for doc in docs:
            assert "What to build with Haystack" in doc.content
            assert "# git clone https://github.com/deepset-ai/haystack.git" in doc.content