2023-09-25 11:47:21 +02:00
|
|
|
from unittest.mock import patch
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
2023-11-23 10:28:40 +01:00
|
|
|
from haystack.preview.components.converters.tika import TikaDocumentConverter
|
2023-09-25 11:47:21 +02:00
|
|
|
|
|
|
|
|
|
|
|
class TestTikaDocumentConverter:
    """Unit and integration tests for ``TikaDocumentConverter``.

    Unit tests patch ``tika_parser.from_file`` so that no real parsing happens;
    integration tests run the converter on sample files located under the
    ``preview_samples_path`` fixture.
    """

    @pytest.mark.unit
    def test_run(self):
        """A mocked parse result yields exactly one document carrying that content."""
        converter = TikaDocumentConverter()
        with patch(
            "haystack.preview.components.converters.tika.tika_parser.from_file",
            return_value={"content": "Content of mock_file.pdf"},
        ):
            result = converter.run(paths=["mock_file.pdf"])

        docs = result["documents"]
        assert len(docs) == 1
        assert docs[0].content == "Content of mock_file.pdf"

    @pytest.mark.unit
    def test_run_logs_warning_if_content_empty(self, caplog):
        """An empty ``content`` value from the parser is reported in the captured log."""
        converter = TikaDocumentConverter()
        with patch(
            "haystack.preview.components.converters.tika.tika_parser.from_file",
            return_value={"content": ""},
        ), caplog.at_level("WARNING"):
            converter.run(paths=["mock_file.pdf"])
            assert "Skipping file at 'mock_file.pdf' as Tika was not able to extract any content." in caplog.text

    @pytest.mark.unit
    def test_run_logs_error(self, caplog):
        """An exception raised by the parser is reported in the captured log."""
        converter = TikaDocumentConverter()
        with patch(
            "haystack.preview.components.converters.tika.tika_parser.from_file",
            side_effect=Exception("Some error"),
        ), caplog.at_level("ERROR"):
            converter.run(paths=["mock_file.pdf"])
            assert "Could not convert file at 'mock_file.pdf' to Document. Error: Some error" in caplog.text

    @pytest.mark.integration
    def test_run_with_txt_files(self, preview_samples_path):
        """Two sample text files are converted into two documents with the expected text."""
        converter = TikaDocumentConverter()
        txt_dir = preview_samples_path / "txt"
        result = converter.run(paths=[txt_dir / "doc_1.txt", txt_dir / "doc_2.txt"])

        docs = result["documents"]
        assert len(docs) == 2
        assert "Some text for testing.\nTwo lines in here." in docs[0].content
        assert "This is a test line.\n123 456 789\n987 654 321" in docs[1].content

    @pytest.mark.integration
    def test_run_with_pdf_file(self, preview_samples_path):
        """Two sample PDFs are converted into two documents containing known snippets."""
        converter = TikaDocumentConverter()
        pdf_dir = preview_samples_path / "pdf"
        result = converter.run(paths=[pdf_dir / "sample_pdf_1.pdf", pdf_dir / "sample_pdf_2.pdf"])

        docs = result["documents"]
        assert len(docs) == 2

        # Snippets expected in the first PDF, checked in the same order as before.
        for snippet in (
            "A sample PDF file",
            "Page 2 of Sample PDF",
            "Page 4 of Sample PDF",
        ):
            assert snippet in docs[0].content

        # Snippets expected in the second PDF.
        for snippet in (
            "First Page",
            "Wiki engines usually allow content to be written using a simplified markup language",
            "This section needs additional citations for verification.",
            "This would make it easier for other users to find the article.",
        ):
            assert snippet in docs[1].content

    @pytest.mark.integration
    def test_run_with_docx_file(self, preview_samples_path):
        """A sample DOCX file is converted into one document containing known snippets."""
        converter = TikaDocumentConverter()
        docx_path = preview_samples_path / "docx" / "sample_docx.docx"
        result = converter.run(paths=[docx_path])

        docs = result["documents"]
        assert len(docs) == 1
        for snippet in (
            "Sample Docx File",
            "Now we are in Page 2",
            "Page 3 was empty this is page 4",
        ):
            assert snippet in docs[0].content