haystack/test/preview/components/file_converters/test_tika_doc_converter.py
Silvano Cerza 7287657f0e
refactor: Rename Document's text field to content (#6181)
* Rework Document serialisation

Make Document backward compatible

Fix InMemoryDocumentStore filters

Fix InMemoryDocumentStore.bm25_retrieval

Add release notes

Fix pylint failures

Enhance Document kwargs handling and docstrings

Rename Document's text field to content

Fix e2e tests

Fix SimilarityRanker tests

Fix typo in release notes

Rename Document's metadata field to meta (#6183)

* fix bugs

* make linters happy

* fix

* more fix

* match regex

---------

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
2023-10-31 12:44:04 +01:00

76 lines
3.6 KiB
Python

from unittest.mock import patch
import pytest
from haystack.preview.components.file_converters.tika import TikaDocumentConverter
class TestTikaDocumentConverter:
    """Unit and integration tests for ``TikaDocumentConverter``."""

    # Dotted path of the Tika parser function that the unit tests replace with a mock.
    _FROM_FILE_TARGET = "haystack.preview.components.file_converters.tika.tika_parser.from_file"

    @pytest.mark.unit
    def test_run(self):
        """The content returned by the mocked parser ends up in a single document."""
        converter = TikaDocumentConverter()
        parsed = {"content": "Content of mock_file.pdf"}
        with patch(self._FROM_FILE_TARGET, return_value=parsed):
            result = converter.run(paths=["mock_file.pdf"])

        docs = result["documents"]
        assert len(docs) == 1
        assert docs[0].content == "Content of mock_file.pdf"

    @pytest.mark.unit
    def test_run_logs_warning_if_content_empty(self, caplog):
        """A warning naming the file is logged when the mocked parser returns empty content."""
        converter = TikaDocumentConverter()
        with patch(self._FROM_FILE_TARGET, return_value={"content": ""}), caplog.at_level("WARNING"):
            converter.run(paths=["mock_file.pdf"])

        expected_message = "Skipping file at 'mock_file.pdf' as Tika was not able to extract any content."
        assert expected_message in caplog.text

    @pytest.mark.unit
    def test_run_logs_error(self, caplog):
        """An error naming the file and the exception text is logged when the mocked parser raises."""
        converter = TikaDocumentConverter()
        failure = Exception("Some error")
        with patch(self._FROM_FILE_TARGET, side_effect=failure), caplog.at_level("ERROR"):
            converter.run(paths=["mock_file.pdf"])

        expected_message = "Could not convert file at 'mock_file.pdf' to Document. Error: Some error"
        assert expected_message in caplog.text

    @pytest.mark.integration
    def test_run_with_txt_files(self, preview_samples_path):
        """Two sample text files yield two documents containing the expected lines."""
        txt_dir = preview_samples_path / "txt"
        converter = TikaDocumentConverter()
        result = converter.run(paths=[txt_dir / "doc_1.txt", txt_dir / "doc_2.txt"])

        docs = result["documents"]
        assert len(docs) == 2
        first, second = docs
        assert "Some text for testing.\nTwo lines in here." in first.content
        assert "This is a test line.\n123 456 789\n987 654 321" in second.content

    @pytest.mark.integration
    def test_run_with_pdf_file(self, preview_samples_path):
        """Two sample PDFs yield two documents containing the expected snippets."""
        pdf_dir = preview_samples_path / "pdf"
        converter = TikaDocumentConverter()
        result = converter.run(paths=[pdf_dir / "sample_pdf_1.pdf", pdf_dir / "sample_pdf_2.pdf"])

        docs = result["documents"]
        assert len(docs) == 2
        first, second = docs

        # Snippets expected in the first sample PDF.
        for snippet in ("A sample PDF file", "Page 2 of Sample PDF", "Page 4 of Sample PDF"):
            assert snippet in first.content

        # Snippets expected in the second sample PDF.
        for snippet in (
            "First Page",
            "Wiki engines usually allow content to be written using a simplified markup language",
            "This section needs additional citations for verification.",
            "This would make it easier for other users to find the article.",
        ):
            assert snippet in second.content

    @pytest.mark.integration
    def test_run_with_docx_file(self, preview_samples_path):
        """The sample DOCX file yields one document containing the expected snippets."""
        docx_path = preview_samples_path / "docx" / "sample_docx.docx"
        converter = TikaDocumentConverter()
        result = converter.run(paths=[docx_path])

        docs = result["documents"]
        assert len(docs) == 1
        for snippet in ("Sample Docx File", "Now we are in Page 2", "Page 3 was empty this is page 4"):
            assert snippet in docs[0].content