mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-09-22 14:43:07 +00:00

* Rework Document serialisation

Make Document backward compatible
Fix InMemoryDocumentStore filters
Fix InMemoryDocumentStore.bm25_retrieval
Add release notes
Fix pylint failures
Enhance Document kwargs handling and docstrings
Rename Document's text field to content
Fix e2e tests
Fix SimilarityRanker tests
Fix typo in release notes
Rename Document's metadata field to meta (#6183)

* fix bugs
* make linters happy
* fix
* more fix
* match regex

---------

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
48 lines
1.6 KiB
Python
48 lines
1.6 KiB
Python
import logging
|
|
|
|
import pytest
|
|
|
|
from haystack.preview.components.file_converters.pypdf import PyPDFToDocument
|
|
from haystack.preview.dataclasses import ByteStream
|
|
|
|
|
|
class TestPyPDFToDocument:
    """Unit tests for the ``PyPDFToDocument`` converter component."""

    @pytest.mark.unit
    def test_run(self, preview_samples_path):
        """
        Converting a single PDF path yields exactly one document whose
        content contains the string "ReAct".
        """
        pdf_file = preview_samples_path / "pdf" / "react_paper.pdf"

        result = PyPDFToDocument().run(sources=[pdf_file])
        documents = result["documents"]

        assert len(documents) == 1
        assert "ReAct" in documents[0].content

    @pytest.mark.unit
    def test_run_error_handling(self, preview_samples_path, caplog):
        """
        Running the converter on a path that does not exist leaves a
        "Could not read ..." message in the captured log output.
        """
        missing_sources = ["non_existing_file.pdf"]
        component = PyPDFToDocument()

        # Capture records at WARNING level so the message ends up in caplog.text.
        with caplog.at_level(logging.WARNING):
            component.run(sources=missing_sources)
            assert "Could not read non_existing_file.pdf" in caplog.text

    @pytest.mark.unit
    def test_mixed_sources_run(self, preview_samples_path):
        """
        A file path and a ByteStream built from the same PDF can be passed
        together; each source produces its own document containing "ReAct".
        """
        pdf_file = preview_samples_path / "pdf" / "react_paper.pdf"

        # Second source: the raw bytes of the same PDF wrapped in a ByteStream.
        with open(pdf_file, "rb") as handle:
            stream = ByteStream(handle.read())
        mixed_sources = [pdf_file, stream]

        result = PyPDFToDocument().run(sources=mixed_sources)
        documents = result["documents"]

        assert len(documents) == 2
        assert "ReAct" in documents[0].content
        assert "ReAct" in documents[1].content
|