haystack/test/preview/components/file_converters/test_pypdf_to_document.py
Julian Risch 9f3b6512be
refactor: Remove reimplementations of default from_dict/to_dict and corresponding tests in 2.0 (#6108)
* whisper transcriber

* remove from/to_dict from builders

* remove from/to_dict from embedders

* remove from/to_dict from fetcher, file_converters

* remove from/to_dict from generators, preprocessors

* remove from/to_dict from ranker, reader

* remove from/to_dict from router, sampler, websearch

* pylint

* reno

* refactor import

* remove unused import
2023-10-19 11:17:02 +02:00

48 lines
1.6 KiB
Python

import logging
import pytest
from haystack.preview.components.file_converters.pypdf import PyPDFToDocument
from haystack.preview.dataclasses import ByteStream
class TestPyPDFToDocument:
    """
    Unit tests for PyPDFToDocument.run with file paths and ByteStream sources.
    """

    @pytest.mark.unit
    def test_run(self, preview_samples_path):
        """
        Check that converting a single sample PDF path yields exactly one
        document whose text mentions "ReAct".
        """
        pdf_file = preview_samples_path / "pdf" / "react_paper.pdf"
        result = PyPDFToDocument().run(sources=[pdf_file])
        documents = result["documents"]
        assert len(documents) == 1
        assert "ReAct" in documents[0].text

    @pytest.mark.unit
    def test_run_error_handling(self, preview_samples_path, caplog):
        """
        Check that a source which cannot be read is reported through the log.
        """
        # NOTE: preview_samples_path is requested but not used by this test.
        missing_sources = ["non_existing_file.pdf"]
        pdf_converter = PyPDFToDocument()
        # Capture records at WARNING level or above while the converter runs.
        with caplog.at_level(logging.WARNING):
            pdf_converter.run(sources=missing_sources)
            assert "Could not read non_existing_file.pdf" in caplog.text

    @pytest.mark.unit
    def test_mixed_sources_run(self, preview_samples_path):
        """
        Check that a path and a ByteStream holding the same PDF bytes can be
        passed together, producing one document per source.
        """
        pdf_file = preview_samples_path / "pdf" / "react_paper.pdf"
        with open(pdf_file, "rb") as handle:
            raw_pdf = handle.read()
        # First source is the path itself, second is the in-memory copy.
        mixed_sources = [pdf_file, ByteStream(raw_pdf)]
        result = PyPDFToDocument().run(sources=mixed_sources)
        documents = result["documents"]
        assert len(documents) == 2
        for document in documents:
            assert "ReAct" in document.text