2023-09-21 11:52:26 +02:00
|
|
|
import logging
|
|
|
|
import pytest
|
2023-11-09 17:35:33 +01:00
|
|
|
from pypdf import PdfReader
|
2023-09-21 11:52:26 +02:00
|
|
|
|
2023-11-09 17:35:33 +01:00
|
|
|
from haystack.preview import Document
|
2023-11-23 10:28:40 +01:00
|
|
|
from haystack.preview.components.converters.pypdf import PyPDFToDocument
|
2023-10-11 10:52:08 +02:00
|
|
|
from haystack.preview.dataclasses import ByteStream
|
2023-09-21 11:52:26 +02:00
|
|
|
|
|
|
|
|
|
|
|
class TestPyPDFToDocument:
|
|
|
|
@pytest.mark.unit
|
|
|
|
def test_run(self, preview_samples_path):
|
|
|
|
"""
|
|
|
|
Test if the component runs correctly.
|
|
|
|
"""
|
|
|
|
paths = [preview_samples_path / "pdf" / "react_paper.pdf"]
|
|
|
|
converter = PyPDFToDocument()
|
2023-10-11 10:52:08 +02:00
|
|
|
output = converter.run(sources=paths)
|
2023-09-21 11:52:26 +02:00
|
|
|
docs = output["documents"]
|
|
|
|
assert len(docs) == 1
|
2023-10-31 12:44:04 +01:00
|
|
|
assert "ReAct" in docs[0].content
|
2023-09-21 11:52:26 +02:00
|
|
|
|
|
|
|
@pytest.mark.unit
|
|
|
|
def test_run_error_handling(self, preview_samples_path, caplog):
|
|
|
|
"""
|
|
|
|
Test if the component correctly handles errors.
|
|
|
|
"""
|
|
|
|
paths = ["non_existing_file.pdf"]
|
|
|
|
converter = PyPDFToDocument()
|
|
|
|
with caplog.at_level(logging.WARNING):
|
2023-10-11 10:52:08 +02:00
|
|
|
converter.run(sources=paths)
|
|
|
|
assert "Could not read non_existing_file.pdf" in caplog.text
|
|
|
|
|
|
|
|
@pytest.mark.unit
|
|
|
|
def test_mixed_sources_run(self, preview_samples_path):
|
|
|
|
"""
|
|
|
|
Test if the component runs correctly when mixed sources are provided.
|
|
|
|
"""
|
|
|
|
paths = [preview_samples_path / "pdf" / "react_paper.pdf"]
|
|
|
|
with open(preview_samples_path / "pdf" / "react_paper.pdf", "rb") as f:
|
|
|
|
paths.append(ByteStream(f.read()))
|
|
|
|
|
|
|
|
converter = PyPDFToDocument()
|
|
|
|
output = converter.run(sources=paths)
|
|
|
|
docs = output["documents"]
|
|
|
|
assert len(docs) == 2
|
2023-10-31 12:44:04 +01:00
|
|
|
assert "ReAct" in docs[0].content
|
|
|
|
assert "ReAct" in docs[1].content
|
2023-11-09 17:35:33 +01:00
|
|
|
|
|
|
|
@pytest.mark.unit
|
|
|
|
def test_custom_converter(self, preview_samples_path):
|
|
|
|
"""
|
|
|
|
Test if the component correctly handles custom converters.
|
|
|
|
"""
|
|
|
|
paths = [preview_samples_path / "pdf" / "react_paper.pdf"]
|
|
|
|
|
|
|
|
class MyCustomConverter:
|
|
|
|
def convert(self, reader: PdfReader) -> Document:
|
|
|
|
return Document(content="I don't care about converting given pdfs, I always return this")
|
|
|
|
|
|
|
|
converter = PyPDFToDocument(converter=MyCustomConverter())
|
|
|
|
output = converter.run(sources=paths)
|
|
|
|
docs = output["documents"]
|
|
|
|
assert len(docs) == 1
|
|
|
|
assert "ReAct" not in docs[0].content
|
|
|
|
assert "I don't care about converting given pdfs, I always return this" in docs[0].content
|