2024-05-09 15:40:36 +02:00
|
|
|
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
|
|
#
|
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
2023-09-21 11:52:26 +02:00
|
|
|
import logging
|
2023-12-15 16:41:35 +01:00
|
|
|
from unittest.mock import patch
|
2024-03-26 10:09:29 +01:00
|
|
|
|
2023-09-21 11:52:26 +02:00
|
|
|
import pytest
|
|
|
|
|
2024-03-26 10:09:29 +01:00
|
|
|
from haystack import Document, default_from_dict, default_to_dict
|
|
|
|
from haystack.components.converters.pypdf import CONVERTERS_REGISTRY, DefaultConverter, PyPDFToDocument
|
2023-11-24 14:48:43 +01:00
|
|
|
from haystack.dataclasses import ByteStream
|
2023-09-21 11:52:26 +02:00
|
|
|
|
|
|
|
|
2024-01-18 08:54:59 +01:00
|
|
|
@pytest.fixture
def pypdf_converter():
    """
    Provide a ``PyPDFToDocument`` component constructed with no arguments.
    """
    component = PyPDFToDocument()
    return component
|
|
|
|
|
|
|
|
|
2023-09-21 11:52:26 +02:00
|
|
|
class TestPyPDFToDocument:
    """
    Tests for the ``PyPDFToDocument`` converter component.

    Covers construction, (de)serialization through ``to_dict`` / ``from_dict``,
    running on file paths and ``ByteStream`` sources, metadata handling, error
    logging and custom converters.
    """

    def test_init(self, pypdf_converter):
        """
        Test that a component built without arguments uses a DefaultConverter and has no converter name.
        """
        assert isinstance(pypdf_converter.converter, DefaultConverter)
        assert pypdf_converter.converter_name is None

    def test_init_fail_nonexisting_converter(self):
        """
        Test that an unknown converter name makes the constructor raise a ValueError.
        """
        with pytest.raises(ValueError):
            PyPDFToDocument(converter_name="non_existing_converter")

    def test_to_dict(self, pypdf_converter):
        """
        Test that a default component serializes to the expected dictionary.
        """
        data = pypdf_converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
            "init_parameters": {
                "converter": {"type": "haystack.components.converters.pypdf.DefaultConverter", "init_parameters": {}},
                "converter_name": None,
            },
        }

    def test_from_dict(self):
        """
        Test that a component can be rebuilt from a dictionary carrying a serialized converter.
        """
        data = {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
            "init_parameters": {
                "converter": {"type": "haystack.components.converters.pypdf.DefaultConverter", "init_parameters": {}}
            },
        }
        instance = PyPDFToDocument.from_dict(data)
        assert isinstance(instance, PyPDFToDocument)
        assert isinstance(instance.converter, DefaultConverter)
        # "converter_name" is absent from the payload, so it is expected to stay unset.
        assert instance.converter_name is None

    def test_from_dict_with_converter_name(self):
        """
        Test that a component can be rebuilt from a dictionary that only carries a converter name.
        """
        data = {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
            "init_parameters": {"converter_name": "default"},
        }

        instance = PyPDFToDocument.from_dict(data)
        assert isinstance(instance, PyPDFToDocument)
        assert isinstance(instance.converter, DefaultConverter)
        assert instance.converter_name == "default"

    @pytest.mark.integration
    def test_run(self, test_files_path, pypdf_converter):
        """
        Test if the component runs correctly.
        """
        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
        output = pypdf_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content

    @pytest.mark.integration
    def test_page_breaks_added(self, test_files_path, pypdf_converter):
        """
        Test that the converted content of the sample PDF contains exactly three form-feed characters.
        """
        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
        output = pypdf_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert docs[0].content.count("\f") == 3

    def test_run_with_meta(self, test_files_path, pypdf_converter):
        """
        Test that the ``meta`` argument of ``run`` is applied to every document and overrides source metadata.
        """
        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})

        # PdfReader is replaced by a mock inside the converter module, so the dummy
        # bytes above are presumably never parsed as a real PDF.
        with patch("haystack.components.converters.pypdf.PdfReader"):
            output = pypdf_converter.run(
                sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"}
            )

        # check that the metadata from the bytestream is merged with that from the meta parameter
        assert output["documents"][0].meta["author"] == "test_author"
        assert output["documents"][0].meta["language"] == "it"
        assert output["documents"][1].meta["language"] == "it"

    def test_run_error_handling(self, test_files_path, pypdf_converter, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = ["non_existing_file.pdf"]
        with caplog.at_level(logging.WARNING):
            pypdf_converter.run(sources=paths)
            assert "Could not read non_existing_file.pdf" in caplog.text

    @pytest.mark.integration
    def test_mixed_sources_run(self, test_files_path, pypdf_converter):
        """
        Test if the component runs correctly when mixed sources are provided.
        """
        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
        # Feed the same file a second time, this time as an in-memory ByteStream.
        with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f:
            paths.append(ByteStream(f.read()))

        output = pypdf_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 2
        assert "History and standardization" in docs[0].content
        assert "History and standardization" in docs[1].content

    @pytest.mark.integration
    def test_custom_converter(self, test_files_path):
        """
        Test if the component correctly handles custom converters.
        """
        # Imported locally; the name is needed for the annotation on ``convert`` below.
        from pypdf import PdfReader

        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]

        class MyCustomConverter:
            def convert(self, reader: PdfReader) -> Document:
                return Document(content="I don't care about converting given pdfs, I always return this")

            def to_dict(self):
                return default_to_dict(self)

            @classmethod
            def from_dict(cls, data):
                return default_from_dict(cls, data)

        component = PyPDFToDocument(converter=MyCustomConverter())
        output = component.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "ReAct" not in docs[0].content
        assert "I don't care about converting given pdfs, I always return this" in docs[0].content

    @pytest.mark.integration
    def test_custom_converter_deprecated(self, test_files_path):
        """
        Test if the component uses a custom converter selected by name through CONVERTERS_REGISTRY.
        """
        # Imported locally; the name is needed for the annotation on ``convert`` below.
        from pypdf import PdfReader

        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]

        class MyCustomConverter:
            def convert(self, reader: PdfReader) -> Document:
                return Document(content="I don't care about converting given pdfs, I always return this")

        # Register the converter only for the duration of this test. patch.dict restores
        # CONVERTERS_REGISTRY to its previous contents on exit (even if the body raises),
        # so the "custom" entry does not leak into other tests that share the registry.
        with patch.dict(CONVERTERS_REGISTRY, {"custom": MyCustomConverter()}):
            converter = PyPDFToDocument(converter_name="custom")
            output = converter.run(sources=paths)

        docs = output["documents"]
        assert len(docs) == 1
        assert "ReAct" not in docs[0].content
        assert "I don't care about converting given pdfs, I always return this" in docs[0].content
|