# SPDX-FileCopyrightText: 2022-present deepset GmbH
#
# SPDX-License-Identifier: Apache-2.0

import logging
from unittest.mock import patch, Mock

import pytest

from haystack import Document, default_from_dict, default_to_dict
from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
from haystack.dataclasses import ByteStream

# Serialized type identifiers shared by the (de)serialization tests below.
_COMPONENT_TYPE = "haystack.components.converters.pypdf.PyPDFToDocument"
_CUSTOM_CONVERTER_TYPE = "converters.test_pypdf_to_document.CustomConverter"


def _default_init_parameters(converter=None):
    """Return a new dict holding the init parameters asserted for a default component.

    A fresh dict is built on every call so that no test can leak mutations
    into another one.
    """
    return {
        "converter": converter,
        "extraction_mode": "plain",
        "plain_mode_orientations": (0, 90, 180, 270),
        "plain_mode_space_width": 200.0,
        "layout_mode_space_vertically": True,
        "layout_mode_scale_weight": 1.25,
        "layout_mode_strip_rotated": True,
        "layout_mode_font_height_weight": 1.0,
    }


def _serialized_custom_converter():
    """Return a new dict with the serialized form of ``CustomConverter`` used in these tests."""
    return {"data": {"key": "value", "more": False}, "type": _CUSTOM_CONVERTER_TYPE}


@pytest.fixture
def pypdf_component():
    """Provide a ``PyPDFToDocument`` constructed without any arguments."""
    component = PyPDFToDocument()
    return component


class CustomConverter:
    """Stand-in converter with a fixed output and a fixed serialized payload."""

    def convert(self, reader):
        # The reader is ignored on purpose: the content is always the same.
        return Document(content="Custom converter")

    def to_dict(self):
        return {"key": "value", "more": False}

    @classmethod
    def from_dict(cls, data):
        expected_payload = {"key": "value", "more": False}
        assert data == expected_payload
        return cls()


class TestPyPDFToDocument:
    """Tests for construction, (de)serialization and execution of ``PyPDFToDocument``."""

    def test_init(self, pypdf_component):
        """A component built without arguments exposes the expected default attributes."""
        component = pypdf_component
        assert component.converter is None
        assert component.extraction_mode == PyPDFExtractionMode.PLAIN
        assert component.plain_mode_orientations == (0, 90, 180, 270)
        assert component.plain_mode_space_width == 200.0
        assert component.layout_mode_space_vertically is True
        assert component.layout_mode_scale_weight == 1.25
        assert component.layout_mode_strip_rotated is True
        assert component.layout_mode_font_height_weight == 1.0

    def test_init_converter(self):
        """A converter passed to the constructor is stored on the component."""
        component = PyPDFToDocument(converter=CustomConverter())
        assert isinstance(component.converter, CustomConverter)

    def test_init_custom_params(self):
        """Explicit constructor arguments are reflected in the component attributes."""
        component = PyPDFToDocument(
            extraction_mode="layout",
            plain_mode_orientations=(0, 90),
            plain_mode_space_width=150.0,
            layout_mode_space_vertically=False,
            layout_mode_scale_weight=2.0,
            layout_mode_strip_rotated=False,
            layout_mode_font_height_weight=0.5,
        )

        assert component.extraction_mode == PyPDFExtractionMode.LAYOUT
        assert component.plain_mode_orientations == (0, 90)
        assert component.plain_mode_space_width == 150.0
        assert component.layout_mode_space_vertically is False
        assert component.layout_mode_scale_weight == 2.0
        assert component.layout_mode_strip_rotated is False
        assert component.layout_mode_font_height_weight == 0.5

    def test_init_invalid_extraction_mode(self):
        """An unknown extraction mode string is rejected with ``ValueError``."""
        with pytest.raises(ValueError):
            PyPDFToDocument(extraction_mode="invalid")

    def test_to_dict(self, pypdf_component):
        """Serializing the default component yields the expected dictionary."""
        serialized = pypdf_component.to_dict()
        expected = {"type": _COMPONENT_TYPE, "init_parameters": _default_init_parameters()}
        assert serialized == expected

    def test_to_dict_custom_converter(self):
        """Serializing a component with a custom converter embeds the converter's payload."""
        component = PyPDFToDocument(converter=CustomConverter())
        serialized = component.to_dict()
        expected = {
            "type": _COMPONENT_TYPE,
            "init_parameters": _default_init_parameters(converter=_serialized_custom_converter()),
        }
        assert serialized == expected

    def test_from_dict(self):
        """Deserializing a fully specified dictionary restores every attribute."""
        payload = {"type": _COMPONENT_TYPE, "init_parameters": _default_init_parameters()}

        restored = PyPDFToDocument.from_dict(payload)

        assert isinstance(restored, PyPDFToDocument)
        assert restored.converter is None
        assert restored.extraction_mode == PyPDFExtractionMode.PLAIN
        assert restored.plain_mode_orientations == (0, 90, 180, 270)
        assert restored.plain_mode_space_width == 200.0
        assert restored.layout_mode_space_vertically is True
        assert restored.layout_mode_scale_weight == 1.25
        assert restored.layout_mode_strip_rotated is True
        assert restored.layout_mode_font_height_weight == 1.0

    def test_from_dict_defaults(self):
        """Deserializing with empty init parameters gives a component without a converter."""
        payload = {"type": _COMPONENT_TYPE, "init_parameters": {}}

        restored = PyPDFToDocument.from_dict(payload)

        assert isinstance(restored, PyPDFToDocument)
        assert restored.converter is None

    def test_from_dict_custom_converter(self):
        """Deserializing a dictionary with a converter entry rebuilds the custom converter."""
        payload = {
            "type": _COMPONENT_TYPE,
            "init_parameters": {"converter": _serialized_custom_converter()},
        }

        restored = PyPDFToDocument.from_dict(payload)

        assert isinstance(restored, PyPDFToDocument)
        assert isinstance(restored.converter, CustomConverter)

    def test_default_convert(self):
        """``_default_convert`` joins page texts with form feeds and forwards the extraction settings."""
        fake_pages = []
        for page_text in ("Page 1 content", "Page 2 content"):
            fake_page = Mock()
            fake_page.extract_text.return_value = page_text
            fake_pages.append(fake_page)

        fake_reader = Mock()
        fake_reader.pages = fake_pages

        component = PyPDFToDocument(
            extraction_mode="layout",
            plain_mode_orientations=(0, 90),
            plain_mode_space_width=150.0,
            layout_mode_space_vertically=False,
            layout_mode_scale_weight=2.0,
            layout_mode_strip_rotated=False,
            layout_mode_font_height_weight=1.5,
        )

        document = component._default_convert(fake_reader)
        assert document.content == "Page 1 content\fPage 2 content"

        # Keyword arguments each page's extract_text is expected to receive.
        expected_kwargs = {
            "extraction_mode": "layout",
            "orientations": (0, 90),
            "space_width": 150.0,
            "layout_mode_space_vertically": False,
            "layout_mode_scale_weight": 2.0,
            "layout_mode_strip_rotated": False,
            "layout_mode_font_height_weight": 1.5,
        }
        for fake_page in fake_reader.pages:
            fake_page.extract_text.assert_called_once_with(**expected_kwargs)

    @pytest.mark.integration
    def test_run(self, test_files_path, pypdf_component):
        """
        Running the component on the sample PDF produces one document with the expected text.
        """
        pdf_paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]

        result = pypdf_component.run(sources=pdf_paths)
        documents = result["documents"]

        assert len(documents) == 1
        assert "History" in documents[0].content

    @pytest.mark.integration
    def test_page_breaks_added(self, test_files_path, pypdf_component):
        """The document produced from the sample PDF contains three form-feed characters."""
        pdf_paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]

        result = pypdf_component.run(sources=pdf_paths)
        documents = result["documents"]

        assert len(documents) == 1
        assert documents[0].content.count("\f") == 3

    def test_run_with_meta(self, test_files_path, pypdf_component):
        """Metadata given to ``run`` is combined with the metadata carried by each source."""
        stream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"

        with patch("haystack.components.converters.pypdf.PdfReader"):
            result = pypdf_component.run(sources=[stream, pdf_path], meta={"language": "it"})

        documents = result["documents"]
        # check that the metadata from the bytestream is merged with that from the meta parameter
        assert documents[0].meta["author"] == "test_author"
        assert documents[0].meta["language"] == "it"
        assert documents[1].meta["language"] == "it"

    def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
        """
        A source that does not exist is reported through a warning log message.
        """
        missing_sources = ["non_existing_file.pdf"]

        with caplog.at_level(logging.WARNING):
            pypdf_component.run(sources=missing_sources)
            assert "Could not read non_existing_file.pdf" in caplog.text

    @pytest.mark.integration
    def test_mixed_sources_run(self, test_files_path, pypdf_component):
        """
        A file path and a ``ByteStream`` of the same PDF are both converted in a single run.
        """
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"
        with open(pdf_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()
        sources = [pdf_path, ByteStream(pdf_bytes)]

        result = pypdf_component.run(sources=sources)
        documents = result["documents"]

        assert len(documents) == 2
        assert "History and standardization" in documents[0].content
        assert "History and standardization" in documents[1].content

    @pytest.mark.integration
    def test_custom_converter(self, test_files_path):
        """
        A user-supplied converter determines the content of the produced document.
        """
        from pypdf import PdfReader

        class MyCustomConverter:
            def convert(self, reader: PdfReader) -> Document:
                return Document(content="I don't care about converting given pdfs, I always return this")

            def to_dict(self):
                return default_to_dict(self)

            @classmethod
            def from_dict(cls, data):
                return default_from_dict(cls, data)

        pdf_paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
        component = PyPDFToDocument(converter=MyCustomConverter())

        result = component.run(sources=pdf_paths)
        documents = result["documents"]

        assert len(documents) == 1
        assert "ReAct" not in documents[0].content
        assert "I don't care about converting given pdfs, I always return this" in documents[0].content

    def test_run_empty_document(self, caplog, test_files_path):
        """A PDF without extractable text logs a warning and yields a document with empty content."""
        pdf_paths = [test_files_path / "pdf" / "non_text_searchable.pdf"]

        with caplog.at_level(logging.WARNING):
            result = PyPDFToDocument().run(sources=pdf_paths)
            assert "PyPDFToDocument could not extract text from the file" in caplog.text
            assert result["documents"][0].content == ""