haystack/test/components/converters/test_pypdf_to_document.py
Stefano Fiorucci fb42c035c5
feat: PyPDFToDocument - add new customization parameters (#8574)
* deprecate converter in pypdf

* fix linting of MetaFieldGroupingRanker

* linting

* pypdftodocument: add customization params

* fix mypy

* incorporate feedback
2024-11-26 16:37:59 +01:00

274 lines
11 KiB
Python

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import logging
from unittest.mock import patch, Mock
import pytest
from haystack import Document, default_from_dict, default_to_dict
from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
from haystack.dataclasses import ByteStream
@pytest.fixture
def pypdf_component():
    """
    Provide a PyPDFToDocument instance constructed without any arguments.
    """
    component = PyPDFToDocument()
    return component
class CustomConverter:
    """
    Minimal stand-in converter used to exercise custom-converter (de)serialization.
    """

    # Fixed payload emitted by `to_dict` and required by `from_dict`.
    _SERIALIZED_STATE = {"key": "value", "more": False}

    def convert(self, reader):
        """
        Ignore `reader` and return a Document with fixed content.
        """
        return Document(content="Custom converter")

    def to_dict(self):
        """
        Return a fresh dictionary holding the fixed serialized payload.
        """
        return dict(self._SERIALIZED_STATE)

    @classmethod
    def from_dict(cls, data):
        """
        Create an instance, asserting that `data` equals the payload produced by `to_dict`.
        """
        assert data == cls._SERIALIZED_STATE
        return cls()
class TestPyPDFToDocument:
    """
    Unit and integration tests for the PyPDFToDocument converter component.
    """

    # Serialized type name shared by the (de)serialization tests.
    _COMPONENT_TYPE = "haystack.components.converters.pypdf.PyPDFToDocument"

    # Extraction settings these tests expect from a component built without arguments.
    _DEFAULT_EXTRACTION_PARAMS = {
        "extraction_mode": "plain",
        "plain_mode_orientations": (0, 90, 180, 270),
        "plain_mode_space_width": 200.0,
        "layout_mode_space_vertically": True,
        "layout_mode_scale_weight": 1.25,
        "layout_mode_strip_rotated": True,
        "layout_mode_font_height_weight": 1.0,
    }

    def _serialized(self, converter):
        """
        Build a fresh serialized-component dictionary with default extraction settings.

        :param converter: Value to store under the "converter" init parameter.
        :returns: A new dictionary on every call, so tests may hand it to `from_dict` safely.
        """
        init_parameters = {"converter": converter, **self._DEFAULT_EXTRACTION_PARAMS}
        return {"type": self._COMPONENT_TYPE, "init_parameters": init_parameters}

    def test_init(self, pypdf_component):
        """
        Test the attribute values of a component created without arguments.
        """
        component = pypdf_component

        assert component.converter is None
        assert component.extraction_mode == PyPDFExtractionMode.PLAIN
        assert component.plain_mode_orientations == (0, 90, 180, 270)
        assert component.plain_mode_space_width == 200.0
        assert component.layout_mode_space_vertically is True
        assert component.layout_mode_scale_weight == 1.25
        assert component.layout_mode_strip_rotated is True
        assert component.layout_mode_font_height_weight == 1.0

    def test_init_converter(self):
        """
        Test that a converter passed at construction is stored on the component.
        """
        component = PyPDFToDocument(converter=CustomConverter())

        assert isinstance(component.converter, CustomConverter)

    def test_init_custom_params(self):
        """
        Test that custom extraction parameters are stored on the component.
        """
        overrides = {
            "extraction_mode": "layout",
            "plain_mode_orientations": (0, 90),
            "plain_mode_space_width": 150.0,
            "layout_mode_space_vertically": False,
            "layout_mode_scale_weight": 2.0,
            "layout_mode_strip_rotated": False,
            "layout_mode_font_height_weight": 0.5,
        }
        component = PyPDFToDocument(**overrides)

        assert component.extraction_mode == PyPDFExtractionMode.LAYOUT
        assert component.plain_mode_orientations == (0, 90)
        assert component.plain_mode_space_width == 150.0
        assert component.layout_mode_space_vertically is False
        assert component.layout_mode_scale_weight == 2.0
        assert component.layout_mode_strip_rotated is False
        assert component.layout_mode_font_height_weight == 0.5

    def test_init_invalid_extraction_mode(self):
        """
        Test that an unknown extraction mode string raises a ValueError.
        """
        with pytest.raises(ValueError):
            PyPDFToDocument(extraction_mode="invalid")

    def test_to_dict(self, pypdf_component):
        """
        Test the serialized form of a component created without arguments.
        """
        serialized = pypdf_component.to_dict()

        assert serialized == self._serialized(converter=None)

    def test_to_dict_custom_converter(self):
        """
        Test that a custom converter is serialized together with the component.
        """
        component = PyPDFToDocument(converter=CustomConverter())
        expected_converter = {
            "data": {"key": "value", "more": False},
            "type": "converters.test_pypdf_to_document.CustomConverter",
        }

        serialized = component.to_dict()

        assert serialized == self._serialized(converter=expected_converter)

    def test_from_dict(self):
        """
        Test that a component is restored from a fully specified dictionary.
        """
        restored = PyPDFToDocument.from_dict(self._serialized(converter=None))

        assert isinstance(restored, PyPDFToDocument)
        assert restored.converter is None
        assert restored.extraction_mode == PyPDFExtractionMode.PLAIN
        assert restored.plain_mode_orientations == (0, 90, 180, 270)
        assert restored.plain_mode_space_width == 200.0
        assert restored.layout_mode_space_vertically is True
        assert restored.layout_mode_scale_weight == 1.25
        assert restored.layout_mode_strip_rotated is True
        assert restored.layout_mode_font_height_weight == 1.0

    def test_from_dict_defaults(self):
        """
        Test that a component is restored when no init parameters are given.
        """
        payload = {"type": self._COMPONENT_TYPE, "init_parameters": {}}

        restored = PyPDFToDocument.from_dict(payload)

        assert isinstance(restored, PyPDFToDocument)
        assert restored.converter is None

    def test_from_dict_custom_converter(self):
        """
        Test that a serialized custom converter is restored with the component.
        """
        converter_payload = {
            "data": {"key": "value", "more": False},
            "type": "converters.test_pypdf_to_document.CustomConverter",
        }
        payload = {"type": self._COMPONENT_TYPE, "init_parameters": {"converter": converter_payload}}

        restored = PyPDFToDocument.from_dict(payload)

        assert isinstance(restored, PyPDFToDocument)
        assert isinstance(restored.converter, CustomConverter)

    def test_default_convert(self):
        """
        Test that the default conversion joins page texts with form feeds and forwards the
        configured extraction parameters to every page.
        """
        pages = []
        for page_text in ("Page 1 content", "Page 2 content"):
            page = Mock()
            page.extract_text.return_value = page_text
            pages.append(page)
        reader = Mock()
        reader.pages = pages

        component = PyPDFToDocument(
            extraction_mode="layout",
            plain_mode_orientations=(0, 90),
            plain_mode_space_width=150.0,
            layout_mode_space_vertically=False,
            layout_mode_scale_weight=2.0,
            layout_mode_strip_rotated=False,
            layout_mode_font_height_weight=1.5,
        )

        document = component._default_convert(reader)

        assert document.content == "Page 1 content\fPage 2 content"
        # Each page must be asked for its text exactly once, with the component's settings.
        for page in pages:
            page.extract_text.assert_called_once_with(
                extraction_mode="layout",
                orientations=(0, 90),
                space_width=150.0,
                layout_mode_space_vertically=False,
                layout_mode_scale_weight=2.0,
                layout_mode_strip_rotated=False,
                layout_mode_font_height_weight=1.5,
            )

    @pytest.mark.integration
    def test_run(self, test_files_path, pypdf_component):
        """
        Test if the component runs correctly on a single PDF file.
        """
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"

        result = pypdf_component.run(sources=[pdf_path])

        documents = result["documents"]
        assert len(documents) == 1
        assert "History" in documents[0].content

    @pytest.mark.integration
    def test_page_breaks_added(self, test_files_path, pypdf_component):
        """
        Test that the converted sample PDF contains three form-feed characters.
        """
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"

        result = pypdf_component.run(sources=[pdf_path])

        documents = result["documents"]
        assert len(documents) == 1
        assert documents[0].content.count("\f") == 3

    def test_run_with_meta(self, test_files_path, pypdf_component):
        """
        Test how ByteStream metadata and the `meta` run parameter end up on the documents.
        """
        stream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
        sources = [stream, test_files_path / "pdf" / "sample_pdf_1.pdf"]

        with patch("haystack.components.converters.pypdf.PdfReader"):
            result = pypdf_component.run(sources=sources, meta={"language": "it"})

        # check that the metadata from the bytestream is merged with that from the meta parameter
        first_doc, second_doc = result["documents"][0], result["documents"][1]
        assert first_doc.meta["author"] == "test_author"
        assert first_doc.meta["language"] == "it"
        assert second_doc.meta["language"] == "it"

    def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
        """
        Test if the component logs a warning for a source that cannot be read.
        """
        missing_sources = ["non_existing_file.pdf"]

        with caplog.at_level(logging.WARNING):
            pypdf_component.run(sources=missing_sources)
            assert "Could not read non_existing_file.pdf" in caplog.text

    @pytest.mark.integration
    def test_mixed_sources_run(self, test_files_path, pypdf_component):
        """
        Test if the component runs correctly when a file path and a ByteStream are mixed.
        """
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"
        sources = [pdf_path]
        with open(pdf_path, "rb") as pdf_file:
            sources.append(ByteStream(pdf_file.read()))

        result = pypdf_component.run(sources=sources)

        documents = result["documents"]
        assert len(documents) == 2
        assert "History and standardization" in documents[0].content
        assert "History and standardization" in documents[1].content

    @pytest.mark.integration
    def test_custom_converter(self, test_files_path):
        """
        Test if the component delegates conversion to a custom converter.
        """
        from pypdf import PdfReader

        fixed_content = "I don't care about converting given pdfs, I always return this"
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"

        class MyCustomConverter:
            def convert(self, reader: PdfReader) -> Document:
                return Document(content=fixed_content)

            def to_dict(self):
                return default_to_dict(self)

            @classmethod
            def from_dict(cls, data):
                return default_from_dict(cls, data)

        component = PyPDFToDocument(converter=MyCustomConverter())

        result = component.run(sources=[pdf_path])

        documents = result["documents"]
        assert len(documents) == 1
        assert "ReAct" not in documents[0].content
        assert fixed_content in documents[0].content

    def test_run_empty_document(self, caplog, test_files_path):
        """
        Test that a PDF without extractable text logs a warning and yields empty content.
        """
        pdf_path = test_files_path / "pdf" / "non_text_searchable.pdf"

        with caplog.at_level(logging.WARNING):
            result = PyPDFToDocument().run(sources=[pdf_path])
            assert "PyPDFToDocument could not extract text from the file" in caplog.text
            assert result["documents"][0].content == ""