mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-06-26 22:00:13 +00:00

* deprecate converter in pypdf * fix linting of MetaFieldGroupingRanker * linting * pypdftodocument: add customization params * fix mypy * incorporate feedback
274 lines
11 KiB
Python
274 lines
11 KiB
Python
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
import logging
|
|
from unittest.mock import patch, Mock
|
|
|
|
import pytest
|
|
|
|
from haystack import Document, default_from_dict, default_to_dict
|
|
from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
|
|
from haystack.dataclasses import ByteStream
|
|
|
|
|
|
@pytest.fixture
def pypdf_component():
    """Provide a ``PyPDFToDocument`` instance constructed without any arguments."""
    component = PyPDFToDocument()
    return component
|
|
|
|
|
|
class CustomConverter:
    """Stub converter used by the tests to exercise custom-converter handling."""

    def convert(self, reader):
        """Return a fixed document; ``reader`` is not inspected."""
        document = Document(content="Custom converter")
        return document

    def to_dict(self):
        """Return the fixed dictionary used as this stub's serialized form."""
        serialized = {"key": "value", "more": False}
        return serialized

    @classmethod
    def from_dict(cls, data):
        """Check that ``data`` equals the payload produced by ``to_dict`` and build a new instance."""
        expected_payload = {"key": "value", "more": False}
        assert data == expected_payload
        return cls()
|
|
|
|
|
|
class TestPyPDFToDocument:
    """Tests for the ``PyPDFToDocument`` converter component."""

    def test_init(self, pypdf_component):
        """A component created without arguments exposes the expected default attributes."""
        component = pypdf_component
        assert component.converter is None
        assert component.extraction_mode == PyPDFExtractionMode.PLAIN
        assert component.plain_mode_orientations == (0, 90, 180, 270)
        assert component.plain_mode_space_width == 200.0
        assert component.layout_mode_space_vertically is True
        assert component.layout_mode_scale_weight == 1.25
        assert component.layout_mode_strip_rotated is True
        assert component.layout_mode_font_height_weight == 1.0

    def test_init_converter(self):
        """A converter passed at construction time is stored on the component."""
        component = PyPDFToDocument(converter=CustomConverter())
        assert isinstance(component.converter, CustomConverter)

    def test_init_custom_params(self):
        """Custom extraction settings passed to the constructor are stored on the component."""
        custom_settings = {
            "extraction_mode": "layout",
            "plain_mode_orientations": (0, 90),
            "plain_mode_space_width": 150.0,
            "layout_mode_space_vertically": False,
            "layout_mode_scale_weight": 2.0,
            "layout_mode_strip_rotated": False,
            "layout_mode_font_height_weight": 0.5,
        }
        component = PyPDFToDocument(**custom_settings)

        # The string "layout" is expected to be exposed as the enum member.
        assert component.extraction_mode == PyPDFExtractionMode.LAYOUT
        assert component.plain_mode_orientations == (0, 90)
        assert component.plain_mode_space_width == 150.0
        assert component.layout_mode_space_vertically is False
        assert component.layout_mode_scale_weight == 2.0
        assert component.layout_mode_strip_rotated is False
        assert component.layout_mode_font_height_weight == 0.5

    def test_init_invalid_extraction_mode(self):
        """An unknown extraction mode string is rejected with ``ValueError``."""
        with pytest.raises(ValueError):
            PyPDFToDocument(extraction_mode="invalid")

    def test_to_dict(self, pypdf_component):
        """Serializing a default component yields the default init parameters."""
        expected_init_parameters = {
            "converter": None,
            "extraction_mode": "plain",
            "plain_mode_orientations": (0, 90, 180, 270),
            "plain_mode_space_width": 200.0,
            "layout_mode_space_vertically": True,
            "layout_mode_scale_weight": 1.25,
            "layout_mode_strip_rotated": True,
            "layout_mode_font_height_weight": 1.0,
        }
        expected = {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
            "init_parameters": expected_init_parameters,
        }
        serialized = pypdf_component.to_dict()
        assert serialized == expected

    def test_to_dict_custom_converter(self):
        """Serializing a component with a custom converter embeds the converter's type and data."""
        expected_converter = {
            "data": {"key": "value", "more": False},
            "type": "converters.test_pypdf_to_document.CustomConverter",
        }
        expected = {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
            "init_parameters": {
                "converter": expected_converter,
                "extraction_mode": "plain",
                "plain_mode_orientations": (0, 90, 180, 270),
                "plain_mode_space_width": 200.0,
                "layout_mode_space_vertically": True,
                "layout_mode_scale_weight": 1.25,
                "layout_mode_strip_rotated": True,
                "layout_mode_font_height_weight": 1.0,
            },
        }
        component = PyPDFToDocument(converter=CustomConverter())
        serialized = component.to_dict()
        assert serialized == expected

    def test_from_dict(self):
        """Deserializing a fully specified dictionary restores every init parameter."""
        init_parameters = {
            "converter": None,
            "extraction_mode": "plain",
            "plain_mode_orientations": (0, 90, 180, 270),
            "plain_mode_space_width": 200.0,
            "layout_mode_space_vertically": True,
            "layout_mode_scale_weight": 1.25,
            "layout_mode_strip_rotated": True,
            "layout_mode_font_height_weight": 1.0,
        }
        serialized = {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
            "init_parameters": init_parameters,
        }

        restored = PyPDFToDocument.from_dict(serialized)
        assert isinstance(restored, PyPDFToDocument)
        assert restored.converter is None
        assert restored.extraction_mode == PyPDFExtractionMode.PLAIN
        assert restored.plain_mode_orientations == (0, 90, 180, 270)
        assert restored.plain_mode_space_width == 200.0
        assert restored.layout_mode_space_vertically is True
        assert restored.layout_mode_scale_weight == 1.25
        assert restored.layout_mode_strip_rotated is True
        assert restored.layout_mode_font_height_weight == 1.0

    def test_from_dict_defaults(self):
        """Deserializing with empty init parameters yields a component without a converter."""
        serialized = {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
            "init_parameters": {},
        }
        restored = PyPDFToDocument.from_dict(serialized)
        assert isinstance(restored, PyPDFToDocument)
        assert restored.converter is None

    def test_from_dict_custom_converter(self):
        """Deserializing a dictionary with a serialized converter rebuilds that converter."""
        converter_payload = {
            "data": {"key": "value", "more": False},
            "type": "converters.test_pypdf_to_document.CustomConverter",
        }
        serialized = {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
            "init_parameters": {"converter": converter_payload},
        }
        restored = PyPDFToDocument.from_dict(serialized)
        assert isinstance(restored, PyPDFToDocument)
        assert isinstance(restored.converter, CustomConverter)

    def test_default_convert(self):
        """``_default_convert`` joins page texts with form feeds and forwards the settings to each page."""
        # Build two mocked pages whose ``extract_text`` returns a known string.
        pages = []
        for page_text in ("Page 1 content", "Page 2 content"):
            page = Mock()
            page.extract_text.return_value = page_text
            pages.append(page)
        reader = Mock()
        reader.pages = pages

        settings = {
            "extraction_mode": "layout",
            "plain_mode_orientations": (0, 90),
            "plain_mode_space_width": 150.0,
            "layout_mode_space_vertically": False,
            "layout_mode_scale_weight": 2.0,
            "layout_mode_strip_rotated": False,
            "layout_mode_font_height_weight": 1.5,
        }
        component = PyPDFToDocument(**settings)

        document = component._default_convert(reader)
        assert document.content == "Page 1 content\fPage 2 content"

        # Keyword arguments each page's ``extract_text`` is expected to receive;
        # note the ``plain_mode_`` prefix is absent from the first two option names.
        expected_call_kwargs = {
            "extraction_mode": "layout",
            "orientations": (0, 90),
            "space_width": 150.0,
            "layout_mode_space_vertically": False,
            "layout_mode_scale_weight": 2.0,
            "layout_mode_strip_rotated": False,
            "layout_mode_font_height_weight": 1.5,
        }
        for page in reader.pages:
            page.extract_text.assert_called_once_with(**expected_call_kwargs)

    @pytest.mark.integration
    def test_run(self, test_files_path, pypdf_component):
        """Running the component on a sample PDF yields one document with the expected text."""
        pdf_file = test_files_path / "pdf" / "sample_pdf_1.pdf"
        result = pypdf_component.run(sources=[pdf_file])
        documents = result["documents"]
        assert len(documents) == 1
        assert "History" in documents[0].content

    @pytest.mark.integration
    def test_page_breaks_added(self, test_files_path, pypdf_component):
        """The converted sample PDF contains exactly three form-feed characters."""
        pdf_file = test_files_path / "pdf" / "sample_pdf_1.pdf"
        result = pypdf_component.run(sources=[pdf_file])
        documents = result["documents"]
        assert len(documents) == 1
        form_feed_count = documents[0].content.count("\f")
        assert form_feed_count == 3

    def test_run_with_meta(self, test_files_path, pypdf_component):
        """Metadata passed to ``run`` is combined with the metadata carried by a ByteStream source."""
        stream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
        pdf_file = test_files_path / "pdf" / "sample_pdf_1.pdf"

        # PdfReader is patched so neither source is parsed as a real PDF.
        with patch("haystack.components.converters.pypdf.PdfReader"):
            result = pypdf_component.run(sources=[stream, pdf_file], meta={"language": "it"})

        # The ByteStream's own metadata must be merged with the ``meta`` argument.
        documents = result["documents"]
        assert documents[0].meta["author"] == "test_author"
        assert documents[0].meta["language"] == "it"
        assert documents[1].meta["language"] == "it"

    def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
        """A source that cannot be read is reported through a log message."""
        missing_sources = ["non_existing_file.pdf"]
        with caplog.at_level(logging.WARNING):
            pypdf_component.run(sources=missing_sources)
            assert "Could not read non_existing_file.pdf" in caplog.text

    @pytest.mark.integration
    def test_mixed_sources_run(self, test_files_path, pypdf_component):
        """The component accepts a file path and a ByteStream in the same call."""
        pdf_file = test_files_path / "pdf" / "sample_pdf_1.pdf"
        sources = [pdf_file]
        with open(pdf_file, "rb") as pdf_handle:
            sources.append(ByteStream(pdf_handle.read()))

        result = pypdf_component.run(sources=sources)
        documents = result["documents"]
        assert len(documents) == 2
        assert "History and standardization" in documents[0].content
        assert "History and standardization" in documents[1].content

    @pytest.mark.integration
    def test_custom_converter(self, test_files_path):
        """A user-supplied converter determines the content of the produced document."""
        from pypdf import PdfReader

        class MyCustomConverter:
            """Converter that returns the same document for every PDF."""

            def convert(self, reader: PdfReader) -> Document:
                return Document(content="I don't care about converting given pdfs, I always return this")

            def to_dict(self):
                return default_to_dict(self)

            @classmethod
            def from_dict(cls, data):
                return default_from_dict(cls, data)

        pdf_file = test_files_path / "pdf" / "sample_pdf_1.pdf"
        component = PyPDFToDocument(converter=MyCustomConverter())
        result = component.run(sources=[pdf_file])
        documents = result["documents"]
        assert len(documents) == 1
        content = documents[0].content
        assert "ReAct" not in content
        assert "I don't care about converting given pdfs, I always return this" in content

    def test_run_empty_document(self, caplog, test_files_path):
        """A PDF without extractable text logs a message and yields a document with empty content."""
        pdf_file = test_files_path / "pdf" / "non_text_searchable.pdf"
        with caplog.at_level(logging.WARNING):
            result = PyPDFToDocument().run(sources=[pdf_file])
            assert "PyPDFToDocument could not extract text from the file" in caplog.text
            assert result["documents"][0].content == ""
|