mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-06-26 22:00:13 +00:00
239 lines
10 KiB
Python
239 lines
10 KiB
Python
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
||
#
|
||
# SPDX-License-Identifier: Apache-2.0
|
||
|
||
import logging
|
||
from unittest.mock import patch, Mock
|
||
|
||
import pytest
|
||
|
||
from haystack import Document
|
||
from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
|
||
from haystack.components.preprocessors import DocumentSplitter
|
||
from haystack.dataclasses import ByteStream
|
||
|
||
|
||
@pytest.fixture
def pypdf_component():
    """Provide a `PyPDFToDocument` converter constructed with no arguments (all defaults)."""
    return PyPDFToDocument()
|
||
|
||
|
||
class TestPyPDFToDocument:
    """Tests for the `PyPDFToDocument` converter: init, (de)serialization, and conversion runs."""

    def test_init(self, pypdf_component):
        """A converter built without arguments exposes the expected default attributes."""
        assert pypdf_component.extraction_mode == PyPDFExtractionMode.PLAIN
        assert pypdf_component.plain_mode_orientations == (0, 90, 180, 270)
        assert pypdf_component.plain_mode_space_width == 200.0
        assert pypdf_component.layout_mode_space_vertically is True
        assert pypdf_component.layout_mode_scale_weight == 1.25
        assert pypdf_component.layout_mode_strip_rotated is True
        assert pypdf_component.layout_mode_font_height_weight == 1.0

    def test_init_custom_params(self):
        """Custom init arguments are stored on the instance; the mode string becomes an enum member."""
        pypdf_component = PyPDFToDocument(
            extraction_mode="layout",
            plain_mode_orientations=(0, 90),
            plain_mode_space_width=150.0,
            layout_mode_space_vertically=False,
            layout_mode_scale_weight=2.0,
            layout_mode_strip_rotated=False,
            layout_mode_font_height_weight=0.5,
        )

        assert pypdf_component.extraction_mode == PyPDFExtractionMode.LAYOUT
        assert pypdf_component.plain_mode_orientations == (0, 90)
        assert pypdf_component.plain_mode_space_width == 150.0
        assert pypdf_component.layout_mode_space_vertically is False
        assert pypdf_component.layout_mode_scale_weight == 2.0
        assert pypdf_component.layout_mode_strip_rotated is False
        assert pypdf_component.layout_mode_font_height_weight == 0.5

    def test_init_invalid_extraction_mode(self):
        """An unknown extraction mode string is rejected with a `ValueError`."""
        with pytest.raises(ValueError):
            PyPDFToDocument(extraction_mode="invalid")

    def test_to_dict(self, pypdf_component):
        """Serializing a default converter yields the expected type path and init parameters."""
        data = pypdf_component.to_dict()
        assert data == {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
            "init_parameters": {
                "extraction_mode": "plain",
                "plain_mode_orientations": (0, 90, 180, 270),
                "plain_mode_space_width": 200.0,
                "layout_mode_space_vertically": True,
                "layout_mode_scale_weight": 1.25,
                "layout_mode_strip_rotated": True,
                "layout_mode_font_height_weight": 1.0,
                "store_full_path": False,
            },
        }

    def test_from_dict(self):
        """Deserializing a fully specified dict restores every listed init parameter."""
        data = {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
            "init_parameters": {
                "extraction_mode": "plain",
                "plain_mode_orientations": (0, 90, 180, 270),
                "plain_mode_space_width": 200.0,
                "layout_mode_space_vertically": True,
                "layout_mode_scale_weight": 1.25,
                "layout_mode_strip_rotated": True,
                "layout_mode_font_height_weight": 1.0,
            },
        }

        instance = PyPDFToDocument.from_dict(data)
        assert isinstance(instance, PyPDFToDocument)
        assert instance.extraction_mode == PyPDFExtractionMode.PLAIN
        assert instance.plain_mode_orientations == (0, 90, 180, 270)
        assert instance.plain_mode_space_width == 200.0
        assert instance.layout_mode_space_vertically is True
        assert instance.layout_mode_scale_weight == 1.25
        assert instance.layout_mode_strip_rotated is True
        assert instance.layout_mode_font_height_weight == 1.0

    def test_from_dict_defaults(self):
        """Deserializing with empty init parameters yields a converter in PLAIN extraction mode."""
        data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}
        instance = PyPDFToDocument.from_dict(data)
        assert isinstance(instance, PyPDFToDocument)
        assert instance.extraction_mode == PyPDFExtractionMode.PLAIN

    def test_default_convert(self):
        """`_default_convert` joins page texts with form feeds and forwards the extraction settings."""
        mock_page1 = Mock()
        mock_page2 = Mock()
        mock_page1.extract_text.return_value = "Page 1 content"
        mock_page2.extract_text.return_value = "Page 2 content"
        mock_reader = Mock()
        mock_reader.pages = [mock_page1, mock_page2]

        converter = PyPDFToDocument(
            extraction_mode="layout",
            plain_mode_orientations=(0, 90),
            plain_mode_space_width=150.0,
            layout_mode_space_vertically=False,
            layout_mode_scale_weight=2.0,
            layout_mode_strip_rotated=False,
            layout_mode_font_height_weight=1.5,
        )

        text = converter._default_convert(mock_reader)
        assert text == "Page 1 content\fPage 2 content"

        # Keyword arguments each page's `extract_text` is expected to receive exactly once.
        expected_params = {
            "extraction_mode": "layout",
            "orientations": (0, 90),
            "space_width": 150.0,
            "layout_mode_space_vertically": False,
            "layout_mode_scale_weight": 2.0,
            "layout_mode_strip_rotated": False,
            "layout_mode_font_height_weight": 1.5,
        }
        for mock_page in mock_reader.pages:
            mock_page.extract_text.assert_called_once_with(**expected_params)

    @pytest.mark.integration
    def test_run(self, test_files_path, pypdf_component):
        """
        Test if the component runs correctly.
        """
        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
        output = pypdf_component.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content

    @pytest.mark.integration
    def test_page_breaks_added(self, test_files_path, pypdf_component):
        """
        Test that the converted content of the sample PDF contains three form-feed page breaks.
        """
        paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
        output = pypdf_component.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert docs[0].content.count("\f") == 3

    def test_run_with_meta(self, test_files_path, pypdf_component):
        """
        Test that the `meta` argument of `run` is applied to every document and overrides source metadata.
        """
        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})

        # PdfReader is patched out, so neither source has to be a parsable PDF.
        with patch("haystack.components.converters.pypdf.PdfReader"):
            output = pypdf_component.run(
                sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"}
            )

        # check that the metadata from the bytestream is merged with that from the meta parameter
        assert output["documents"][0].meta["author"] == "test_author"
        assert output["documents"][0].meta["language"] == "it"
        assert output["documents"][1].meta["language"] == "it"

    def test_run_with_store_full_path_false(self, test_files_path):
        """
        Test the `file_path` metadata for both `store_full_path` settings:
        the full source path when True, only the file name when False.
        """
        sources = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
        converter = PyPDFToDocument(store_full_path=True)
        results = converter.run(sources=sources)
        docs = results["documents"]

        assert len(docs) == 1
        assert docs[0].meta["file_path"] == str(sources[0])

        converter = PyPDFToDocument(store_full_path=False)
        results = converter.run(sources=sources)
        docs = results["documents"]

        assert len(docs) == 1
        assert docs[0].meta["file_path"] == "sample_pdf_1.pdf"

    def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = ["non_existing_file.pdf"]
        with caplog.at_level(logging.WARNING):
            pypdf_component.run(sources=paths)
        assert "Could not read non_existing_file.pdf" in caplog.text

    @pytest.mark.integration
    def test_mixed_sources_run(self, test_files_path, pypdf_component):
        """
        Test if the component runs correctly when mixed sources are provided.
        """
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"
        paths = [pdf_path]
        # Feed the same file a second time, this time as an in-memory ByteStream.
        with open(pdf_path, "rb") as f:
            paths.append(ByteStream(f.read()))

        output = pypdf_component.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 2
        assert "History and standardization" in docs[0].content
        assert "History and standardization" in docs[1].content

    def test_run_empty_document(self, caplog, test_files_path):
        """
        Test that a PDF without extractable text logs a warning and yields an empty-content document.
        """
        paths = [test_files_path / "pdf" / "non_text_searchable.pdf"]
        with caplog.at_level(logging.WARNING):
            output = PyPDFToDocument().run(sources=paths)
        assert "PyPDFToDocument could not extract text from the file" in caplog.text
        assert output["documents"][0].content == ""

        # Check that meta is used when the returned document is initialized and thus when doc id is generated
        assert output["documents"][0].meta["file_path"] == "non_text_searchable.pdf"
        assert output["documents"][0].id != Document(content="").id

    def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path):
        """
        Test that LAYOUT-mode output can be split by passage with `DocumentSplitter`.
        """
        converter = PyPDFToDocument(extraction_mode=PyPDFExtractionMode.LAYOUT)
        sources = [test_files_path / "pdf" / "sample_pdf_2.pdf"]
        pdf_doc = converter.run(sources=sources)
        splitter = DocumentSplitter(split_length=1, split_by="passage")
        docs = splitter.run(pdf_doc["documents"])

        assert len(docs["documents"]) == 51

        expected = (
            "A wiki (/ˈwɪki/ (About this soundlisten) WIK-ee) is a hypertext publication collaboratively\n"
            "edited and managed by its own audience directly using a web browser. A typical wiki\ncontains "
            "multiple pages for the subjects or scope of the project and may be either open\nto the public or "
            "limited to use within an organization for maintaining its internal knowledge\nbase. Wikis are "
            "enabled by wiki software, otherwise known as wiki engines. A wiki engine,\nbeing a form of a "
            "content management system, differs from other web-based systems\nsuch as blog software, in that "
            "the content is created without any defined owner or leader,\nand wikis have little inherent "
            "structure, allowing structure to emerge according to the\nneeds of the users.[1]\n\n"
        )
        assert docs["documents"][2].content == expected
|