haystack/test/components/converters/test_pypdf_to_document.py
2025-05-26 13:41:36 +00:00

239 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import logging
from unittest.mock import patch, Mock
import pytest
from haystack import Document
from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
from haystack.components.preprocessors import DocumentSplitter
from haystack.dataclasses import ByteStream
@pytest.fixture
def pypdf_component():
    """Provide a ``PyPDFToDocument`` converter built with its default arguments."""
    default_converter = PyPDFToDocument()
    return default_converter
class TestPyPDFToDocument:
    """Tests for the ``PyPDFToDocument`` converter: construction, (de)serialization and conversion."""

    def test_init(self, pypdf_component):
        """A converter built without arguments exposes the expected default settings."""
        assert pypdf_component.extraction_mode == PyPDFExtractionMode.PLAIN
        assert pypdf_component.plain_mode_orientations == (0, 90, 180, 270)
        assert pypdf_component.plain_mode_space_width == 200.0
        assert pypdf_component.layout_mode_space_vertically is True
        assert pypdf_component.layout_mode_scale_weight == 1.25
        assert pypdf_component.layout_mode_strip_rotated is True
        assert pypdf_component.layout_mode_font_height_weight == 1.0

    def test_init_custom_params(self):
        """Every constructor argument is stored on the instance; the mode string becomes an enum member."""
        converter = PyPDFToDocument(
            extraction_mode="layout",
            plain_mode_orientations=(0, 90),
            plain_mode_space_width=150.0,
            layout_mode_space_vertically=False,
            layout_mode_scale_weight=2.0,
            layout_mode_strip_rotated=False,
            layout_mode_font_height_weight=0.5,
        )

        assert converter.extraction_mode == PyPDFExtractionMode.LAYOUT
        assert converter.plain_mode_orientations == (0, 90)
        assert converter.plain_mode_space_width == 150.0
        assert converter.layout_mode_space_vertically is False
        assert converter.layout_mode_scale_weight == 2.0
        assert converter.layout_mode_strip_rotated is False
        assert converter.layout_mode_font_height_weight == 0.5

    def test_init_invalid_extraction_mode(self):
        """An unknown extraction mode string is rejected with ``ValueError``."""
        with pytest.raises(ValueError):
            PyPDFToDocument(extraction_mode="invalid")

    def test_to_dict(self, pypdf_component):
        """Serializing a default converter yields its type path and all init parameters."""
        expected_serialization = {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
            "init_parameters": {
                "extraction_mode": "plain",
                "plain_mode_orientations": (0, 90, 180, 270),
                "plain_mode_space_width": 200.0,
                "layout_mode_space_vertically": True,
                "layout_mode_scale_weight": 1.25,
                "layout_mode_strip_rotated": True,
                "layout_mode_font_height_weight": 1.0,
                "store_full_path": False,
            },
        }

        assert pypdf_component.to_dict() == expected_serialization

    def test_from_dict(self):
        """A converter rebuilt from a serialized dict carries the serialized settings."""
        init_parameters = {
            "extraction_mode": "plain",
            "plain_mode_orientations": (0, 90, 180, 270),
            "plain_mode_space_width": 200.0,
            "layout_mode_space_vertically": True,
            "layout_mode_scale_weight": 1.25,
            "layout_mode_strip_rotated": True,
            "layout_mode_font_height_weight": 1.0,
        }
        serialized = {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
            "init_parameters": init_parameters,
        }

        restored = PyPDFToDocument.from_dict(serialized)

        assert isinstance(restored, PyPDFToDocument)
        assert restored.extraction_mode == PyPDFExtractionMode.PLAIN
        assert restored.plain_mode_orientations == (0, 90, 180, 270)
        assert restored.plain_mode_space_width == 200.0
        assert restored.layout_mode_space_vertically is True
        assert restored.layout_mode_scale_weight == 1.25
        assert restored.layout_mode_strip_rotated is True
        assert restored.layout_mode_font_height_weight == 1.0

    def test_from_dict_defaults(self):
        """Deserializing with empty init parameters still yields a plain-mode converter."""
        serialized = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}

        restored = PyPDFToDocument.from_dict(serialized)

        assert isinstance(restored, PyPDFToDocument)
        assert restored.extraction_mode == PyPDFExtractionMode.PLAIN

    def test_default_convert(self):
        """``_default_convert`` joins page texts with a form feed and forwards the settings to each page."""
        # Stand-in pages and reader: only ``extract_text`` and ``pages`` are exercised.
        first_page, second_page = Mock(), Mock()
        first_page.extract_text.return_value = "Page 1 content"
        second_page.extract_text.return_value = "Page 2 content"
        reader = Mock()
        reader.pages = [first_page, second_page]

        converter = PyPDFToDocument(
            extraction_mode="layout",
            plain_mode_orientations=(0, 90),
            plain_mode_space_width=150.0,
            layout_mode_space_vertically=False,
            layout_mode_scale_weight=2.0,
            layout_mode_strip_rotated=False,
            layout_mode_font_height_weight=1.5,
        )

        extracted = converter._default_convert(reader)
        assert extracted == "Page 1 content\fPage 2 content"

        # Keyword arguments each page's ``extract_text`` must have received exactly once.
        expected_kwargs = {
            "extraction_mode": "layout",
            "orientations": (0, 90),
            "space_width": 150.0,
            "layout_mode_space_vertically": False,
            "layout_mode_scale_weight": 2.0,
            "layout_mode_strip_rotated": False,
            "layout_mode_font_height_weight": 1.5,
        }
        for page in reader.pages:
            page.extract_text.assert_called_once_with(**expected_kwargs)

    @pytest.mark.integration
    def test_run(self, test_files_path, pypdf_component):
        """Running on a single sample PDF produces one document containing the expected text."""
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"

        documents = pypdf_component.run(sources=[pdf_path])["documents"]

        assert len(documents) == 1
        assert "History" in documents[0].content

    @pytest.mark.integration
    def test_page_breaks_added(self, test_files_path, pypdf_component):
        """The converted sample PDF contains exactly three form-feed page separators."""
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"

        documents = pypdf_component.run(sources=[pdf_path])["documents"]

        assert len(documents) == 1
        assert docs_form_feeds(documents[0].content) == 3 if False else documents[0].content.count("\f") == 3

    def test_run_with_meta(self, test_files_path, pypdf_component):
        """Metadata passed to ``run`` is applied to every document and overrides clashing source metadata."""
        stream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"

        # The PDF reader is patched out, so no real parsing of the sources takes place.
        with patch("haystack.components.converters.pypdf.PdfReader"):
            result = pypdf_component.run(sources=[stream, pdf_path], meta={"language": "it"})

        # check that the metadata from the bytestream is merged with that from the meta parameter
        documents = result["documents"]
        assert documents[0].meta["author"] == "test_author"
        assert documents[0].meta["language"] == "it"
        assert documents[1].meta["language"] == "it"

    def test_run_with_store_full_path_false(self, test_files_path):
        """
        Check the stored ``file_path`` for both settings of ``store_full_path``:
        the full source path when True, only the file name when False.
        """
        sources = [test_files_path / "pdf" / "sample_pdf_1.pdf"]

        full_path_docs = PyPDFToDocument(store_full_path=True).run(sources=sources)["documents"]
        assert len(full_path_docs) == 1
        assert full_path_docs[0].meta["file_path"] == str(sources[0])

        name_only_docs = PyPDFToDocument(store_full_path=False).run(sources=sources)["documents"]
        assert len(name_only_docs) == 1
        assert name_only_docs[0].meta["file_path"] == "sample_pdf_1.pdf"

    def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
        """A source that cannot be read is reported through a warning log message."""
        missing_sources = ["non_existing_file.pdf"]

        with caplog.at_level(logging.WARNING):
            pypdf_component.run(sources=missing_sources)
            assert "Could not read non_existing_file.pdf" in caplog.text

    @pytest.mark.integration
    def test_mixed_sources_run(self, test_files_path, pypdf_component):
        """A file path and a ``ByteStream`` of the same PDF are both converted in one run."""
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"
        with open(pdf_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()
        sources = [pdf_path, ByteStream(pdf_bytes)]

        documents = pypdf_component.run(sources=sources)["documents"]

        assert len(documents) == 2
        assert "History and standardization" in documents[0].content
        assert "History and standardization" in documents[1].content

    def test_run_empty_document(self, caplog, test_files_path):
        """A PDF without extractable text logs a warning and yields an empty-content document."""
        sources = [test_files_path / "pdf" / "non_text_searchable.pdf"]

        with caplog.at_level(logging.WARNING):
            result = PyPDFToDocument().run(sources=sources)
            assert "PyPDFToDocument could not extract text from the file" in caplog.text

            empty_doc = result["documents"][0]
            assert empty_doc.content == ""

            # Check that meta is used when the returned document is initialized and thus when doc id is generated
            assert empty_doc.meta["file_path"] == "non_text_searchable.pdf"
            assert empty_doc.id != Document(content="").id

    def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path):
        """Layout-mode output of the second sample PDF splits into the expected passages."""
        sources = [test_files_path / "pdf" / "sample_pdf_2.pdf"]
        converted = PyPDFToDocument(extraction_mode=PyPDFExtractionMode.LAYOUT).run(sources=sources)

        passage_splitter = DocumentSplitter(split_length=1, split_by="passage")
        passages = passage_splitter.run(converted["documents"])["documents"]

        assert len(passages) == 51

        expected_passage = (
            "A wiki (/ˈwɪki/ (About this soundlisten) WIK-ee) is a hypertext publication collaboratively\n"
            "edited and managed by its own audience directly using a web browser. A typical wiki\ncontains "
            "multiple pages for the subjects or scope of the project and may be either open\nto the public or "
            "limited to use within an organization for maintaining its internal knowledge\nbase. Wikis are "
            "enabled by wiki software, otherwise known as wiki engines. A wiki engine,\nbeing a form of a "
            "content management system, differs from other web-based systems\nsuch as blog software, in that "
            "the content is created without any defined owner or leader,\nand wikis have little inherent "
            "structure, allowing structure to emerge according to the\nneeds of the users.[1]\n\n"
        )
        assert passages[2].content == expected_passage