mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-06-26 22:00:13 +00:00
feat: PyPDFToDocument
- add new customization parameters (#8574)
* deprecate converter in pypdf * fix linting of MetaFieldGroupingRanker * linting * pypdftodocument: add customization params * fix mypy * incorporate feedback
This commit is contained in:
parent
2440a5ee17
commit
fb42c035c5
@ -4,6 +4,7 @@
|
||||
|
||||
import io
|
||||
import warnings
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Protocol, Union
|
||||
|
||||
@ -39,6 +40,33 @@ class PyPDFConverter(Protocol):
|
||||
...
|
||||
|
||||
|
||||
class PyPDFExtractionMode(Enum):
    """
    The mode to use for extracting text from a PDF.

    Each member's value is the string form of the mode.
    """

    PLAIN = "plain"
    LAYOUT = "layout"

    def __str__(self) -> str:
        """
        Convert a PyPDFExtractionMode enum to a string.

        :returns:
            The member's value (`"plain"` or `"layout"`).
        """
        return self.value

    @staticmethod
    def from_str(string: str) -> "PyPDFExtractionMode":
        """
        Convert a string to a PyPDFExtractionMode enum.

        :param string:
            The value of the desired member. Matching is exact and case-sensitive.
        :returns:
            The member whose value equals `string`.
        :raises ValueError:
            If `string` is not the value of any member.
        """
        try:
            # Enum's own by-value lookup replaces a hand-built value -> member mapping.
            return PyPDFExtractionMode(string)
        except ValueError:
            supported = [mode.value for mode in PyPDFExtractionMode]
            msg = f"Unknown extraction mode '{string}'. Supported modes are: {supported}"
            # The Enum lookup error adds nothing to this message, so keep it out of the traceback.
            raise ValueError(msg) from None
|
||||
|
||||
|
||||
@component
|
||||
class PyPDFToDocument:
|
||||
"""
|
||||
@ -60,13 +88,49 @@ class PyPDFToDocument:
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, converter: Optional[PyPDFConverter] = None):
|
||||
def __init__(
|
||||
self,
|
||||
converter: Optional[PyPDFConverter] = None,
|
||||
*,
|
||||
extraction_mode: Union[str, PyPDFExtractionMode] = PyPDFExtractionMode.PLAIN,
|
||||
plain_mode_orientations: tuple = (0, 90, 180, 270),
|
||||
plain_mode_space_width: float = 200.0,
|
||||
layout_mode_space_vertically: bool = True,
|
||||
layout_mode_scale_weight: float = 1.25,
|
||||
layout_mode_strip_rotated: bool = True,
|
||||
layout_mode_font_height_weight: float = 1.0,
|
||||
):
|
||||
"""
|
||||
Create a PyPDFToDocument component.
|
||||
|
||||
:param converter:
|
||||
An instance of a PyPDFConverter compatible class. This is deprecated and will be removed in Haystack 2.9.0.
|
||||
For in-depth customization of the conversion process, consider implementing a custom component.
|
||||
|
||||
All the following parameters are applied only if `converter` is None.
|
||||
|
||||
:param extraction_mode:
|
||||
The mode to use for extracting text from a PDF.
|
||||
Layout mode is an experimental mode that adheres to the rendered layout of the PDF.
|
||||
:param plain_mode_orientations:
|
||||
Tuple of orientations to look for when extracting text from a PDF in plain mode.
|
||||
Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
|
||||
:param plain_mode_space_width:
|
||||
Forces default space width if not extracted from font.
|
||||
Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
|
||||
:param layout_mode_space_vertically:
|
||||
Whether to include blank lines inferred from y distance + font height.
|
||||
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
|
||||
:param layout_mode_scale_weight:
|
||||
Multiplier for string length when calculating weighted average character width.
|
||||
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
|
||||
:param layout_mode_strip_rotated:
|
||||
Layout mode does not support rotated text. Set to `False` to include rotated text anyway.
|
||||
If rotated text is discovered, layout will be degraded and a warning will be logged.
|
||||
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
|
||||
:param layout_mode_font_height_weight:
|
||||
Multiplier for font height when calculating blank line height.
|
||||
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
|
||||
"""
|
||||
pypdf_import.check()
|
||||
|
||||
@ -79,6 +143,16 @@ class PyPDFToDocument:
|
||||
|
||||
self.converter = converter
|
||||
|
||||
if isinstance(extraction_mode, str):
|
||||
extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)
|
||||
self.extraction_mode = extraction_mode
|
||||
self.plain_mode_orientations = plain_mode_orientations
|
||||
self.plain_mode_space_width = plain_mode_space_width
|
||||
self.layout_mode_space_vertically = layout_mode_space_vertically
|
||||
self.layout_mode_scale_weight = layout_mode_scale_weight
|
||||
self.layout_mode_strip_rotated = layout_mode_strip_rotated
|
||||
self.layout_mode_font_height_weight = layout_mode_font_height_weight
|
||||
|
||||
def to_dict(self):
|
||||
"""
|
||||
Serializes the component to a dictionary.
|
||||
@ -87,7 +161,15 @@ class PyPDFToDocument:
|
||||
Dictionary with serialized data.
|
||||
"""
|
||||
return default_to_dict(
|
||||
self, converter=(serialize_class_instance(self.converter) if self.converter is not None else None)
|
||||
self,
|
||||
converter=(serialize_class_instance(self.converter) if self.converter else None),
|
||||
extraction_mode=str(self.extraction_mode),
|
||||
plain_mode_orientations=self.plain_mode_orientations,
|
||||
plain_mode_space_width=self.plain_mode_space_width,
|
||||
layout_mode_space_vertically=self.layout_mode_space_vertically,
|
||||
layout_mode_scale_weight=self.layout_mode_scale_weight,
|
||||
layout_mode_strip_rotated=self.layout_mode_strip_rotated,
|
||||
layout_mode_font_height_weight=self.layout_mode_font_height_weight,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@ -108,7 +190,20 @@ class PyPDFToDocument:
|
||||
return default_from_dict(cls, data)
|
||||
|
||||
def _default_convert(self, reader: "PdfReader") -> Document:
    """
    Build a single Document from the text of every page in `reader`.

    Each page is extracted exactly once, using the extraction settings stored on this component.
    The per-page texts are joined with a form feed character so page boundaries stay visible
    in the resulting content.

    :param reader:
        The reader whose `pages` are iterated and extracted.
    :returns:
        A Document whose content is the joined text of all pages.
    """
    # Plain-mode and layout-mode settings are always passed together;
    # `extraction_mode` tells the extractor which mode to use.
    text = "\f".join(
        page.extract_text(
            orientations=self.plain_mode_orientations,
            extraction_mode=self.extraction_mode.value,
            space_width=self.plain_mode_space_width,
            layout_mode_space_vertically=self.layout_mode_space_vertically,
            layout_mode_scale_weight=self.layout_mode_scale_weight,
            layout_mode_strip_rotated=self.layout_mode_strip_rotated,
            layout_mode_font_height_weight=self.layout_mode_font_height_weight,
        )
        for page in reader.pages
    )
    return Document(content=text)
|
||||
|
||||
@component.output_types(documents=List[Document])
|
||||
|
@ -0,0 +1,5 @@
|
||||
---
|
||||
enhancements:
|
||||
- |
|
||||
Added new initialization parameters to the `PyPDFToDocument` component to customize the text extraction process
|
||||
from PDF files.
|
@ -2,17 +2,17 @@
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import logging
|
||||
from unittest.mock import patch
|
||||
from unittest.mock import patch, Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from haystack import Document, default_from_dict, default_to_dict
|
||||
from haystack.components.converters.pypdf import PyPDFToDocument
|
||||
from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
|
||||
from haystack.dataclasses import ByteStream
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def pypdf_converter():
|
||||
def pypdf_component():
|
||||
return PyPDFToDocument()
|
||||
|
||||
|
||||
@ -30,38 +30,104 @@ class CustomConverter:
|
||||
|
||||
|
||||
class TestPyPDFToDocument:
|
||||
def test_init(self, pypdf_converter):
|
||||
assert pypdf_converter.converter is None
|
||||
def test_init(self, pypdf_component):
|
||||
assert pypdf_component.converter is None
|
||||
assert pypdf_component.extraction_mode == PyPDFExtractionMode.PLAIN
|
||||
assert pypdf_component.plain_mode_orientations == (0, 90, 180, 270)
|
||||
assert pypdf_component.plain_mode_space_width == 200.0
|
||||
assert pypdf_component.layout_mode_space_vertically is True
|
||||
assert pypdf_component.layout_mode_scale_weight == 1.25
|
||||
assert pypdf_component.layout_mode_strip_rotated is True
|
||||
assert pypdf_component.layout_mode_font_height_weight == 1.0
|
||||
|
||||
def test_init_params(self):
|
||||
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
|
||||
assert isinstance(pypdf_converter.converter, CustomConverter)
|
||||
def test_init_converter(self):
|
||||
pypdf_component = PyPDFToDocument(converter=CustomConverter())
|
||||
assert isinstance(pypdf_component.converter, CustomConverter)
|
||||
|
||||
def test_to_dict(self, pypdf_converter):
|
||||
data = pypdf_converter.to_dict()
|
||||
def test_init_custom_params(self):
|
||||
pypdf_component = PyPDFToDocument(
|
||||
extraction_mode="layout",
|
||||
plain_mode_orientations=(0, 90),
|
||||
plain_mode_space_width=150.0,
|
||||
layout_mode_space_vertically=False,
|
||||
layout_mode_scale_weight=2.0,
|
||||
layout_mode_strip_rotated=False,
|
||||
layout_mode_font_height_weight=0.5,
|
||||
)
|
||||
|
||||
assert pypdf_component.extraction_mode == PyPDFExtractionMode.LAYOUT
|
||||
assert pypdf_component.plain_mode_orientations == (0, 90)
|
||||
assert pypdf_component.plain_mode_space_width == 150.0
|
||||
assert pypdf_component.layout_mode_space_vertically is False
|
||||
assert pypdf_component.layout_mode_scale_weight == 2.0
|
||||
assert pypdf_component.layout_mode_strip_rotated is False
|
||||
assert pypdf_component.layout_mode_font_height_weight == 0.5
|
||||
|
||||
def test_init_invalid_extraction_mode(self):
|
||||
with pytest.raises(ValueError):
|
||||
PyPDFToDocument(extraction_mode="invalid")
|
||||
|
||||
def test_to_dict(self, pypdf_component):
|
||||
data = pypdf_component.to_dict()
|
||||
assert data == {
|
||||
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
|
||||
"init_parameters": {"converter": None},
|
||||
"init_parameters": {
|
||||
"converter": None,
|
||||
"extraction_mode": "plain",
|
||||
"plain_mode_orientations": (0, 90, 180, 270),
|
||||
"plain_mode_space_width": 200.0,
|
||||
"layout_mode_space_vertically": True,
|
||||
"layout_mode_scale_weight": 1.25,
|
||||
"layout_mode_strip_rotated": True,
|
||||
"layout_mode_font_height_weight": 1.0,
|
||||
},
|
||||
}
|
||||
|
||||
def test_to_dict_custom_converter(self):
|
||||
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
|
||||
data = pypdf_converter.to_dict()
|
||||
pypdf_component = PyPDFToDocument(converter=CustomConverter())
|
||||
data = pypdf_component.to_dict()
|
||||
assert data == {
|
||||
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
|
||||
"init_parameters": {
|
||||
"converter": {
|
||||
"data": {"key": "value", "more": False},
|
||||
"type": "converters.test_pypdf_to_document.CustomConverter",
|
||||
}
|
||||
},
|
||||
"extraction_mode": "plain",
|
||||
"plain_mode_orientations": (0, 90, 180, 270),
|
||||
"plain_mode_space_width": 200.0,
|
||||
"layout_mode_space_vertically": True,
|
||||
"layout_mode_scale_weight": 1.25,
|
||||
"layout_mode_strip_rotated": True,
|
||||
"layout_mode_font_height_weight": 1.0,
|
||||
},
|
||||
}
|
||||
|
||||
def test_from_dict(self):
|
||||
data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {"converter": None}}
|
||||
data = {
|
||||
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
|
||||
"init_parameters": {
|
||||
"converter": None,
|
||||
"extraction_mode": "plain",
|
||||
"plain_mode_orientations": (0, 90, 180, 270),
|
||||
"plain_mode_space_width": 200.0,
|
||||
"layout_mode_space_vertically": True,
|
||||
"layout_mode_scale_weight": 1.25,
|
||||
"layout_mode_strip_rotated": True,
|
||||
"layout_mode_font_height_weight": 1.0,
|
||||
},
|
||||
}
|
||||
|
||||
instance = PyPDFToDocument.from_dict(data)
|
||||
assert isinstance(instance, PyPDFToDocument)
|
||||
assert instance.converter is None
|
||||
assert instance.extraction_mode == PyPDFExtractionMode.PLAIN
|
||||
assert instance.plain_mode_orientations == (0, 90, 180, 270)
|
||||
assert instance.plain_mode_space_width == 200.0
|
||||
assert instance.layout_mode_space_vertically is True
|
||||
assert instance.layout_mode_scale_weight == 1.25
|
||||
assert instance.layout_mode_strip_rotated is True
|
||||
assert instance.layout_mode_font_height_weight == 1.0
|
||||
|
||||
def test_from_dict_defaults(self):
|
||||
data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}
|
||||
@ -83,30 +149,63 @@ class TestPyPDFToDocument:
|
||||
assert isinstance(instance, PyPDFToDocument)
|
||||
assert isinstance(instance.converter, CustomConverter)
|
||||
|
||||
def test_default_convert(self):
|
||||
mock_page1 = Mock()
|
||||
mock_page2 = Mock()
|
||||
mock_page1.extract_text.return_value = "Page 1 content"
|
||||
mock_page2.extract_text.return_value = "Page 2 content"
|
||||
mock_reader = Mock()
|
||||
mock_reader.pages = [mock_page1, mock_page2]
|
||||
|
||||
converter = PyPDFToDocument(
|
||||
extraction_mode="layout",
|
||||
plain_mode_orientations=(0, 90),
|
||||
plain_mode_space_width=150.0,
|
||||
layout_mode_space_vertically=False,
|
||||
layout_mode_scale_weight=2.0,
|
||||
layout_mode_strip_rotated=False,
|
||||
layout_mode_font_height_weight=1.5,
|
||||
)
|
||||
|
||||
doc = converter._default_convert(mock_reader)
|
||||
assert doc.content == "Page 1 content\fPage 2 content"
|
||||
|
||||
expected_params = {
|
||||
"extraction_mode": "layout",
|
||||
"orientations": (0, 90),
|
||||
"space_width": 150.0,
|
||||
"layout_mode_space_vertically": False,
|
||||
"layout_mode_scale_weight": 2.0,
|
||||
"layout_mode_strip_rotated": False,
|
||||
"layout_mode_font_height_weight": 1.5,
|
||||
}
|
||||
for mock_page in mock_reader.pages:
|
||||
mock_page.extract_text.assert_called_once_with(**expected_params)
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_run(self, test_files_path, pypdf_converter):
|
||||
def test_run(self, test_files_path, pypdf_component):
|
||||
"""
|
||||
Test if the component runs correctly.
|
||||
"""
|
||||
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
|
||||
output = pypdf_converter.run(sources=paths)
|
||||
output = pypdf_component.run(sources=paths)
|
||||
docs = output["documents"]
|
||||
assert len(docs) == 1
|
||||
assert "History" in docs[0].content
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_page_breaks_added(self, test_files_path, pypdf_converter):
|
||||
def test_page_breaks_added(self, test_files_path, pypdf_component):
|
||||
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
|
||||
output = pypdf_converter.run(sources=paths)
|
||||
output = pypdf_component.run(sources=paths)
|
||||
docs = output["documents"]
|
||||
assert len(docs) == 1
|
||||
assert docs[0].content.count("\f") == 3
|
||||
|
||||
def test_run_with_meta(self, test_files_path, pypdf_converter):
|
||||
def test_run_with_meta(self, test_files_path, pypdf_component):
|
||||
bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
|
||||
|
||||
with patch("haystack.components.converters.pypdf.PdfReader"):
|
||||
output = pypdf_converter.run(
|
||||
output = pypdf_component.run(
|
||||
sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"}
|
||||
)
|
||||
|
||||
@ -115,17 +214,17 @@ class TestPyPDFToDocument:
|
||||
assert output["documents"][0].meta["language"] == "it"
|
||||
assert output["documents"][1].meta["language"] == "it"
|
||||
|
||||
def test_run_error_handling(self, test_files_path, pypdf_converter, caplog):
|
||||
def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
|
||||
"""
|
||||
Test if the component correctly handles errors.
|
||||
"""
|
||||
paths = ["non_existing_file.pdf"]
|
||||
with caplog.at_level(logging.WARNING):
|
||||
pypdf_converter.run(sources=paths)
|
||||
pypdf_component.run(sources=paths)
|
||||
assert "Could not read non_existing_file.pdf" in caplog.text
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_mixed_sources_run(self, test_files_path, pypdf_converter):
|
||||
def test_mixed_sources_run(self, test_files_path, pypdf_component):
|
||||
"""
|
||||
Test if the component runs correctly when mixed sources are provided.
|
||||
"""
|
||||
@ -133,7 +232,7 @@ class TestPyPDFToDocument:
|
||||
with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f:
|
||||
paths.append(ByteStream(f.read()))
|
||||
|
||||
output = pypdf_converter.run(sources=paths)
|
||||
output = pypdf_component.run(sources=paths)
|
||||
docs = output["documents"]
|
||||
assert len(docs) == 2
|
||||
assert "History and standardization" in docs[0].content
|
||||
|
Loading…
x
Reference in New Issue
Block a user