mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-08 04:56:45 +00:00
fix!: make PyPDFToDocument JSON-serializable (#6396)
* add registry * release note * add checks * rm superfluous check * fix typo * rm print :-)
This commit is contained in:
parent
a492771b4d
commit
b0b514778d
@ -1,11 +1,11 @@
|
||||
import io
|
||||
import logging
|
||||
from typing import List, Union, Optional, Protocol
|
||||
from typing import List, Union, Protocol, Dict
|
||||
from pathlib import Path
|
||||
|
||||
from haystack.preview.dataclasses import ByteStream
|
||||
from haystack.preview.lazy_imports import LazyImport
|
||||
from haystack.preview import Document, component
|
||||
from haystack.preview import Document, component, default_to_dict
|
||||
|
||||
with LazyImport("Run 'pip install pypdf'") as pypdf_import:
|
||||
from pypdf import PdfReader
|
||||
@ -34,6 +34,11 @@ class DefaultConverter:
|
||||
return Document(content=text)
|
||||
|
||||
|
||||
# This registry is used to store converter names and instances.
|
||||
# It can be used to register custom converters.
|
||||
CONVERTERS_REGISTRY: Dict[str, PyPDFConverter] = {"default": DefaultConverter()}
|
||||
|
||||
|
||||
@component
|
||||
class PyPDFToDocument:
|
||||
"""
|
||||
@ -42,14 +47,27 @@ class PyPDFToDocument:
|
||||
A default text extraction converter is used if no custom converter is provided.
|
||||
"""
|
||||
|
||||
def __init__(self, converter: Optional[PyPDFConverter] = None):
|
||||
def __init__(self, converter_name: str = "default"):
|
||||
"""
|
||||
Initializes the PyPDFToDocument component with an optional custom converter.
|
||||
:param converter: A converter instance that adheres to the PyPDFConverter protocol.
|
||||
If None, the DefaultConverter is used.
|
||||
:param converter_name: A converter name that is registered in the CONVERTERS_REGISTRY.
|
||||
Defaults to 'default'.
|
||||
"""
|
||||
pypdf_import.check()
|
||||
self.converter: PyPDFConverter = converter or DefaultConverter()
|
||||
|
||||
try:
|
||||
converter = CONVERTERS_REGISTRY[converter_name]
|
||||
except KeyError:
|
||||
msg = (
|
||||
f"Invalid converter_name: {converter_name}.\n Available converters: {list(CONVERTERS_REGISTRY.keys())}"
|
||||
)
|
||||
raise ValueError(msg) from KeyError
|
||||
self.converter_name = converter_name
|
||||
self._converter: PyPDFConverter = converter
|
||||
|
||||
def to_dict(self):
|
||||
# do not serialize the _converter instance
|
||||
return default_to_dict(self, converter_name=self.converter_name)
|
||||
|
||||
@component.output_types(documents=List[Document])
|
||||
def run(self, sources: List[Union[str, Path, ByteStream]]):
|
||||
@ -63,7 +81,7 @@ class PyPDFToDocument:
|
||||
for source in sources:
|
||||
try:
|
||||
pdf_reader = self._get_pdf_reader(source)
|
||||
document = self.converter.convert(pdf_reader)
|
||||
document = self._converter.convert(pdf_reader)
|
||||
except Exception as e:
|
||||
logger.warning("Could not read %s and convert it to Document, skipping. %s", source, e)
|
||||
continue
|
||||
|
||||
@ -0,0 +1,5 @@
|
||||
---
|
||||
preview:
|
||||
- |
|
||||
Make `PyPDFToDocument` accept a `converter_name` parameter instead of
|
||||
a `converter` instance, to allow a smooth serialization of the component.
|
||||
@ -3,11 +3,20 @@ import pytest
|
||||
from pypdf import PdfReader
|
||||
|
||||
from haystack.preview import Document
|
||||
from haystack.preview.components.converters.pypdf import PyPDFToDocument
|
||||
from haystack.preview.components.converters.pypdf import PyPDFToDocument, CONVERTERS_REGISTRY
|
||||
from haystack.preview.dataclasses import ByteStream
|
||||
|
||||
|
||||
class TestPyPDFToDocument:
|
||||
def test_init(self):
|
||||
component = PyPDFToDocument()
|
||||
assert component.converter_name == "default"
|
||||
assert hasattr(component, "_converter")
|
||||
|
||||
def test_init_fail_nonexisting_converter(self):
|
||||
with pytest.raises(ValueError):
|
||||
PyPDFToDocument(converter_name="non_existing_converter")
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_run(self, preview_samples_path):
|
||||
"""
|
||||
@ -58,7 +67,9 @@ class TestPyPDFToDocument:
|
||||
def convert(self, reader: PdfReader) -> Document:
|
||||
return Document(content="I don't care about converting given pdfs, I always return this")
|
||||
|
||||
converter = PyPDFToDocument(converter=MyCustomConverter())
|
||||
CONVERTERS_REGISTRY["custom"] = MyCustomConverter()
|
||||
|
||||
converter = PyPDFToDocument(converter_name="custom")
|
||||
output = converter.run(sources=paths)
|
||||
docs = output["documents"]
|
||||
assert len(docs) == 1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user