diff --git a/haystack/preview/components/converters/pypdf.py b/haystack/preview/components/converters/pypdf.py
index 2568b9bc4..ede7da381 100644
--- a/haystack/preview/components/converters/pypdf.py
+++ b/haystack/preview/components/converters/pypdf.py
@@ -1,11 +1,11 @@
 import io
 import logging
-from typing import List, Union, Optional, Protocol
+from typing import List, Union, Protocol, Dict
 from pathlib import Path
 
 from haystack.preview.dataclasses import ByteStream
 from haystack.preview.lazy_imports import LazyImport
-from haystack.preview import Document, component
+from haystack.preview import Document, component, default_to_dict
 
 with LazyImport("Run 'pip install pypdf'") as pypdf_import:
     from pypdf import PdfReader
@@ -34,6 +34,11 @@ class DefaultConverter:
         return Document(content=text)
 
 
+# This registry is used to store converters names and instances.
+# It can be used to register custom converters.
+CONVERTERS_REGISTRY: Dict[str, PyPDFConverter] = {"default": DefaultConverter()}
+
+
 @component
 class PyPDFToDocument:
     """
@@ -42,14 +47,33 @@ class PyPDFToDocument:
     A default text extraction converter is used if no custom converter is provided.
     """
 
-    def __init__(self, converter: Optional[PyPDFConverter] = None):
+    def __init__(self, converter_name: str = "default"):
         """
-        Initializes the PyPDFToDocument component with an optional custom converter.
-        :param converter: A converter instance that adheres to the PyPDFConverter protocol.
-            If None, the DefaultConverter is used.
+        Initializes the PyPDFToDocument component with the converter registered under `converter_name`.
+
+        :param converter_name: A converter name that is registered in the CONVERTERS_REGISTRY.
+            Defaults to 'default'.
+        :raises ValueError: If `converter_name` is not registered in the CONVERTERS_REGISTRY.
         """
         pypdf_import.check()
-        self.converter: PyPDFConverter = converter or DefaultConverter()
+
+        try:
+            converter = CONVERTERS_REGISTRY[converter_name]
+        except KeyError as err:
+            msg = (
+                f"Invalid converter_name: {converter_name}.\n Available converters: {list(CONVERTERS_REGISTRY.keys())}"
+            )
+            # Chain from the caught exception instance so the traceback shows the failed lookup.
+            raise ValueError(msg) from err
+        self.converter_name = converter_name
+        self._converter: PyPDFConverter = converter
+
+    def to_dict(self):
+        """
+        Serializes the component to a dictionary. Only `converter_name` is stored: the `_converter`
+        instance is not serialized and is looked up again in the CONVERTERS_REGISTRY on init.
+        """
+        return default_to_dict(self, converter_name=self.converter_name)
 
     @component.output_types(documents=List[Document])
     def run(self, sources: List[Union[str, Path, ByteStream]]):
@@ -63,7 +87,7 @@ class PyPDFToDocument:
         for source in sources:
             try:
                 pdf_reader = self._get_pdf_reader(source)
-                document = self.converter.convert(pdf_reader)
+                document = self._converter.convert(pdf_reader)
             except Exception as e:
                 logger.warning("Could not read %s and convert it to Document, skipping. %s", source, e)
                 continue
diff --git a/releasenotes/notes/fix-pypdf-serialization-93744fd01ef5b841.yaml b/releasenotes/notes/fix-pypdf-serialization-93744fd01ef5b841.yaml
new file mode 100644
index 000000000..04d042c6f
--- /dev/null
+++ b/releasenotes/notes/fix-pypdf-serialization-93744fd01ef5b841.yaml
@@ -0,0 +1,5 @@
+---
+preview:
+  - |
+    Make `PyPDFToDocument` accept a `converter_name` parameter instead of
+    a `converter` instance, to allow a smooth serialization of the component.
diff --git a/test/preview/components/converters/test_pypdf_to_document.py b/test/preview/components/converters/test_pypdf_to_document.py
index 4e4283ca4..e7fb0202f 100644
--- a/test/preview/components/converters/test_pypdf_to_document.py
+++ b/test/preview/components/converters/test_pypdf_to_document.py
@@ -3,11 +3,28 @@ import pytest
 from pypdf import PdfReader
 
 from haystack.preview import Document
-from haystack.preview.components.converters.pypdf import PyPDFToDocument
+from haystack.preview.components.converters.pypdf import PyPDFToDocument, CONVERTERS_REGISTRY
 from haystack.preview.dataclasses import ByteStream
 
 
 class TestPyPDFToDocument:
+    @pytest.mark.unit
+    def test_init(self):
+        """
+        Test that the component uses the 'default' converter when no name is given.
+        """
+        component = PyPDFToDocument()
+        assert component.converter_name == "default"
+        assert hasattr(component, "_converter")
+
+    @pytest.mark.unit
+    def test_init_fail_nonexisting_converter(self):
+        """
+        Test that an unregistered converter name raises a ValueError.
+        """
+        with pytest.raises(ValueError):
+            PyPDFToDocument(converter_name="non_existing_converter")
+
     @pytest.mark.unit
     def test_run(self, preview_samples_path):
         """
@@ -58,7 +75,10 @@ class TestPyPDFToDocument:
             def convert(self, reader: PdfReader) -> Document:
                 return Document(content="I don't care about converting given pdfs, I always return this")
 
-        converter = PyPDFToDocument(converter=MyCustomConverter())
+        # NOTE: this mutates the module-level registry; the "custom" entry is not removed afterwards.
+        CONVERTERS_REGISTRY["custom"] = MyCustomConverter()
+
+        converter = PyPDFToDocument(converter_name="custom")
         output = converter.run(sources=paths)
         docs = output["documents"]
         assert len(docs) == 1