feat: Update PyPDFToDocument to process ByteStream inputs (#6021)

* Update PyPDF converter

* Add mixed source unit test

* Update haystack/preview/components/file_converters/pypdf.py

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>

---------

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
2026-01-07 20:46:31 +00:00 · 2023-10-11 10:52:08 +02:00 · 2023-10-11 10:52:08 +02:00 · 3803d23ff6
commit 3803d23ff6
parent 1a6a8863e8
2 changed files with 46 additions and 19 deletions
--- a/haystack/preview/components/file_converters/pypdf.py
+++ b/haystack/preview/components/file_converters/pypdf.py
@ -1,7 +1,9 @@
+import io
 import logging
 from typing import List, Optional, Dict, Any, Union
 from pathlib import Path

+from haystack.preview.dataclasses import ByteStream
 from haystack.preview.lazy_imports import LazyImport
 from haystack.preview import Document, component, default_to_dict, default_from_dict

@ -15,12 +17,12 @@ logger = logging.getLogger(__name__)
@component
 class PyPDFToDocument:
    """
-    A component for converting a PDF file to a Document.
+    Converts a PDF file to a Document.
    """

    def __init__(self, id_hash_keys: Optional[List[str]] = None):
        """
-        Create a PyPDFToDocument component.
+        Initializes the PyPDFToDocument component.

        :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
            attributes. Default: `None`
@ -31,6 +33,7 @@ class PyPDFToDocument:
    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.
+        :return: The dictionary containing the component's data.
        """
        return default_to_dict(self, id_hash_keys=self.id_hash_keys)

@ -38,25 +41,27 @@ class PyPDFToDocument:
    def from_dict(cls, data: Dict[str, Any]) -> "PyPDFToDocument":
        """
        Deserialize this component from a dictionary.
+        :param data: The dictionary containing the component's data.
+        :return: The component instance.
        """
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
-    def run(self, paths: List[Union[str, Path]], id_hash_keys: Optional[List[str]] = None):
+    def run(self, sources: List[Union[str, Path, ByteStream]], id_hash_keys: Optional[List[str]] = None):
        """
-        Convert PDF files to Documents.
+        Converts PDF files to Documents.

-        :param paths: A list of paths to PDF files.
+        :param sources: A list of PDF data sources
        :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
            attributes. Default: `None`
        """
        id_hash_keys = id_hash_keys or self.id_hash_keys
        documents = []
-        for path in paths:
+        for source in sources:
            try:
-                text = self._read_pdf_file(path)
+                text = self._read_pdf_file(source)
            except Exception as e:
-                logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e)
+                logger.warning("Could not read %s. Skipping it. Error message: %s", source, e)
                continue

            document = Document(text=text, id_hash_keys=id_hash_keys)
@ -64,14 +69,19 @@ class PyPDFToDocument:

        return {"documents": documents}

-    def _read_pdf_file(self, path: Union[str, Path]) -> str:
+    def _read_pdf_file(self, source: Union[str, Path, ByteStream]) -> str:
        """
-        Read a PDF file and return its text content.
+        Extracts content from the given PDF source.
+        :param source:  PDF file data source
+        :return: The extracted text.
        """
-        pdf_reader = PdfReader(str(path))
-        text = ""
-        for page in pdf_reader.pages:
-            extracted_text = page.extract_text()
-            if extracted_text:
-                text += extracted_text
+        if isinstance(source, (str, Path)):
+            pdf_reader = PdfReader(str(source))
+        elif isinstance(source, ByteStream):
+            pdf_reader = PdfReader(io.BytesIO(source.data))
+        else:
+            raise ValueError(f"Unsupported source type: {type(source)}")
+
+        text = "".join(extracted_text for page in pdf_reader.pages if (extracted_text := page.extract_text()))
+
        return text
--- a/test/preview/components/file_converters/test_pypdf_to_document.py
+++ b/test/preview/components/file_converters/test_pypdf_to_document.py
@ -3,6 +3,7 @@ import logging
 import pytest

 from haystack.preview.components.file_converters.pypdf import PyPDFToDocument
+from haystack.preview.dataclasses import ByteStream


 class TestPyPDFToDocument:
@ -31,7 +32,7 @@ class TestPyPDFToDocument:
        """
        paths = [preview_samples_path / "pdf" / "react_paper.pdf"]
        converter = PyPDFToDocument()
-        output = converter.run(paths=paths)
+        output = converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "ReAct" in docs[0].text
@ -44,5 +45,21 @@ class TestPyPDFToDocument:
        paths = ["non_existing_file.pdf"]
        converter = PyPDFToDocument()
        with caplog.at_level(logging.WARNING):
-            converter.run(paths=paths)
-            assert "Could not read file non_existing_file.pdf" in caplog.text
+            converter.run(sources=paths)
+            assert "Could not read non_existing_file.pdf" in caplog.text
+
+    @pytest.mark.unit
+    def test_mixed_sources_run(self, preview_samples_path):
+        """
+        Test if the component runs correctly when mixed sources are provided.
+        """
+        paths = [preview_samples_path / "pdf" / "react_paper.pdf"]
+        with open(preview_samples_path / "pdf" / "react_paper.pdf", "rb") as f:
+            paths.append(ByteStream(f.read()))
+
+        converter = PyPDFToDocument()
+        output = converter.run(sources=paths)
+        docs = output["documents"]
+        assert len(docs) == 2
+        assert "ReAct" in docs[0].text
+        assert "ReAct" in docs[1].text