feat: Add PyPDFToDocument component (2.0) (#5850)

* Initial PyPDFToDocument implementation * Remove progress bar * Add release note * Minor fix * import check and dependency --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
2025-06-26 22:00:13 +00:00 · 2023-09-21 11:52:26 +02:00 · 2023-09-21 11:52:26 +02:00 · 92a6221927
commit 92a6221927
parent 23fdef929e
5 changed files with 130 additions and 0 deletions
--- a/haystack/preview/components/file_converters/pypdf.py
+++ b/haystack/preview/components/file_converters/pypdf.py
@ -0,0 +1,77 @@
+import logging
+from typing import List, Optional, Dict, Any, Union
+from pathlib import Path
+
+from haystack.preview.lazy_imports import LazyImport
+from haystack.preview import Document, component, default_to_dict, default_from_dict
+
+with LazyImport("Run 'pip install pypdf'") as pypdf_import:
+    from pypdf import PdfReader
+
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class PyPDFToDocument:
+    """
+    A component that converts PDF files to Documents by extracting their text with pypdf.
+    """
+
+    def __init__(self, id_hash_keys: Optional[List[str]] = None):
+        """
+        Create a PyPDFToDocument component.
+
+        :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
+            attributes. `None` is stored as an empty list. Default: `None`
+        """
+        pypdf_import.check()
+        self.id_hash_keys = id_hash_keys or []
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        """
+        return default_to_dict(self, id_hash_keys=self.id_hash_keys)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "PyPDFToDocument":
+        """
+        Deserialize this component from a dictionary.
+        """
+        return default_from_dict(cls, data)
+
+    @component.output_types(documents=List[Document])
+    def run(self, paths: List[Union[str, Path]], id_hash_keys: Optional[List[str]] = None):
+        """
+        Convert PDF files to Documents. Files that cannot be read are logged and skipped.
+
+        :param paths: A list of paths to PDF files.
+        :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
+            attributes. Falls back to the value set at init when empty or `None`. Default: `None`
+        :return: A dictionary holding the list of converted Documents under the `documents` key.
+        """
+        id_hash_keys = id_hash_keys or self.id_hash_keys
+        documents = []
+        for path in paths:
+            try:
+                text = self._read_pdf_file(path)
+            except Exception as e:
+                # Best effort: one unreadable file must not abort the conversion of the others.
+                logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e)
+                continue
+
+            document = Document(text=text, id_hash_keys=id_hash_keys)
+            documents.append(document)
+
+        return {"documents": documents}
+
+    def _read_pdf_file(self, path: Union[str, Path]) -> str:
+        """
+        Read a PDF file and return its text content.
+
+        The text of all pages is concatenated in page order without a separator.
+        """
+        pdf_reader = PdfReader(str(path))
+        # A page may yield no text; substitute "" so that join() only ever receives strings.
+        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
--- a/pyproject.toml
+++ b/pyproject.toml
@ -83,6 +83,7 @@ dependencies = [
  "openai",
  "Jinja2",
  "openai-whisper",  # FIXME https://github.com/deepset-ai/haystack/issues/5731
+  "pypdf",

  # Agent events
  "events",
--- a/releasenotes/notes/add-pypdf-to-document-converter-4a39c29abc4da7ba.yaml
+++ b/releasenotes/notes/add-pypdf-to-document-converter-4a39c29abc4da7ba.yaml
@ -0,0 +1,4 @@
+---
+preview:
+  - |
+    Add `PyPDFToDocument`, a component that converts PDF files to Documents using the pypdf library.
--- a/test/preview/components/file_converters/test_pypdf_to_document.py
+++ b/test/preview/components/file_converters/test_pypdf_to_document.py
@ -0,0 +1,48 @@
+import logging
+
+import pytest
+
+from haystack.preview.components.file_converters.pypdf import PyPDFToDocument
+
+
+class TestPyPDFToDocument:
+    @pytest.mark.unit
+    def test_to_dict(self):
+        component = PyPDFToDocument()
+        data = component.to_dict()
+        assert data == {"type": "PyPDFToDocument", "init_parameters": {"id_hash_keys": []}}
+
+    @pytest.mark.unit
+    def test_to_dict_with_custom_init_parameters(self):
+        component = PyPDFToDocument(id_hash_keys=["name"])
+        data = component.to_dict()
+        assert data == {"type": "PyPDFToDocument", "init_parameters": {"id_hash_keys": ["name"]}}
+
+    @pytest.mark.unit
+    def test_from_dict(self):
+        data = {"type": "PyPDFToDocument", "init_parameters": {"id_hash_keys": ["name"]}}
+        component = PyPDFToDocument.from_dict(data)
+        assert component.id_hash_keys == ["name"]
+
+    @pytest.mark.unit
+    def test_run(self, preview_samples_path):
+        """
+        Test if the component runs correctly.
+        """
+        paths = [preview_samples_path / "pdf" / "react_paper.pdf"]
+        converter = PyPDFToDocument()
+        docs = converter.run(paths=paths)["documents"]
+        assert len(docs) == 1
+        assert "ReAct" in docs[0].text
+
+    @pytest.mark.unit
+    def test_run_error_handling(self, caplog):
+        """
+        Test that an unreadable file is logged and skipped instead of raising.
+        """
+        paths = ["non_existing_file.pdf"]
+        converter = PyPDFToDocument()
+        with caplog.at_level(logging.WARNING):
+            output = converter.run(paths=paths)
+        assert "Could not read file non_existing_file.pdf" in caplog.text
+        assert output["documents"] == []
--- a/test/preview/test_files/pdf/react_paper.pdf
+++ b/test/preview/test_files/pdf/react_paper.pdf