mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-06-26 22:00:13 +00:00
feat: Add PyPDFToDocument component (2.0) (#5850)
* Initial PyPDFToDocument implementation * Remove progress bar * Add release note * Minor fix * import check and dependency --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
This commit is contained in:
parent
23fdef929e
commit
92a6221927
77
haystack/preview/components/file_converters/pypdf.py
Normal file
77
haystack/preview/components/file_converters/pypdf.py
Normal file
@ -0,0 +1,77 @@
|
||||
import logging
|
||||
from typing import List, Optional, Dict, Any, Union
|
||||
from pathlib import Path
|
||||
|
||||
from haystack.preview.lazy_imports import LazyImport
|
||||
from haystack.preview import Document, component, default_to_dict, default_from_dict
|
||||
|
||||
with LazyImport("Run 'pip install pypdf'") as pypdf_import:
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@component
class PyPDFToDocument:
    """
    A component for converting PDF files to Documents.

    Each input file that can be read yields one Document whose text is the concatenation of the text
    extracted from every page of the PDF.
    """

    def __init__(self, id_hash_keys: Optional[List[str]] = None):
        """
        Create a PyPDFToDocument component.

        :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
            attributes. Default: `None`
        """
        # Check the lazily imported pypdf dependency before the component is used.
        pypdf_import.check()
        self.id_hash_keys = id_hash_keys or []

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        :return: The dictionary produced by `default_to_dict`, carrying `id_hash_keys` as init parameter.
        """
        return default_to_dict(self, id_hash_keys=self.id_hash_keys)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PyPDFToDocument":
        """
        Deserialize this component from a dictionary.

        :param data: The dictionary to build the component from.
        :return: The component created by `default_from_dict`.
        """
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
    def run(self, paths: List[Union[str, Path]], id_hash_keys: Optional[List[str]] = None):
        """
        Convert PDF files to Documents.

        Files that cannot be read are skipped and a warning is logged for each of them.

        :param paths: A list of paths to PDF files.
        :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
            attributes. Default: `None`
        :return: A dictionary with a `documents` key holding one Document per successfully read file.
        """
        # A missing or empty per-call value falls back to the keys given at construction time.
        id_hash_keys = id_hash_keys or self.id_hash_keys
        documents = []
        for path in paths:
            try:
                text = self._read_pdf_file(path)
            except Exception as e:
                # Best effort: one unreadable file must not abort the conversion of the others.
                logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e)
                continue

            documents.append(Document(text=text, id_hash_keys=id_hash_keys))

        return {"documents": documents}

    def _read_pdf_file(self, path: Union[str, Path]) -> str:
        """
        Read a PDF file and return its text content.

        :param path: Path to the PDF file.
        :return: The text extracted from all pages, concatenated in page order. Pages for which
            no text is extracted contribute nothing.
        """
        pdf_reader = PdfReader(str(path))
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        # Join once instead of growing a string page by page; falsy results are dropped.
        return "".join(page_text for page_text in page_texts if page_text)
|
@ -83,6 +83,7 @@ dependencies = [
|
||||
"openai",
|
||||
"Jinja2",
|
||||
"openai-whisper", # FIXME https://github.com/deepset-ai/haystack/issues/5731
|
||||
"pypdf",
|
||||
|
||||
# Agent events
|
||||
"events",
|
||||
|
@ -0,0 +1,4 @@
|
||||
---
|
||||
preview:
|
||||
- |
|
||||
Adds support for converting PDF files to Documents via the pypdf library.
|
@ -0,0 +1,48 @@
|
||||
import logging
|
||||
|
||||
import pytest
|
||||
|
||||
from haystack.preview.components.file_converters.pypdf import PyPDFToDocument
|
||||
|
||||
|
||||
class TestPyPDFToDocument:
    """Unit tests for the PyPDFToDocument converter component."""

    @pytest.mark.unit
    def test_to_dict(self):
        """A default component serializes with an empty `id_hash_keys` list."""
        expected = {"type": "PyPDFToDocument", "init_parameters": {"id_hash_keys": []}}
        assert PyPDFToDocument().to_dict() == expected

    @pytest.mark.unit
    def test_to_dict_with_custom_init_parameters(self):
        """Custom `id_hash_keys` are carried into the serialized form."""
        expected = {"type": "PyPDFToDocument", "init_parameters": {"id_hash_keys": ["name"]}}
        serialized = PyPDFToDocument(id_hash_keys=["name"]).to_dict()
        assert serialized == expected

    @pytest.mark.unit
    def test_from_dict(self):
        """A component rebuilt from a dictionary restores its `id_hash_keys`."""
        restored = PyPDFToDocument.from_dict(
            {"type": "PyPDFToDocument", "init_parameters": {"id_hash_keys": ["name"]}}
        )
        assert restored.id_hash_keys == ["name"]

    @pytest.mark.unit
    def test_run(self, preview_samples_path):
        """
        Test if the component runs correctly.
        """
        pdf_file = preview_samples_path / "pdf" / "react_paper.pdf"
        result = PyPDFToDocument().run(paths=[pdf_file])
        documents = result["documents"]
        assert len(documents) == 1
        assert "ReAct" in documents[0].text

    @pytest.mark.unit
    def test_run_error_handling(self, preview_samples_path, caplog):
        """
        Test if the component correctly handles errors.
        """
        converter = PyPDFToDocument()
        with caplog.at_level(logging.WARNING):
            converter.run(paths=["non_existing_file.pdf"])
            assert "Could not read file non_existing_file.pdf" in caplog.text
|
BIN
test/preview/test_files/pdf/react_paper.pdf
Normal file
BIN
test/preview/test_files/pdf/react_paper.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user