feat: Update PyPDFToDocument to process ByteStream inputs (#6021)

* Update PyPDF converter

* Add mixed source unit test

* Update haystack/preview/components/file_converters/pypdf.py

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>

---------

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
This commit is contained in:
Vladimir Blagojevic 2023-10-11 10:52:08 +02:00 committed by GitHub
parent 1a6a8863e8
commit 3803d23ff6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 46 additions and 19 deletions

View File

@ -1,7 +1,9 @@
import io
import logging
from typing import List, Optional, Dict, Any, Union
from pathlib import Path
from haystack.preview.dataclasses import ByteStream
from haystack.preview.lazy_imports import LazyImport
from haystack.preview import Document, component, default_to_dict, default_from_dict
@ -15,12 +17,12 @@ logger = logging.getLogger(__name__)
@component
class PyPDFToDocument:
"""
A component for converting a PDF file to a Document.
Converts a PDF file to a Document.
"""
def __init__(self, id_hash_keys: Optional[List[str]] = None):
"""
Create a PyPDFToDocument component.
Initializes the PyPDFToDocument component.
:param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
attributes. Default: `None`
@ -31,6 +33,7 @@ class PyPDFToDocument:
def to_dict(self) -> Dict[str, Any]:
    """
    Serialize this component to a dictionary.

    :return: The dictionary produced by `default_to_dict` for this instance and its `id_hash_keys` setting.
    """
    # Collect the init parameters first so the delegation call stays short.
    init_params = {"id_hash_keys": self.id_hash_keys}
    return default_to_dict(self, **init_params)
@ -38,25 +41,27 @@ class PyPDFToDocument:
def from_dict(cls, data: Dict[str, Any]) -> "PyPDFToDocument":
"""
Deserialize this component from a dictionary.
:param data: The dictionary containing the component's data.
:return: The component instance.
"""
return default_from_dict(cls, data)
@component.output_types(documents=List[Document])
def run(self, paths: List[Union[str, Path]], id_hash_keys: Optional[List[str]] = None):
def run(self, sources: List[Union[str, Path, ByteStream]], id_hash_keys: Optional[List[str]] = None):
"""
Convert PDF files to Documents.
Converts PDF files to Documents.
:param paths: A list of paths to PDF files.
:param sources: A list of PDF data sources
:param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
attributes. Default: `None`
"""
id_hash_keys = id_hash_keys or self.id_hash_keys
documents = []
for path in paths:
for source in sources:
try:
text = self._read_pdf_file(path)
text = self._read_pdf_file(source)
except Exception as e:
logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e)
logger.warning("Could not read %s. Skipping it. Error message: %s", source, e)
continue
document = Document(text=text, id_hash_keys=id_hash_keys)
@ -64,14 +69,19 @@ class PyPDFToDocument:
return {"documents": documents}
def _read_pdf_file(self, path: Union[str, Path]) -> str:
def _read_pdf_file(self, source: Union[str, Path, ByteStream]) -> str:
"""
Read a PDF file and return its text content.
Extracts content from the given PDF source.
:param source: PDF file data source
:return: The extracted text.
"""
pdf_reader = PdfReader(str(path))
text = ""
for page in pdf_reader.pages:
extracted_text = page.extract_text()
if extracted_text:
text += extracted_text
if isinstance(source, (str, Path)):
pdf_reader = PdfReader(str(source))
elif isinstance(source, ByteStream):
pdf_reader = PdfReader(io.BytesIO(source.data))
else:
raise ValueError(f"Unsupported source type: {type(source)}")
text = "".join(extracted_text for page in pdf_reader.pages if (extracted_text := page.extract_text()))
return text

View File

@ -3,6 +3,7 @@ import logging
import pytest
from haystack.preview.components.file_converters.pypdf import PyPDFToDocument
from haystack.preview.dataclasses import ByteStream
class TestPyPDFToDocument:
@ -31,7 +32,7 @@ class TestPyPDFToDocument:
"""
paths = [preview_samples_path / "pdf" / "react_paper.pdf"]
converter = PyPDFToDocument()
output = converter.run(paths=paths)
output = converter.run(sources=paths)
docs = output["documents"]
assert len(docs) == 1
assert "ReAct" in docs[0].text
@ -44,5 +45,21 @@ class TestPyPDFToDocument:
paths = ["non_existing_file.pdf"]
converter = PyPDFToDocument()
with caplog.at_level(logging.WARNING):
converter.run(paths=paths)
assert "Could not read file non_existing_file.pdf" in caplog.text
converter.run(sources=paths)
assert "Could not read non_existing_file.pdf" in caplog.text
@pytest.mark.unit
def test_mixed_sources_run(self, preview_samples_path):
    """
    Check that one run handles a file path and a ByteStream side by side.

    The same sample PDF is passed twice: once as a path and once as a ByteStream
    built from its raw bytes. Two documents are expected back, each containing "ReAct".
    """
    pdf_path = preview_samples_path / "pdf" / "react_paper.pdf"
    with open(pdf_path, "rb") as pdf_file:
        pdf_stream = ByteStream(pdf_file.read())
    mixed_sources = [pdf_path, pdf_stream]

    result = PyPDFToDocument().run(sources=mixed_sources)
    converted = result["documents"]
    assert len(converted) == 2
    for doc in converted:
        assert "ReAct" in doc.text