feat: Add page breaks to default PDF to Document converter (#6755)

* Speedup tests for PyPDFToDocument

* Added unit test and removed skipping of empty pages

* add release note

* Add back some integration marks
This commit is contained in:
Sebastian Husch Lee 2024-01-18 08:54:59 +01:00 committed by GitHub
parent eaec5bfe4a
commit c0b67432e4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 44 additions and 27 deletions

View File

@ -31,7 +31,7 @@ class DefaultConverter:
def convert(self, reader: "PdfReader") -> Document:
"""Extract text from the PDF and return a Document object with the text content."""
text = "".join(page.extract_text() for page in reader.pages if page.extract_text())
text = "\f".join(page.extract_text() for page in reader.pages)
return Document(content=text)

View File

@ -28,7 +28,7 @@ def normalize_metadata(
makes sure to return a list of dictionaries of the correct length for the converter to use.
:param meta: the meta input of the converter, as-is
:sources_count: the number of sources the converter received
:param sources_count: the number of sources the converter received
:returns: a list of dictionaries of the make length as the sources list
"""
if meta is None:

View File

@ -0,0 +1,7 @@
---
upgrade:
- |
Upgraded the default converter in PyPDFToDocument to insert page breaks "\f"
between each extracted page.
This allows for downstream components and applications to better be able to
keep track of the original PDF page a portion of text comes from.

View File

@ -7,35 +7,45 @@ from haystack.components.converters.pypdf import PyPDFToDocument, CONVERTERS_REG
from haystack.dataclasses import ByteStream
@pytest.mark.integration
@pytest.fixture
def pypdf_converter():
return PyPDFToDocument()
class TestPyPDFToDocument:
def test_init(self):
component = PyPDFToDocument()
assert component.converter_name == "default"
assert hasattr(component, "_converter")
def test_init(self, pypdf_converter):
assert pypdf_converter.converter_name == "default"
assert hasattr(pypdf_converter, "_converter")
def test_init_fail_nonexisting_converter(self):
with pytest.raises(ValueError):
PyPDFToDocument(converter_name="non_existing_converter")
def test_run(self, test_files_path):
@pytest.mark.integration
def test_run(self, test_files_path, pypdf_converter):
"""
Test if the component runs correctly.
"""
paths = [test_files_path / "pdf" / "react_paper.pdf"]
converter = PyPDFToDocument()
output = converter.run(sources=paths)
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
output = pypdf_converter.run(sources=paths)
docs = output["documents"]
assert len(docs) == 1
assert "ReAct" in docs[0].content
assert "History" in docs[0].content
def test_run_with_meta(self, test_files_path):
@pytest.mark.integration
def test_page_breaks_added(self, test_files_path, pypdf_converter):
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
output = pypdf_converter.run(sources=paths)
docs = output["documents"]
assert len(docs) == 1
assert docs[0].content.count("\f") == 3
def test_run_with_meta(self, test_files_path, pypdf_converter):
bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
converter = PyPDFToDocument()
with patch("haystack.components.converters.pypdf.PdfReader"):
output = converter.run(
sources=[bytestream, test_files_path / "pdf" / "react_paper.pdf"], meta={"language": "it"}
output = pypdf_converter.run(
sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"}
)
# check that the metadata from the bytestream is merged with that from the meta parameter
@ -43,38 +53,38 @@ class TestPyPDFToDocument:
assert output["documents"][0].meta["language"] == "it"
assert output["documents"][1].meta["language"] == "it"
def test_run_error_handling(self, test_files_path, caplog):
def test_run_error_handling(self, test_files_path, pypdf_converter, caplog):
"""
Test if the component correctly handles errors.
"""
paths = ["non_existing_file.pdf"]
converter = PyPDFToDocument()
with caplog.at_level(logging.WARNING):
converter.run(sources=paths)
pypdf_converter.run(sources=paths)
assert "Could not read non_existing_file.pdf" in caplog.text
def test_mixed_sources_run(self, test_files_path):
@pytest.mark.integration
def test_mixed_sources_run(self, test_files_path, pypdf_converter):
"""
Test if the component runs correctly when mixed sources are provided.
"""
paths = [test_files_path / "pdf" / "react_paper.pdf"]
with open(test_files_path / "pdf" / "react_paper.pdf", "rb") as f:
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f:
paths.append(ByteStream(f.read()))
converter = PyPDFToDocument()
output = converter.run(sources=paths)
output = pypdf_converter.run(sources=paths)
docs = output["documents"]
assert len(docs) == 2
assert "ReAct" in docs[0].content
assert "ReAct" in docs[1].content
assert "History and standardization" in docs[0].content
assert "History and standardization" in docs[1].content
@pytest.mark.integration
def test_custom_converter(self, test_files_path):
"""
Test if the component correctly handles custom converters.
"""
from pypdf import PdfReader
paths = [test_files_path / "pdf" / "react_paper.pdf"]
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
class MyCustomConverter:
def convert(self, reader: PdfReader) -> Document: