feat: Add page breaks to default PDF to Document converter (#6755)

* Speedup tests for PyPDFToDocument

* Added unit test and removed skipping of empty pages

* add release note

* Add back some integration marks
This commit is contained in:
Sebastian Husch Lee 2024-01-18 08:54:59 +01:00 committed by GitHub
parent eaec5bfe4a
commit c0b67432e4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 44 additions and 27 deletions

View File

@ -31,7 +31,7 @@ class DefaultConverter:
def convert(self, reader: "PdfReader") -> Document: def convert(self, reader: "PdfReader") -> Document:
"""Extract text from the PDF and return a Document object with the text content.""" """Extract text from the PDF and return a Document object with the text content."""
text = "".join(page.extract_text() for page in reader.pages if page.extract_text()) text = "\f".join(page.extract_text() for page in reader.pages)
return Document(content=text) return Document(content=text)

View File

@ -28,7 +28,7 @@ def normalize_metadata(
makes sure to return a list of dictionaries of the correct length for the converter to use. makes sure to return a list of dictionaries of the correct length for the converter to use.
:param meta: the meta input of the converter, as-is :param meta: the meta input of the converter, as-is
:sources_count: the number of sources the converter received :param sources_count: the number of sources the converter received
:returns: a list of dictionaries of the same length as the sources list :returns: a list of dictionaries of the same length as the sources list
""" """
if meta is None: if meta is None:

View File

@ -0,0 +1,7 @@
---
upgrade:
- |
Upgraded the default converter in PyPDFToDocument to insert a page break ("\f")
between consecutive extracted pages.
This allows downstream components and applications to keep better track of
the original PDF page a portion of text comes from.

View File

@ -7,35 +7,45 @@ from haystack.components.converters.pypdf import PyPDFToDocument, CONVERTERS_REG
from haystack.dataclasses import ByteStream from haystack.dataclasses import ByteStream
@pytest.mark.integration @pytest.fixture
def pypdf_converter():
return PyPDFToDocument()
class TestPyPDFToDocument: class TestPyPDFToDocument:
def test_init(self): def test_init(self, pypdf_converter):
component = PyPDFToDocument() assert pypdf_converter.converter_name == "default"
assert component.converter_name == "default" assert hasattr(pypdf_converter, "_converter")
assert hasattr(component, "_converter")
def test_init_fail_nonexisting_converter(self): def test_init_fail_nonexisting_converter(self):
with pytest.raises(ValueError): with pytest.raises(ValueError):
PyPDFToDocument(converter_name="non_existing_converter") PyPDFToDocument(converter_name="non_existing_converter")
def test_run(self, test_files_path): @pytest.mark.integration
def test_run(self, test_files_path, pypdf_converter):
""" """
Test if the component runs correctly. Test if the component runs correctly.
""" """
paths = [test_files_path / "pdf" / "react_paper.pdf"] paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
converter = PyPDFToDocument() output = pypdf_converter.run(sources=paths)
output = converter.run(sources=paths)
docs = output["documents"] docs = output["documents"]
assert len(docs) == 1 assert len(docs) == 1
assert "ReAct" in docs[0].content assert "History" in docs[0].content
def test_run_with_meta(self, test_files_path): @pytest.mark.integration
def test_page_breaks_added(self, test_files_path, pypdf_converter):
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
output = pypdf_converter.run(sources=paths)
docs = output["documents"]
assert len(docs) == 1
assert docs[0].content.count("\f") == 3
def test_run_with_meta(self, test_files_path, pypdf_converter):
bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"}) bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
converter = PyPDFToDocument()
with patch("haystack.components.converters.pypdf.PdfReader"): with patch("haystack.components.converters.pypdf.PdfReader"):
output = converter.run( output = pypdf_converter.run(
sources=[bytestream, test_files_path / "pdf" / "react_paper.pdf"], meta={"language": "it"} sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"}
) )
# check that the metadata from the bytestream is merged with that from the meta parameter # check that the metadata from the bytestream is merged with that from the meta parameter
@ -43,38 +53,38 @@ class TestPyPDFToDocument:
assert output["documents"][0].meta["language"] == "it" assert output["documents"][0].meta["language"] == "it"
assert output["documents"][1].meta["language"] == "it" assert output["documents"][1].meta["language"] == "it"
def test_run_error_handling(self, test_files_path, caplog): def test_run_error_handling(self, test_files_path, pypdf_converter, caplog):
""" """
Test if the component correctly handles errors. Test if the component correctly handles errors.
""" """
paths = ["non_existing_file.pdf"] paths = ["non_existing_file.pdf"]
converter = PyPDFToDocument()
with caplog.at_level(logging.WARNING): with caplog.at_level(logging.WARNING):
converter.run(sources=paths) pypdf_converter.run(sources=paths)
assert "Could not read non_existing_file.pdf" in caplog.text assert "Could not read non_existing_file.pdf" in caplog.text
def test_mixed_sources_run(self, test_files_path): @pytest.mark.integration
def test_mixed_sources_run(self, test_files_path, pypdf_converter):
""" """
Test if the component runs correctly when mixed sources are provided. Test if the component runs correctly when mixed sources are provided.
""" """
paths = [test_files_path / "pdf" / "react_paper.pdf"] paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
with open(test_files_path / "pdf" / "react_paper.pdf", "rb") as f: with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f:
paths.append(ByteStream(f.read())) paths.append(ByteStream(f.read()))
converter = PyPDFToDocument() output = pypdf_converter.run(sources=paths)
output = converter.run(sources=paths)
docs = output["documents"] docs = output["documents"]
assert len(docs) == 2 assert len(docs) == 2
assert "ReAct" in docs[0].content assert "History and standardization" in docs[0].content
assert "ReAct" in docs[1].content assert "History and standardization" in docs[1].content
@pytest.mark.integration
def test_custom_converter(self, test_files_path): def test_custom_converter(self, test_files_path):
""" """
Test if the component correctly handles custom converters. Test if the component correctly handles custom converters.
""" """
from pypdf import PdfReader from pypdf import PdfReader
paths = [test_files_path / "pdf" / "react_paper.pdf"] paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
class MyCustomConverter: class MyCustomConverter:
def convert(self, reader: PdfReader) -> Document: def convert(self, reader: PdfReader) -> Document: