mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-09-25 16:15:35 +00:00
feat: Add page breaks to default PDF to Document converter (#6755)
* Speedup tests for PyPDFToDocument * Added unit test and removed skipping of empty pages * add release note * Add back some integration marks
This commit is contained in:
parent
eaec5bfe4a
commit
c0b67432e4
@ -31,7 +31,7 @@ class DefaultConverter:
|
||||
|
||||
def convert(self, reader: "PdfReader") -> Document:
|
||||
"""Extract text from the PDF and return a Document object with the text content."""
|
||||
text = "".join(page.extract_text() for page in reader.pages if page.extract_text())
|
||||
text = "\f".join(page.extract_text() for page in reader.pages)
|
||||
return Document(content=text)
|
||||
|
||||
|
||||
|
@ -28,7 +28,7 @@ def normalize_metadata(
|
||||
makes sure to return a list of dictionaries of the correct length for the converter to use.
|
||||
|
||||
:param meta: the meta input of the converter, as-is
|
||||
:sources_count: the number of sources the converter received
|
||||
:param sources_count: the number of sources the converter received
|
||||
:returns: a list of dictionaries of the make length as the sources list
|
||||
"""
|
||||
if meta is None:
|
||||
|
@ -0,0 +1,7 @@
|
||||
---
|
||||
upgrade:
|
||||
- |
|
||||
Upgraded the default converter in PyPDFToDocument to insert page breaks "\f"
|
||||
between each extracted page.
|
||||
This allows for downstream components and applications to better be able to
|
||||
keep track of the original PDF page a portion of text comes from.
|
@ -7,35 +7,45 @@ from haystack.components.converters.pypdf import PyPDFToDocument, CONVERTERS_REG
|
||||
from haystack.dataclasses import ByteStream
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
@pytest.fixture
|
||||
def pypdf_converter():
|
||||
return PyPDFToDocument()
|
||||
|
||||
|
||||
class TestPyPDFToDocument:
|
||||
def test_init(self):
|
||||
component = PyPDFToDocument()
|
||||
assert component.converter_name == "default"
|
||||
assert hasattr(component, "_converter")
|
||||
def test_init(self, pypdf_converter):
|
||||
assert pypdf_converter.converter_name == "default"
|
||||
assert hasattr(pypdf_converter, "_converter")
|
||||
|
||||
def test_init_fail_nonexisting_converter(self):
|
||||
with pytest.raises(ValueError):
|
||||
PyPDFToDocument(converter_name="non_existing_converter")
|
||||
|
||||
def test_run(self, test_files_path):
|
||||
@pytest.mark.integration
|
||||
def test_run(self, test_files_path, pypdf_converter):
|
||||
"""
|
||||
Test if the component runs correctly.
|
||||
"""
|
||||
paths = [test_files_path / "pdf" / "react_paper.pdf"]
|
||||
converter = PyPDFToDocument()
|
||||
output = converter.run(sources=paths)
|
||||
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
|
||||
output = pypdf_converter.run(sources=paths)
|
||||
docs = output["documents"]
|
||||
assert len(docs) == 1
|
||||
assert "ReAct" in docs[0].content
|
||||
assert "History" in docs[0].content
|
||||
|
||||
def test_run_with_meta(self, test_files_path):
|
||||
@pytest.mark.integration
|
||||
def test_page_breaks_added(self, test_files_path, pypdf_converter):
|
||||
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
|
||||
output = pypdf_converter.run(sources=paths)
|
||||
docs = output["documents"]
|
||||
assert len(docs) == 1
|
||||
assert docs[0].content.count("\f") == 3
|
||||
|
||||
def test_run_with_meta(self, test_files_path, pypdf_converter):
|
||||
bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
|
||||
|
||||
converter = PyPDFToDocument()
|
||||
with patch("haystack.components.converters.pypdf.PdfReader"):
|
||||
output = converter.run(
|
||||
sources=[bytestream, test_files_path / "pdf" / "react_paper.pdf"], meta={"language": "it"}
|
||||
output = pypdf_converter.run(
|
||||
sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"}
|
||||
)
|
||||
|
||||
# check that the metadata from the bytestream is merged with that from the meta parameter
|
||||
@ -43,38 +53,38 @@ class TestPyPDFToDocument:
|
||||
assert output["documents"][0].meta["language"] == "it"
|
||||
assert output["documents"][1].meta["language"] == "it"
|
||||
|
||||
def test_run_error_handling(self, test_files_path, caplog):
|
||||
def test_run_error_handling(self, test_files_path, pypdf_converter, caplog):
|
||||
"""
|
||||
Test if the component correctly handles errors.
|
||||
"""
|
||||
paths = ["non_existing_file.pdf"]
|
||||
converter = PyPDFToDocument()
|
||||
with caplog.at_level(logging.WARNING):
|
||||
converter.run(sources=paths)
|
||||
pypdf_converter.run(sources=paths)
|
||||
assert "Could not read non_existing_file.pdf" in caplog.text
|
||||
|
||||
def test_mixed_sources_run(self, test_files_path):
|
||||
@pytest.mark.integration
|
||||
def test_mixed_sources_run(self, test_files_path, pypdf_converter):
|
||||
"""
|
||||
Test if the component runs correctly when mixed sources are provided.
|
||||
"""
|
||||
paths = [test_files_path / "pdf" / "react_paper.pdf"]
|
||||
with open(test_files_path / "pdf" / "react_paper.pdf", "rb") as f:
|
||||
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
|
||||
with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f:
|
||||
paths.append(ByteStream(f.read()))
|
||||
|
||||
converter = PyPDFToDocument()
|
||||
output = converter.run(sources=paths)
|
||||
output = pypdf_converter.run(sources=paths)
|
||||
docs = output["documents"]
|
||||
assert len(docs) == 2
|
||||
assert "ReAct" in docs[0].content
|
||||
assert "ReAct" in docs[1].content
|
||||
assert "History and standardization" in docs[0].content
|
||||
assert "History and standardization" in docs[1].content
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_custom_converter(self, test_files_path):
|
||||
"""
|
||||
Test if the component correctly handles custom converters.
|
||||
"""
|
||||
from pypdf import PdfReader
|
||||
|
||||
paths = [test_files_path / "pdf" / "react_paper.pdf"]
|
||||
paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
|
||||
|
||||
class MyCustomConverter:
|
||||
def convert(self, reader: PdfReader) -> Document:
|
||||
|
Loading…
x
Reference in New Issue
Block a user