From c0b67432e4da0af41a25cdbb74115c6cad1c6abf Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Thu, 18 Jan 2024 08:54:59 +0100 Subject: [PATCH] feat: Add page breaks to default PDF to Document converter (#6755) * Speedup tests for PyPDFToDocument * Added unit test and removed skipping of empty pages * add release note * Add back some integration marks --- haystack/components/converters/pypdf.py | 2 +- haystack/components/converters/utils.py | 2 +- .../pypdf-page-breaks-b6842d93f4c69185.yaml | 7 +++ .../converters/test_pypdf_to_document.py | 60 +++++++++++-------- 4 files changed, 44 insertions(+), 27 deletions(-) create mode 100644 releasenotes/notes/pypdf-page-breaks-b6842d93f4c69185.yaml diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py index 021029e05..0f0adff9e 100644 --- a/haystack/components/converters/pypdf.py +++ b/haystack/components/converters/pypdf.py @@ -31,7 +31,7 @@ class DefaultConverter: def convert(self, reader: "PdfReader") -> Document: """Extract text from the PDF and return a Document object with the text content.""" - text = "".join(page.extract_text() for page in reader.pages if page.extract_text()) + text = "\f".join(page.extract_text() for page in reader.pages) return Document(content=text) diff --git a/haystack/components/converters/utils.py b/haystack/components/converters/utils.py index 908a0a0bd..128e4a134 100644 --- a/haystack/components/converters/utils.py +++ b/haystack/components/converters/utils.py @@ -28,7 +28,7 @@ def normalize_metadata( makes sure to return a list of dictionaries of the correct length for the converter to use. :param meta: the meta input of the converter, as-is - :sources_count: the number of sources the converter received + :param sources_count: the number of sources the converter received :returns: a list of dictionaries of the make length as the sources list """ if meta is None: diff --git a/releasenotes/notes/pypdf-page-breaks-b6842d93f4c69185.yaml b/releasenotes/notes/pypdf-page-breaks-b6842d93f4c69185.yaml new file mode 100644 index 000000000..b57856dbb --- /dev/null +++ b/releasenotes/notes/pypdf-page-breaks-b6842d93f4c69185.yaml @@ -0,0 +1,7 @@ +--- +upgrade: + - | + Upgraded the default converter in PyPDFToDocument to insert page breaks "\f" + between each extracted page. + This allows for downstream components and applications to better be able to + keep track of the original PDF page a portion of text comes from. diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py index 8de0646b9..330ed9a31 100644 --- a/test/components/converters/test_pypdf_to_document.py +++ b/test/components/converters/test_pypdf_to_document.py @@ -7,35 +7,45 @@ from haystack.components.converters.pypdf import PyPDFToDocument, CONVERTERS_REG from haystack.dataclasses import ByteStream -@pytest.mark.integration +@pytest.fixture +def pypdf_converter(): + return PyPDFToDocument() + + class TestPyPDFToDocument: - def test_init(self): - component = PyPDFToDocument() - assert component.converter_name == "default" - assert hasattr(component, "_converter") + def test_init(self, pypdf_converter): + assert pypdf_converter.converter_name == "default" + assert hasattr(pypdf_converter, "_converter") def test_init_fail_nonexisting_converter(self): with pytest.raises(ValueError): PyPDFToDocument(converter_name="non_existing_converter") - def test_run(self, test_files_path): + @pytest.mark.integration + def test_run(self, test_files_path, pypdf_converter): """ Test if the component runs correctly. """ - paths = [test_files_path / "pdf" / "react_paper.pdf"] - converter = PyPDFToDocument() - output = converter.run(sources=paths) + paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"] + output = pypdf_converter.run(sources=paths) docs = output["documents"] assert len(docs) == 1 - assert "ReAct" in docs[0].content + assert "History" in docs[0].content - def test_run_with_meta(self, test_files_path): + @pytest.mark.integration + def test_page_breaks_added(self, test_files_path, pypdf_converter): + paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"] + output = pypdf_converter.run(sources=paths) + docs = output["documents"] + assert len(docs) == 1 + assert docs[0].content.count("\f") == 3 + + def test_run_with_meta(self, test_files_path, pypdf_converter): bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"}) - converter = PyPDFToDocument() with patch("haystack.components.converters.pypdf.PdfReader"): - output = converter.run( - sources=[bytestream, test_files_path / "pdf" / "react_paper.pdf"], meta={"language": "it"} + output = pypdf_converter.run( + sources=[bytestream, test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"language": "it"} ) # check that the metadata from the bytestream is merged with that from the meta parameter @@ -43,38 +53,38 @@ class TestPyPDFToDocument: assert output["documents"][0].meta["language"] == "it" assert output["documents"][1].meta["language"] == "it" - def test_run_error_handling(self, test_files_path, caplog): + def test_run_error_handling(self, test_files_path, pypdf_converter, caplog): """ Test if the component correctly handles errors. """ paths = ["non_existing_file.pdf"] - converter = PyPDFToDocument() with caplog.at_level(logging.WARNING): - converter.run(sources=paths) + pypdf_converter.run(sources=paths) assert "Could not read non_existing_file.pdf" in caplog.text - def test_mixed_sources_run(self, test_files_path): + @pytest.mark.integration + def test_mixed_sources_run(self, test_files_path, pypdf_converter): """ Test if the component runs correctly when mixed sources are provided. """ - paths = [test_files_path / "pdf" / "react_paper.pdf"] - with open(test_files_path / "pdf" / "react_paper.pdf", "rb") as f: + paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"] + with open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb") as f: paths.append(ByteStream(f.read())) - converter = PyPDFToDocument() - output = converter.run(sources=paths) + output = pypdf_converter.run(sources=paths) docs = output["documents"] assert len(docs) == 2 - assert "ReAct" in docs[0].content - assert "ReAct" in docs[1].content + assert "History and standardization" in docs[0].content + assert "History and standardization" in docs[1].content + @pytest.mark.integration def test_custom_converter(self, test_files_path): """ Test if the component correctly handles custom converters. """ from pypdf import PdfReader - paths = [test_files_path / "pdf" / "react_paper.pdf"] + paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"] class MyCustomConverter: def convert(self, reader: PdfReader) -> Document: