# haystack/test/components/converters/test_pypdf_to_document.py

import logging
from unittest.mock import patch
import pytest

from haystack import Document
from haystack.components.converters.pypdf import PyPDFToDocument, CONVERTERS_REGISTRY
from haystack.dataclasses import ByteStream


@pytest.mark.integration
class TestPyPDFToDocument:
    """
    Tests for the `PyPDFToDocument` converter component.
    """

    def test_init(self):
        """
        Test if the component is initialised with the "default" converter when no name is given.
        """
        component = PyPDFToDocument()
        assert component.converter_name == "default"
        assert hasattr(component, "_converter")

    def test_init_fail_nonexisting_converter(self):
        """
        Test if the component rejects a converter name that is not registered.
        """
        with pytest.raises(ValueError):
            PyPDFToDocument(converter_name="non_existing_converter")

    def test_run(self, test_files_path):
        """
        Test if the component runs correctly.
        """
        paths = [test_files_path / "pdf" / "react_paper.pdf"]
        converter = PyPDFToDocument()
        output = converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "ReAct" in docs[0].content

    def test_run_with_meta(self, test_files_path):
        """
        Test if metadata passed to `run` is merged with the metadata carried by the sources.
        """
        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})

        converter = PyPDFToDocument()
        # PdfReader is replaced by a mock for the duration of the run, so the
        # placeholder payload b"test" is never handed to the real PDF parser.
        with patch("haystack.components.converters.pypdf.PdfReader"):
            output = converter.run(
                sources=[bytestream, test_files_path / "pdf" / "react_paper.pdf"], meta={"language": "it"}
            )

        # check that the metadata from the bytestream is merged with that from the meta parameter
        # ("language" is present in both; the value given to `run` is the one expected to win)
        assert output["documents"][0].meta["author"] == "test_author"
        assert output["documents"][0].meta["language"] == "it"
        assert output["documents"][1].meta["language"] == "it"

    def test_run_error_handling(self, test_files_path, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = ["non_existing_file.pdf"]
        converter = PyPDFToDocument()
        with caplog.at_level(logging.WARNING):
            converter.run(sources=paths)
            assert "Could not read non_existing_file.pdf" in caplog.text

    def test_mixed_sources_run(self, test_files_path):
        """
        Test if the component runs correctly when mixed sources are provided.
        """
        paths = [test_files_path / "pdf" / "react_paper.pdf"]
        with open(test_files_path / "pdf" / "react_paper.pdf", "rb") as f:
            paths.append(ByteStream(f.read()))

        converter = PyPDFToDocument()
        output = converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 2
        assert "ReAct" in docs[0].content
        assert "ReAct" in docs[1].content

    def test_custom_converter(self, test_files_path):
        """
        Test if the component correctly handles custom converters.
        """
        from pypdf import PdfReader

        paths = [test_files_path / "pdf" / "react_paper.pdf"]

        class MyCustomConverter:
            def convert(self, reader: PdfReader) -> Document:
                return Document(content="I don't care about converting given pdfs, I always return this")

        # CONVERTERS_REGISTRY is module-level state shared by every test in the
        # process. patch.dict restores it on exit, so the "custom" entry cannot
        # leak into other tests and make their outcome depend on execution order.
        with patch.dict(CONVERTERS_REGISTRY, {"custom": MyCustomConverter()}):
            converter = PyPDFToDocument(converter_name="custom")
            output = converter.run(sources=paths)

        docs = output["documents"]
        assert len(docs) == 1
        assert "ReAct" not in docs[0].content
        assert "I don't care about converting given pdfs, I always return this" in docs[0].content
feat: Add PyPDFToDocument component (2.0) (#5850) * Initial PyPDFToDocument implementation * Remove progress bar * Add release note * Minor fix * import check and dependency --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-21 11:52:26 +02:00			`import logging`
refactor!: Converters - standardize inputs (#6540) * standardize converters inputs: first draft * fix precommit * fix precommit 2 * fix precommit 3 * add default for optional param * rm leftover * install boilerpy in linting workflow * add boilerpy3 to the core dependencies * add reno * remove boilerpy3 installation from test workflow * fix pylint: import order and unused import * fix import order * add release note * better Tika docstring * rm boilerpy from linting * leftover * md link brackets * feat: Converters - allow passing `meta` in the `run` method (#6554) * first impl for html * progressing on other components * fix test * add tests - run with meta * release note * reintroduce patches wrongly deleted * add patch in test * fix tika test * Update haystack/components/converters/azure.py Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> * Update releasenotes/notes/converters-standardize-inputs-ed2ba9c97b762974.yaml Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * simplify test --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Julian Risch <julian.risch@deepset.ai> Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> 2023-12-15 16:41:35 +01:00			`from unittest.mock import patch`
feat: Add PyPDFToDocument component (2.0) (#5850) * Initial PyPDFToDocument implementation * Remove progress bar * Add release note * Minor fix * import check and dependency --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-21 11:52:26 +02:00			`import pytest`

Fix all tests 2023-11-24 14:48:43 +01:00			`from haystack import Document`
			`from haystack.components.converters.pypdf import PyPDFToDocument, CONVERTERS_REGISTRY`
			`from haystack.dataclasses import ByteStream`
feat: Add PyPDFToDocument component (2.0) (#5850) * Initial PyPDFToDocument implementation * Remove progress bar * Add release note * Minor fix * import check and dependency --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-21 11:52:26 +02:00

remove unit marker (#6450) 2023-11-29 19:24:25 +01:00			`@pytest.mark.integration`
feat: Add PyPDFToDocument component (2.0) (#5850) * Initial PyPDFToDocument implementation * Remove progress bar * Add release note * Minor fix * import check and dependency --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-21 11:52:26 +02:00			`class TestPyPDFToDocument:`
fix!: make `PyPDFToDocument` JSON-serializable (#6396) * add registry * release not * add checks * rm superflous check * fix typo * rm print :-) 2023-11-23 15:37:20 +01:00			`def test_init(self):`
			`component = PyPDFToDocument()`
			`assert component.converter_name == "default"`
			`assert hasattr(component, "_converter")`

			`def test_init_fail_nonexisting_converter(self):`
			`with pytest.raises(ValueError):`
			`PyPDFToDocument(converter_name="non_existing_converter")`

Fix all tests 2023-11-24 14:48:43 +01:00			`def test_run(self, test_files_path):`
feat: Add PyPDFToDocument component (2.0) (#5850) * Initial PyPDFToDocument implementation * Remove progress bar * Add release note * Minor fix * import check and dependency --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-21 11:52:26 +02:00			`"""`
			`Test if the component runs correctly.`
			`"""`
Fix all tests 2023-11-24 14:48:43 +01:00			`paths = [test_files_path / "pdf" / "react_paper.pdf"]`
feat: Add PyPDFToDocument component (2.0) (#5850) * Initial PyPDFToDocument implementation * Remove progress bar * Add release note * Minor fix * import check and dependency --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-21 11:52:26 +02:00			`converter = PyPDFToDocument()`
feat: Update `PyPDFToDocument` to process `ByteStream` inputs (#6021) * Update PyPDF converter * Add mixed source unit test * Update haystack/preview/components/file_converters/pypdf.py Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-10-11 10:52:08 +02:00			`output = converter.run(sources=paths)`
feat: Add PyPDFToDocument component (2.0) (#5850) * Initial PyPDFToDocument implementation * Remove progress bar * Add release note * Minor fix * import check and dependency --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-21 11:52:26 +02:00			`docs = output["documents"]`
			`assert len(docs) == 1`
refactor: Rename `Document`'s `text` field to `content` (#6181) * Rework Document serialisation Make Document backward compatible Fix InMemoryDocumentStore filters Fix InMemoryDocumentStore.bm25_retrieval Add release notes Fix pylint failures Enhance Document kwargs handling and docstrings Rename Document's text field to content Fix e2e tests Fix SimilarityRanker tests Fix typo in release notes Rename Document's metadata field to meta (#6183) * fix bugs * make linters happy * fix * more fix * match regex --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> 2023-10-31 12:44:04 +01:00			`assert "ReAct" in docs[0].content`
feat: Add PyPDFToDocument component (2.0) (#5850) * Initial PyPDFToDocument implementation * Remove progress bar * Add release note * Minor fix * import check and dependency --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-21 11:52:26 +02:00
feat: support single metadata dictionary in `PyPDFToDocument` (#6615) * support single metadata dict in pypdf2document * improve tests * tests * remove line 2023-12-22 13:13:11 +00:00			`def test_run_with_meta(self, test_files_path):`
changed metadata to meta (#6605) 2023-12-21 17:09:58 +05:30			`bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})`
refactor!: Converters - standardize inputs (#6540) * standardize converters inputs: first draft * fix precommit * fix precommit 2 * fix precommit 3 * add default for optional param * rm leftover * install boilerpy in linting workflow * add boilerpy3 to the core dependencies * add reno * remove boilerpy3 installation from test workflow * fix pylint: import order and unused import * fix import order * add release note * better Tika docstring * rm boilerpy from linting * leftover * md link brackets * feat: Converters - allow passing `meta` in the `run` method (#6554) * first impl for html * progressing on other components * fix test * add tests - run with meta * release note * reintroduce patches wrongly deleted * add patch in test * fix tika test * Update haystack/components/converters/azure.py Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> * Update releasenotes/notes/converters-standardize-inputs-ed2ba9c97b762974.yaml Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * simplify test --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Julian Risch <julian.risch@deepset.ai> Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> 2023-12-15 16:41:35 +01:00
			`converter = PyPDFToDocument()`
			`with patch("haystack.components.converters.pypdf.PdfReader"):`
feat: support single metadata dictionary in `PyPDFToDocument` (#6615) * support single metadata dict in pypdf2document * improve tests * tests * remove line 2023-12-22 13:13:11 +00:00			`output = converter.run(`
			`sources=[bytestream, test_files_path / "pdf" / "react_paper.pdf"], meta={"language": "it"}`
			`)`
refactor!: Converters - standardize inputs (#6540) * standardize converters inputs: first draft * fix precommit * fix precommit 2 * fix precommit 3 * add default for optional param * rm leftover * install boilerpy in linting workflow * add boilerpy3 to the core dependencies * add reno * remove boilerpy3 installation from test workflow * fix pylint: import order and unused import * fix import order * add release note * better Tika docstring * rm boilerpy from linting * leftover * md link brackets * feat: Converters - allow passing `meta` in the `run` method (#6554) * first impl for html * progressing on other components * fix test * add tests - run with meta * release note * reintroduce patches wrongly deleted * add patch in test * fix tika test * Update haystack/components/converters/azure.py Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> * Update releasenotes/notes/converters-standardize-inputs-ed2ba9c97b762974.yaml Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * simplify test --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Julian Risch <julian.risch@deepset.ai> Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> 2023-12-15 16:41:35 +01:00
			`# check that the metadata from the bytestream is merged with that from the meta parameter`
feat: support single metadata dictionary in `PyPDFToDocument` (#6615) * support single metadata dict in pypdf2document * improve tests * tests * remove line 2023-12-22 13:13:11 +00:00			`assert output["documents"][0].meta["author"] == "test_author"`
			`assert output["documents"][0].meta["language"] == "it"`
			`assert output["documents"][1].meta["language"] == "it"`
refactor!: Converters - standardize inputs (#6540) * standardize converters inputs: first draft * fix precommit * fix precommit 2 * fix precommit 3 * add default for optional param * rm leftover * install boilerpy in linting workflow * add boilerpy3 to the core dependencies * add reno * remove boilerpy3 installation from test workflow * fix pylint: import order and unused import * fix import order * add release note * better Tika docstring * rm boilerpy from linting * leftover * md link brackets * feat: Converters - allow passing `meta` in the `run` method (#6554) * first impl for html * progressing on other components * fix test * add tests - run with meta * release note * reintroduce patches wrongly deleted * add patch in test * fix tika test * Update haystack/components/converters/azure.py Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> * Update releasenotes/notes/converters-standardize-inputs-ed2ba9c97b762974.yaml Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * simplify test --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Julian Risch <julian.risch@deepset.ai> Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> 2023-12-15 16:41:35 +01:00
Fix all tests 2023-11-24 14:48:43 +01:00			`def test_run_error_handling(self, test_files_path, caplog):`
feat: Add PyPDFToDocument component (2.0) (#5850) * Initial PyPDFToDocument implementation * Remove progress bar * Add release note * Minor fix * import check and dependency --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-21 11:52:26 +02:00			`"""`
			`Test if the component correctly handles errors.`
			`"""`
			`paths = ["non_existing_file.pdf"]`
			`converter = PyPDFToDocument()`
			`with caplog.at_level(logging.WARNING):`
feat: Update `PyPDFToDocument` to process `ByteStream` inputs (#6021) * Update PyPDF converter * Add mixed source unit test * Update haystack/preview/components/file_converters/pypdf.py Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-10-11 10:52:08 +02:00			`converter.run(sources=paths)`
			`assert "Could not read non_existing_file.pdf" in caplog.text`

Fix all tests 2023-11-24 14:48:43 +01:00			`def test_mixed_sources_run(self, test_files_path):`
feat: Update `PyPDFToDocument` to process `ByteStream` inputs (#6021) * Update PyPDF converter * Add mixed source unit test * Update haystack/preview/components/file_converters/pypdf.py Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-10-11 10:52:08 +02:00			`"""`
			`Test if the component runs correctly when mixed sources are provided.`
			`"""`
Fix all tests 2023-11-24 14:48:43 +01:00			`paths = [test_files_path / "pdf" / "react_paper.pdf"]`
			`with open(test_files_path / "pdf" / "react_paper.pdf", "rb") as f:`
feat: Update `PyPDFToDocument` to process `ByteStream` inputs (#6021) * Update PyPDF converter * Add mixed source unit test * Update haystack/preview/components/file_converters/pypdf.py Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-10-11 10:52:08 +02:00			`paths.append(ByteStream(f.read()))`

			`converter = PyPDFToDocument()`
			`output = converter.run(sources=paths)`
			`docs = output["documents"]`
			`assert len(docs) == 2`
refactor: Rename `Document`'s `text` field to `content` (#6181) * Rework Document serialisation Make Document backward compatible Fix InMemoryDocumentStore filters Fix InMemoryDocumentStore.bm25_retrieval Add release notes Fix pylint failures Enhance Document kwargs handling and docstrings Rename Document's text field to content Fix e2e tests Fix SimilarityRanker tests Fix typo in release notes Rename Document's metadata field to meta (#6183) * fix bugs * make linters happy * fix * more fix * match regex --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> 2023-10-31 12:44:04 +01:00			`assert "ReAct" in docs[0].content`
			`assert "ReAct" in docs[1].content`
feat: Add custom conversion callable to PyPDFToDocument - Haystack 2.x (#6258) * Allow user specified converter hook * Add a release note * More unit tests * PR review - Massi, use protocol as converter 2023-11-09 17:35:33 +01:00
Fix all tests 2023-11-24 14:48:43 +01:00			`def test_custom_converter(self, test_files_path):`
feat: Add custom conversion callable to PyPDFToDocument - Haystack 2.x (#6258) * Allow user specified converter hook * Add a release note * More unit tests * PR review - Massi, use protocol as converter 2023-11-09 17:35:33 +01:00			`"""`
			`Test if the component correctly handles custom converters.`
			`"""`
remove unit marker (#6450) 2023-11-29 19:24:25 +01:00			`from pypdf import PdfReader`

Fix all tests 2023-11-24 14:48:43 +01:00			`paths = [test_files_path / "pdf" / "react_paper.pdf"]`
feat: Add custom conversion callable to PyPDFToDocument - Haystack 2.x (#6258) * Allow user specified converter hook * Add a release note * More unit tests * PR review - Massi, use protocol as converter 2023-11-09 17:35:33 +01:00
			`class MyCustomConverter:`
			`def convert(self, reader: PdfReader) -> Document:`
			`return Document(content="I don't care about converting given pdfs, I always return this")`

fix!: make `PyPDFToDocument` JSON-serializable (#6396) * add registry * release not * add checks * rm superflous check * fix typo * rm print :-) 2023-11-23 15:37:20 +01:00			`CONVERTERS_REGISTRY["custom"] = MyCustomConverter()`

			`converter = PyPDFToDocument(converter_name="custom")`
feat: Add custom conversion callable to PyPDFToDocument - Haystack 2.x (#6258) * Allow user specified converter hook * Add a release note * More unit tests * PR review - Massi, use protocol as converter 2023-11-09 17:35:33 +01:00			`output = converter.run(sources=paths)`
			`docs = output["documents"]`
			`assert len(docs) == 1`
			`assert "ReAct" not in docs[0].content`
			`assert "I don't care about converting given pdfs, I always return this" in docs[0].content`