mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-24 09:20:13 +00:00

* standardize converters inputs: first draft * fix precommit * fix precommit 2 * fix precommit 3 * add default for optional param * rm leftover * install boilerpy in linting workflow * add boilerpy3 to the core dependencies * add reno * remove boilerpy3 installation from test workflow * fix pylint: import order and unused import * fix import order * add release note * better Tika docstring * rm boilerpy from linting * leftover * md link brackets * feat: Converters - allow passing `meta` in the `run` method (#6554) * first impl for html * progressing on other components * fix test * add tests - run with meta * release note * reintroduce patches wrongly deleted * add patch in test * fix tika test * Update haystack/components/converters/azure.py Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> * Update releasenotes/notes/converters-standardize-inputs-ed2ba9c97b762974.yaml Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * simplify test --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Julian Risch <julian.risch@deepset.ai> Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
89 lines
3.2 KiB
Python
89 lines
3.2 KiB
Python
import logging
|
|
from unittest.mock import patch
|
|
import pytest
|
|
|
|
from haystack import Document
|
|
from haystack.components.converters.pypdf import PyPDFToDocument, CONVERTERS_REGISTRY
|
|
from haystack.dataclasses import ByteStream
|
|
|
|
|
|
@pytest.mark.integration
|
|
class TestPyPDFToDocument:
|
|
def test_init(self):
|
|
component = PyPDFToDocument()
|
|
assert component.converter_name == "default"
|
|
assert hasattr(component, "_converter")
|
|
|
|
def test_init_fail_nonexisting_converter(self):
|
|
with pytest.raises(ValueError):
|
|
PyPDFToDocument(converter_name="non_existing_converter")
|
|
|
|
def test_run(self, test_files_path):
|
|
"""
|
|
Test if the component runs correctly.
|
|
"""
|
|
paths = [test_files_path / "pdf" / "react_paper.pdf"]
|
|
converter = PyPDFToDocument()
|
|
output = converter.run(sources=paths)
|
|
docs = output["documents"]
|
|
assert len(docs) == 1
|
|
assert "ReAct" in docs[0].content
|
|
|
|
def test_run_with_meta(self):
|
|
bytestream = ByteStream(data=b"test", metadata={"author": "test_author", "language": "en"})
|
|
|
|
converter = PyPDFToDocument()
|
|
with patch("haystack.components.converters.pypdf.PdfReader"):
|
|
output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
|
|
|
|
document = output["documents"][0]
|
|
|
|
# check that the metadata from the bytestream is merged with that from the meta parameter
|
|
assert document.meta == {"author": "test_author", "language": "it"}
|
|
|
|
def test_run_error_handling(self, test_files_path, caplog):
|
|
"""
|
|
Test if the component correctly handles errors.
|
|
"""
|
|
paths = ["non_existing_file.pdf"]
|
|
converter = PyPDFToDocument()
|
|
with caplog.at_level(logging.WARNING):
|
|
converter.run(sources=paths)
|
|
assert "Could not read non_existing_file.pdf" in caplog.text
|
|
|
|
def test_mixed_sources_run(self, test_files_path):
|
|
"""
|
|
Test if the component runs correctly when mixed sources are provided.
|
|
"""
|
|
paths = [test_files_path / "pdf" / "react_paper.pdf"]
|
|
with open(test_files_path / "pdf" / "react_paper.pdf", "rb") as f:
|
|
paths.append(ByteStream(f.read()))
|
|
|
|
converter = PyPDFToDocument()
|
|
output = converter.run(sources=paths)
|
|
docs = output["documents"]
|
|
assert len(docs) == 2
|
|
assert "ReAct" in docs[0].content
|
|
assert "ReAct" in docs[1].content
|
|
|
|
def test_custom_converter(self, test_files_path):
|
|
"""
|
|
Test if the component correctly handles custom converters.
|
|
"""
|
|
from pypdf import PdfReader
|
|
|
|
paths = [test_files_path / "pdf" / "react_paper.pdf"]
|
|
|
|
class MyCustomConverter:
|
|
def convert(self, reader: PdfReader) -> Document:
|
|
return Document(content="I don't care about converting given pdfs, I always return this")
|
|
|
|
CONVERTERS_REGISTRY["custom"] = MyCustomConverter()
|
|
|
|
converter = PyPDFToDocument(converter_name="custom")
|
|
output = converter.run(sources=paths)
|
|
docs = output["documents"]
|
|
assert len(docs) == 1
|
|
assert "ReAct" not in docs[0].content
|
|
assert "I don't care about converting given pdfs, I always return this" in docs[0].content
|