haystack/test/components/converters/test_multi_file_converter.py
David S. Batista 2f7cb9e959
!fix: FileTypeRouter raising FileNotFound in a consistent manner (#9710)
* adding raise_on_failure and warning

* adding release notes

* reverting, adding wrongly removed file

* FileNotFoundError is raised both with and without metadata passed

* reverting to raise_on_failure

* Update releasenotes/notes/fix-filetype-router-inconsistencies-b22a3af00059f953.yaml

Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>

* adding warning and updating tests

* adding warning and updating tests

* updating docstring and warning

* updating release notes

* adding extra output key 'failed' and updating tests

* adding missed test file

* Update haystack/components/routers/file_type_router.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update haystack/components/routers/file_type_router.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update releasenotes/notes/fix-filetype-router-inconsistencies-b22a3af00059f953.yaml

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* test fixes

* remove duplicated tests

* updating log message

* Fix multi file converter

* updating release notes

* Update releasenotes/notes/fix-filetype-router-inconsistencies-b22a3af00059f953.yaml

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* improve relnote

* fixing typing

---------

Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>
Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
Co-authored-by: Sebastian Husch Lee <sjrl423@gmail.com>
2025-08-19 11:12:14 +02:00

146 lines
5.8 KiB
Python

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import pytest
from haystack import Document, Pipeline
from haystack.components.converters.multi_file_converter import MultiFileConverter
from haystack.core.component.component import Component
from haystack.core.pipeline.base import component_from_dict, component_to_dict
from haystack.dataclasses import ByteStream
@pytest.fixture
def converter():
    """Provide a ``MultiFileConverter`` built with default arguments and already warmed up."""
    instance = MultiFileConverter()
    # warm_up() is invoked once here so each test receives a ready-to-run component.
    instance.warm_up()
    return instance
class TestMultiFileConverter:
    """Tests for ``MultiFileConverter``: construction, (de)serialization and ``run`` behaviour."""

    def test_init_default_params(self, converter):
        """A converter built without arguments is a Component exposing the default settings."""
        assert isinstance(converter, Component)
        assert converter.encoding == "utf-8"
        assert converter.json_content_key == "content"

    def test_init_custom_params(self):
        """Constructor arguments are stored on the instance unchanged."""
        custom = MultiFileConverter(encoding="latin-1", json_content_key="text")
        assert (custom.encoding, custom.json_content_key) == ("latin-1", "text")

    def test_to_dict(self, converter):
        """Serializing the default converter yields its type path and init parameters."""
        expected = {
            "type": "haystack.components.converters.multi_file_converter.MultiFileConverter",
            "init_parameters": {"encoding": "utf-8", "json_content_key": "content"},
        }
        serialized = component_to_dict(converter, "converter")
        assert serialized == expected

    def test_from_dict(self):
        """Deserializing a dictionary restores the init parameters it carries."""
        serialized = {
            "type": "haystack.components.converters.multi_file_converter.MultiFileConverter",
            "init_parameters": {"encoding": "latin-1", "json_content_key": "text"},
        }
        restored = component_from_dict(MultiFileConverter, serialized, "converter")
        assert restored.encoding == "latin-1"
        assert restored.json_content_key == "text"

    @pytest.mark.parametrize(
        "suffix,file_path",
        [
            ("csv", "csv/sample_1.csv"),
            ("docx", "docx/sample_docx.docx"),
            ("html", "html/what_is_haystack.html"),
            ("json", "json/json_conversion_testfile.json"),
            ("md", "markdown/sample.md"),
            ("pdf", "pdf/sample_pdf_1.pdf"),
            ("pptx", "pptx/sample_pptx.pptx"),
            ("txt", "txt/doc_1.txt"),
            ("xlsx", "xlsx/table_empty_rows_and_columns.xlsx"),
        ],
    )
    @pytest.mark.integration
    def test_run(self, test_files_path, converter, suffix, file_path):
        """Run on one sample file plus a ByteStream tagged with an unknown content type.

        Expects exactly one document for the sample file and the ByteStream to be
        returned untouched under the "unclassified" output key.
        """
        unknown_stream = ByteStream(b"unclassified content")
        unknown_stream.meta["content_type"] = "unknown_type"

        result = converter.run(sources=[test_files_path / file_path, unknown_stream])
        documents, leftovers = result["documents"], result["unclassified"]

        # The sample file must produce a single document that keeps its source path.
        assert len(documents) == 1
        document = documents[0]
        assert isinstance(document, Document)
        assert document.content is not None
        assert document.meta["file_path"].endswith(suffix)

        # The unknown-type stream must come back as the only unclassified item.
        assert len(leftovers) == 1
        leftover = leftovers[0]
        assert isinstance(leftover, ByteStream)
        assert leftover.meta["content_type"] == "unknown_type"

    def test_run_with_meta(self, test_files_path, converter):
        """Metadata passed to ``run`` shows up on the produced document."""
        result = converter.run(
            sources=[test_files_path / "txt" / "doc_1.txt"],
            meta={"language": "en", "author": "test"},
        )
        document_meta = result["documents"][0].meta
        assert document_meta["language"] == "en"
        assert document_meta["author"] == "test"

    def test_run_with_bytestream(self, converter):
        """A plain-text ByteStream is converted into one document carrying its content and meta."""
        stream = ByteStream(data=b"test content", mime_type="text/plain", meta={"file_path": "test.txt"})
        documents = converter.run(sources=[stream])["documents"]
        assert len(documents) == 1
        document = documents[0]
        assert document.content == "test content"
        assert document.meta["file_path"] == "test.txt"

    def test_run_error_handling(self, test_files_path, converter, caplog):
        """A non-existent path is logged as a warning and reported under the "failed" key."""
        missing_file = test_files_path / "non_existent.txt"
        with caplog.at_level("WARNING"):
            result = converter.run(sources=[missing_file])
            assert "File not found" in caplog.text
            assert len(result["failed"]) == 1

    @pytest.mark.integration
    def test_run_all_file_types(self, test_files_path, converter):
        """One ``run`` call over a sample of every listed file type yields one document per file."""
        samples = (
            ("csv", "sample_1.csv"),
            ("docx", "sample_docx.docx"),
            ("html", "what_is_haystack.html"),
            ("json", "json_conversion_testfile.json"),
            ("markdown", "sample.md"),
            ("txt", "doc_1.txt"),
            ("pdf", "sample_pdf_1.pdf"),
            ("pptx", "sample_pptx.pptx"),
            ("xlsx", "table_empty_rows_and_columns.xlsx"),
        )
        paths = [test_files_path / folder / file_name for folder, file_name in samples]

        documents = converter.run(sources=paths)["documents"]

        # Every input file must be represented by exactly one Document.
        assert len(documents) == len(paths)
        for document in documents:
            assert isinstance(document, Document)

    @pytest.mark.integration
    def test_run_in_pipeline(self, test_files_path, converter):
        """The converter works as the single component of a Pipeline fed with two files."""
        pipeline = Pipeline(max_runs_per_component=1)
        pipeline.add_component("converter", converter)

        sources = [test_files_path / "txt" / "doc_1.txt", test_files_path / "pdf" / "sample_pdf_1.pdf"]
        result = pipeline.run(data={"sources": sources})
        documents = result["converter"]["documents"]

        assert len(documents) == 2
        for document in documents:
            assert isinstance(document, Document)
        for document in documents:
            assert document.content is not None