* adding raise_on_failure and warning
* adding release notes
* reverting, adding wrongly removed file
* FileNotFoundError is raised both with and without metadata passed
* reverting to raise_on_failure
* Update releasenotes/notes/fix-filetype-router-inconsistencies-b22a3af00059f953.yaml
  Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>
* adding warning and updating tests
* adding warning and updating tests
* updating docstring and warning
* updating release notes
* adding extra output key 'failed' and updating tests
* adding missed test file
* Update haystack/components/routers/file_type_router.py
  Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
* Update haystack/components/routers/file_type_router.py
  Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
* Update releasenotes/notes/fix-filetype-router-inconsistencies-b22a3af00059f953.yaml
  Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
* test fixes
* remove duplicated tests
* updating log message
* Fix multi file converter
* updating release notes
* Update releasenotes/notes/fix-filetype-router-inconsistencies-b22a3af00059f953.yaml
  Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
* improve relnote
* fixing typing

---------

Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>
Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
Co-authored-by: Sebastian Husch Lee <sjrl423@gmail.com>
146 lines
5.8 KiB
Python
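As the commit message notes, the converter now returns an extra `failed` output alongside `documents` and `unclassified`, logging a warning instead of aborting on bad inputs. Before the tests, here is a minimal usage sketch: the file names are hypothetical, and the output keys are the ones asserted in the tests below.

from haystack.components.converters.multi_file_converter import MultiFileConverter

converter = MultiFileConverter()
converter.warm_up()
output = converter.run(sources=["report.pdf", "notes.txt", "missing.csv"])
documents = output["documents"]        # successfully converted files
unclassified = output["unclassified"]  # sources whose type could not be matched
failed = output["failed"]              # sources that could not be converted
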
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import pytest

from haystack import Document, Pipeline
from haystack.components.converters.multi_file_converter import MultiFileConverter
from haystack.core.component.component import Component
from haystack.core.pipeline.base import component_from_dict, component_to_dict
from haystack.dataclasses import ByteStream


@pytest.fixture
def converter():
    converter = MultiFileConverter()
    converter.warm_up()
    return converter


class TestMultiFileConverter:
    def test_init_default_params(self, converter):
        """Test initialization with default parameters"""
        assert converter.encoding == "utf-8"
        assert converter.json_content_key == "content"
        assert isinstance(converter, Component)

    def test_init_custom_params(self):
        """Test initialization with custom parameters"""
        converter = MultiFileConverter(encoding="latin-1", json_content_key="text")
        assert converter.encoding == "latin-1"
        assert converter.json_content_key == "text"

    def test_to_dict(self, converter):
        """Test serialization to dictionary"""
        data = component_to_dict(converter, "converter")
        assert data == {
            "type": "haystack.components.converters.multi_file_converter.MultiFileConverter",
            "init_parameters": {"encoding": "utf-8", "json_content_key": "content"},
        }

    def test_from_dict(self):
        """Test deserialization from dictionary"""
        data = {
            "type": "haystack.components.converters.multi_file_converter.MultiFileConverter",
            "init_parameters": {"encoding": "latin-1", "json_content_key": "text"},
        }
        conv = component_from_dict(MultiFileConverter, data, "converter")
        assert conv.encoding == "latin-1"
        assert conv.json_content_key == "text"

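    # Illustrative sketch, not part of the original suite: serialization should
    # round-trip, i.e. from_dict(to_dict(...)) reproduces the init parameters.
    # It only uses the component_to_dict/component_from_dict helpers already
    # imported above.
    def test_to_dict_from_dict_roundtrip(self):
        converter = MultiFileConverter(encoding="latin-1", json_content_key="text")
        data = component_to_dict(converter, "converter")
        restored = component_from_dict(MultiFileConverter, data, "converter")
        assert restored.encoding == converter.encoding
        assert restored.json_content_key == converter.json_content_key
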
    @pytest.mark.parametrize(
        "suffix,file_path",
        [
            ("csv", "csv/sample_1.csv"),
            ("docx", "docx/sample_docx.docx"),
            ("html", "html/what_is_haystack.html"),
            ("json", "json/json_conversion_testfile.json"),
            ("md", "markdown/sample.md"),
            ("pdf", "pdf/sample_pdf_1.pdf"),
            ("pptx", "pptx/sample_pptx.pptx"),
            ("txt", "txt/doc_1.txt"),
            ("xlsx", "xlsx/table_empty_rows_and_columns.xlsx"),
        ],
    )
    @pytest.mark.integration
    def test_run(self, test_files_path, converter, suffix, file_path):
        unclassified_bytestream = ByteStream(b"unclassified content")
        unclassified_bytestream.meta["content_type"] = "unknown_type"

        paths = [test_files_path / file_path, unclassified_bytestream]

        output = converter.run(sources=paths)
        docs = output["documents"]
        unclassified = output["unclassified"]

        assert len(docs) == 1
        assert isinstance(docs[0], Document)
        assert docs[0].content is not None
        assert docs[0].meta["file_path"].endswith(suffix)

        assert len(unclassified) == 1
        assert isinstance(unclassified[0], ByteStream)
        assert unclassified[0].meta["content_type"] == "unknown_type"

    def test_run_with_meta(self, test_files_path, converter):
        """Test conversion with metadata"""
        paths = [test_files_path / "txt" / "doc_1.txt"]
        meta = {"language": "en", "author": "test"}
        output = converter.run(sources=paths, meta=meta)
        docs = output["documents"]
        assert docs[0].meta["language"] == "en"
        assert docs[0].meta["author"] == "test"

    def test_run_with_bytestream(self, converter):
        """Test converting ByteStream input"""
        bytestream = ByteStream(data=b"test content", mime_type="text/plain", meta={"file_path": "test.txt"})
        output = converter.run(sources=[bytestream])
        docs = output["documents"]
        assert len(docs) == 1
        assert docs[0].content == "test content"
        assert docs[0].meta["file_path"] == "test.txt"

    def test_run_error_handling(self, test_files_path, converter, caplog):
        """Test error handling for non-existent files"""
        paths = [test_files_path / "non_existent.txt"]
        with caplog.at_level("WARNING"):
            output = converter.run(sources=paths)
            assert "File not found" in caplog.text
        assert len(output["failed"]) == 1

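    # Illustrative sketch, not part of the original suite: it assumes the
    # warning-based handling above lets valid sources convert even when others
    # fail, so callers can report or retry everything in output["failed"].
    def test_run_error_handling_keeps_good_sources(self, test_files_path, converter):
        paths = [test_files_path / "txt" / "doc_1.txt", test_files_path / "non_existent.txt"]
        output = converter.run(sources=paths)
        assert len(output["documents"]) == 1
        assert len(output["failed"]) == 1
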
    @pytest.mark.integration
    def test_run_all_file_types(self, test_files_path, converter):
        """Test converting all supported file types in parallel"""
        paths = [
            test_files_path / "csv" / "sample_1.csv",
            test_files_path / "docx" / "sample_docx.docx",
            test_files_path / "html" / "what_is_haystack.html",
            test_files_path / "json" / "json_conversion_testfile.json",
            test_files_path / "markdown" / "sample.md",
            test_files_path / "txt" / "doc_1.txt",
            test_files_path / "pdf" / "sample_pdf_1.pdf",
            test_files_path / "pptx" / "sample_pptx.pptx",
            test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx",
        ]
        output = converter.run(sources=paths)
        docs = output["documents"]

        # Verify we got a document for each file
        assert len(docs) == len(paths)
        assert all(isinstance(doc, Document) for doc in docs)

    @pytest.mark.integration
    def test_run_in_pipeline(self, test_files_path, converter):
        pipeline = Pipeline(max_runs_per_component=1)
        pipeline.add_component("converter", converter)

        paths = [test_files_path / "txt" / "doc_1.txt", test_files_path / "pdf" / "sample_pdf_1.pdf"]

        output = pipeline.run(data={"sources": paths})
        docs = output["converter"]["documents"]

        assert len(docs) == 2
        assert all(isinstance(doc, Document) for doc in docs)
        assert all(doc.content is not None for doc in docs)
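
    # Illustrative sketch, not part of the original suite: the "documents"
    # output can feed downstream components. DocumentSplitter and the
    # connection names here are assumptions based on the public Haystack API,
    # not something this module exercises.
    @pytest.mark.integration
    def test_run_in_pipeline_with_splitter(self, test_files_path, converter):
        from haystack.components.preprocessors import DocumentSplitter

        pipeline = Pipeline(max_runs_per_component=1)
        pipeline.add_component("converter", converter)
        pipeline.add_component("splitter", DocumentSplitter(split_by="word", split_length=50))
        pipeline.connect("converter.documents", "splitter.documents")

        output = pipeline.run(data={"converter": {"sources": [test_files_path / "txt" / "doc_1.txt"]}})
        assert len(output["splitter"]["documents"]) >= 1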