2023-02-21 17:00:02 +01:00
|
|
|
import pytest
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
from haystack.schema import Document
|
|
|
|
from haystack.nodes import RouteDocuments
|
|
|
|
|
|
|
|
|
2023-04-18 15:18:13 +02:00
|
|
|
@pytest.fixture
def docs_diff_types():
    """Provide three documents, one each with content type "text", "table" and "image"."""
    text_doc = Document(content="text document", content_type="text")

    # The table document wraps a small two-column, two-row DataFrame.
    table = pd.DataFrame(columns=["col 1", "col 2"], data=[["row 1", "row 1"], ["row 2", "row 2"]])
    table_doc = Document(content=table, content_type="table")

    image_doc = Document(content="image/path", content_type="image")

    return [text_doc, table_doc, image_doc]
|
2023-04-18 15:18:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def docs_with_meta():
    """Provide three text documents whose ``meta_field`` is "test1", "test2" and "test3" respectively."""
    content_and_meta = (
        ("text document 1", "test1"),
        ("text document 2", "test2"),
        ("text document 3", "test3"),
    )
    return [
        Document(content=text, content_type="text", meta={"meta_field": value})
        for text, value in content_and_meta
    ]
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.unit
def test_routedocuments_by_content_type(docs_diff_types):
    """A ``RouteDocuments()`` built with no arguments is expected to expose two edges: text, then table.

    The image document from the fixture must not show up under an ``output_3`` key.
    """
    router = RouteDocuments()
    routed, _ = router.run(documents=docs_diff_types)

    assert router.outgoing_edges == 2
    assert "output_3" not in routed

    # Each populated edge holds exactly one document of the expected content type.
    for edge, expected_type in (("output_1", "text"), ("output_2", "table")):
        assert len(routed[edge]) == 1
        assert routed[edge][0].content_type == expected_type
|
|
|
|
|
|
|
|
|
2023-04-18 15:18:13 +02:00
|
|
|
@pytest.mark.unit
def test_routedocuments_by_content_type_return_remaining(docs_diff_types):
    """With ``return_remaining=True`` a third edge is expected, carrying the image document."""
    router = RouteDocuments(return_remaining=True)
    routed, _ = router.run(documents=docs_diff_types)

    assert router.outgoing_edges == 3

    # One document per edge: text, table, then the leftover image.
    expected_types = (("output_1", "text"), ("output_2", "table"), ("output_3", "image"))
    for edge, expected_type in expected_types:
        assert len(routed[edge]) == 1
        assert routed[edge][0].content_type == expected_type
|
|
|
|
|
|
|
|
|
2023-02-21 17:00:02 +01:00
|
|
|
@pytest.mark.unit
def test_routedocuments_by_metafield(docs):
    """Splitting on ``meta_field`` with three values is expected to give one edge per value and no fourth edge.

    NOTE(review): the ``docs`` fixture is not defined in the visible part of this file;
    presumably it comes from a shared conftest -- confirm.
    """
    router = RouteDocuments(split_by="meta_field", metadata_values=["test1", "test3", "test5"])
    # The edge count is checked before any documents are routed.
    assert router.outgoing_edges == 3

    routed, _ = router.run(docs)
    assert "output_4" not in routed

    # Edges follow the order of ``metadata_values``; each is expected to hold a single match.
    expected_values = (("output_1", "test1"), ("output_2", "test3"), ("output_3", "test5"))
    for edge, expected_value in expected_values:
        assert len(routed[edge]) == 1
        assert routed[edge][0].meta["meta_field"] == expected_value
|
2023-04-18 15:18:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.unit
def test_routedocuments_by_metafield_return_remaning(docs):
    """Splitting on ``meta_field`` with ``return_remaining=True`` is expected to add a fourth, catch-all edge.

    NOTE(review): the ``docs`` fixture is not defined in the visible part of this file;
    presumably it comes from a shared conftest -- confirm.
    """
    router = RouteDocuments(
        split_by="meta_field",
        metadata_values=["test1", "test3", "test5"],
        return_remaining=True,
    )
    # The edge count is checked before any documents are routed.
    assert router.outgoing_edges == 4

    routed, _ = router.run(docs)

    # The three value-specific edges are each expected to hold a single matching document.
    expected_values = (("output_1", "test1"), ("output_2", "test3"), ("output_3", "test5"))
    for edge, expected_value in expected_values:
        assert len(routed[edge]) == 1
        assert routed[edge][0].meta["meta_field"] == expected_value

    # The unmatched documents land on the last edge; the first of the two is the "test2" one.
    remaining = routed["output_4"]
    assert len(remaining) == 2
    assert remaining[0].meta["meta_field"] == "test2"
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.unit
def test_routedocuments_by_metafield_list_of_lists(docs):
    """A nested list in ``metadata_values`` is expected to group its values onto a single edge.

    NOTE(review): the ``docs`` fixture is not defined in the visible part of this file;
    presumably it comes from a shared conftest -- confirm.
    """
    router = RouteDocuments(
        split_by="meta_field",
        metadata_values=[["test1", "test3"], "test5"],
        return_remaining=True,
    )
    # Two value groups plus the remaining edge; checked before any documents are routed.
    assert router.outgoing_edges == 3

    routed, _ = router.run(docs)
    grouped, single, remaining = routed["output_1"], routed["output_2"], routed["output_3"]

    assert (len(grouped), len(single), len(remaining)) == (2, 1, 2)
    # "test1" and "test3" share the first edge, in that order.
    assert [doc.meta["meta_field"] for doc in grouped] == ["test1", "test3"]
    assert single[0].meta["meta_field"] == "test5"
    assert remaining[0].meta["meta_field"] == "test2"
|