haystack/test/nodes/test_document_merger.py
Stefano Fiorucci 1a60e21137
refactor: simplify Summarizer, add Document Merger (#3452)
* remove generate_single_summary

* update schemas

* remove unused import

* fix mypy

* fix mypy

* test: summarizer doesn't change content

* other test correction

* move test_summarizer_translation to test_extractor_translation

* fix test

* first try for doc merger

* reintroduce and deprecate generate_single_summary

* progress in document merger

* document merger!

* mypy, pylint fixes

* use generator

* added test that will fail in 1.12

* adapt to review

* extended deprecation docstring

* Update test/nodes/test_extractor_translation.py

* Update test/nodes/test_summarizer.py

* Update test/nodes/test_summarizer.py

* black

* documents fixture

Co-authored-by: Sara Zan <sarazanzo94@gmail.com>
2022-11-03 16:04:53 +01:00

98 lines
2.8 KiB
Python

from haystack import Document
from haystack.nodes.other.document_merger import DocumentMerger
# Dictionary form of the six documents shared by every test in this module.
# All of them carry the same "flat_field"; "name", "year" and "month" vary, and
# two documents deliberately diverge inside "nested_field" (see inline notes).
doc_dicts = [
    {
        "meta": {
            "name": "name_1",
            "year": "2020",
            "month": "01",
            "flat_field": 1,
            # The "d" key exists only in this document.
            "nested_field": {1: 2, "a": 5, "c": {"3": 3}, "d": "I will be dropped by the meta merge algorithm"},
        },
        "content": "text_1",
    },
    {
        "meta": {
            "name": "name_2",
            "year": "2020",
            "month": "02",
            "flat_field": 1,
            "nested_field": {1: 2, "a": 5, "c": {"3": 3}},
        },
        "content": "text_2",
    },
    {
        "meta": {
            "name": "name_3",
            "year": "2020",
            "month": "03",
            "flat_field": 1,
            # "a" is 7 here, whereas every other document has 5.
            "nested_field": {1: 2, "a": 7, "c": {"3": 3}},
        },
        "content": "text_3",
    },
    {
        "meta": {
            "name": "name_4",
            "year": "2021",
            "month": "01",
            "flat_field": 1,
            "nested_field": {1: 2, "a": 5, "c": {"3": 3}},
        },
        "content": "text_4",
    },
    {
        "meta": {
            "name": "name_5",
            "year": "2021",
            "month": "02",
            "flat_field": 1,
            "nested_field": {1: 2, "a": 5, "c": {"3": 3}},
        },
        "content": "text_5",
    },
    {
        "meta": {
            "name": "name_6",
            "year": "2021",
            "month": "03",
            "flat_field": 1,
            "nested_field": {1: 2, "a": 5, "c": {"3": 3}},
        },
        "content": "text_6",
    },
]

# Document objects built from the dictionaries above, in the same order.
documents = [Document.from_dict(doc_dict) for doc_dict in doc_dicts]
def test_document_merger_merge():
    """Check that ``DocumentMerger.merge`` collapses the fixture documents into one.

    The merged content must be every fixture content joined by the separator,
    and the merged meta must contain only ``flat_field`` plus the
    ``nested_field`` entries ``1`` and ``"c"``.
    """
    sep = "|"
    expected_content = sep.join([entry["content"] for entry in doc_dicts])
    # "name", "year", "month" and the nested "a"/"d" entries are not identical
    # across all fixture documents, so none of them is expected in the result.
    expected_meta = {"flat_field": 1, "nested_field": {1: 2, "c": {"3": 3}}}

    merger = DocumentMerger(separator=sep)
    merged = merger.merge(documents)

    assert len(merged) == 1
    merged_doc = merged[0]
    assert merged_doc.content == expected_content
    assert merged_doc.meta == expected_meta
def test_document_merger_run():
    """Check that ``DocumentMerger.run`` returns a single merged document.

    The documents are read from the ``"documents"`` entry of the first element
    of the value returned by ``run``.
    """
    sep = "|"
    expected_content = sep.join([entry["content"] for entry in doc_dicts])
    # "name", "year", "month" and the nested "a"/"d" entries are not identical
    # across all fixture documents, so none of them is expected in the result.
    expected_meta = {"flat_field": 1, "nested_field": {1: 2, "c": {"3": 3}}}

    merger = DocumentMerger(separator=sep)
    run_output = merger.run(documents)
    output_docs = run_output[0]["documents"]

    assert len(output_docs) == 1
    assert output_docs[0].content == expected_content
    assert output_docs[0].meta == expected_meta
def test_document_merger_run_batch():
    """Check that ``DocumentMerger.run_batch`` merges each list of a batch.

    The same fixture documents are passed twice, so the ``"documents"`` entry
    of the first element of the returned value must hold two results. Every
    result (not only the first one) is verified to be a single merged document
    with the expected content and meta.
    """
    sep = "|"
    expected_content = sep.join([entry["content"] for entry in doc_dicts])
    # "name", "year", "month" and the nested "a"/"d" entries are not identical
    # across all fixture documents, so none of them is expected in the result.
    expected_meta = {"flat_field": 1, "nested_field": {1: 2, "c": {"3": 3}}}

    merger = DocumentMerger(separator=sep)
    batch_result = merger.run_batch([documents, documents])
    merged_batches = batch_result[0]["documents"]

    assert len(merged_batches) == 2
    # Both inputs are identical, so every batch entry must satisfy the same
    # expectations; checking only index 0 would let a broken second entry pass.
    for merged_docs in merged_batches:
        assert len(merged_docs) == 1
        assert merged_docs[0].content == expected_content
        assert merged_docs[0].meta == expected_meta