mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-27 19:00:35 +00:00

* remove generate_single_summary * update schemas * remove unused import * fix mypy * fix mypy * test: summarizer doesnt change content * other test correction * move test_summarizer_translation to test_extractor_translation * fix test * first try for doc merger * reintroduce and deprecate generate_single_summary * progress in document merger * document merger! * mypy, pylint fixes * use generator * added test that will fail in 1.12 * adapt to review * extended deprecation docstring * Update test/nodes/test_extractor_translation.py * Update test/nodes/test_summarizer.py * Update test/nodes/test_summarizer.py * black * documents fixture Co-authored-by: Sara Zan <sarazanzo94@gmail.com>
98 lines
2.8 KiB
Python
98 lines
2.8 KiB
Python
from haystack import Document
|
|
from haystack.nodes.other.document_merger import DocumentMerger
|
|
|
|
# Raw payloads for six documents shared by every test in this module.
#
# What varies and what is shared across the six entries:
#   * "name", "year" and "month" differ between documents.
#   * "flat_field" is 1 everywhere.
#   * "nested_field" always carries {1: 2, "c": {"3": 3}}; its "a" entry is 5
#     everywhere except name_3 (7), and "d" exists only on name_1.
# The tests in this module expect the merged meta to contain only
# {"flat_field": 1, "nested_field": {1: 2, "c": {"3": 3}}}.
doc_dicts = [
    {
        "meta": {
            "name": "name_1",
            "year": "2020",
            "month": "01",
            "flat_field": 1,
            "nested_field": {1: 2, "a": 5, "c": {"3": 3}, "d": "I will be dropped by the meta merge algorithm"},
        },
        "content": "text_1",
    },
    {
        "meta": {
            "name": "name_2",
            "year": "2020",
            "month": "02",
            "flat_field": 1,
            "nested_field": {1: 2, "a": 5, "c": {"3": 3}},
        },
        "content": "text_2",
    },
    {
        "meta": {
            "name": "name_3",
            "year": "2020",
            "month": "03",
            "flat_field": 1,
            "nested_field": {1: 2, "a": 7, "c": {"3": 3}},
        },
        "content": "text_3",
    },
    {
        "meta": {
            "name": "name_4",
            "year": "2021",
            "month": "01",
            "flat_field": 1,
            "nested_field": {1: 2, "a": 5, "c": {"3": 3}},
        },
        "content": "text_4",
    },
    {
        "meta": {
            "name": "name_5",
            "year": "2021",
            "month": "02",
            "flat_field": 1,
            "nested_field": {1: 2, "a": 5, "c": {"3": 3}},
        },
        "content": "text_5",
    },
    {
        "meta": {
            "name": "name_6",
            "year": "2021",
            "month": "03",
            "flat_field": 1,
            "nested_field": {1: 2, "a": 5, "c": {"3": 3}},
        },
        "content": "text_6",
    },
]
|
|
|
|
# Document objects built once from the raw payloads in doc_dicts (same order).
documents = [Document.from_dict(raw_doc) for raw_doc in doc_dicts]
|
|
|
|
|
|
def test_document_merger_merge():
    """Check that ``DocumentMerger.merge`` collapses the fixture documents into one.

    The single merged document must have the contents of all fixture
    documents joined by the configured separator, and its meta must equal
    the expected subset of ``flat_field`` and ``nested_field`` entries.
    """
    separator = "|"
    dm = DocumentMerger(separator=separator)
    merged_list = dm.merge(documents)

    assert len(merged_list) == 1
    assert merged_list[0].content == separator.join([doc["content"] for doc in doc_dicts])
    # "name", "year", "month" and nested "a"/"d" are expected to be absent from the merged meta.
    assert merged_list[0].meta == {"flat_field": 1, "nested_field": {1: 2, "c": {"3": 3}}}
|
|
|
|
|
|
def test_document_merger_run():
    """Check that ``DocumentMerger.run`` returns one merged document.

    The test reads the output as ``result[0]["documents"]`` and expects a
    single document whose content is the separator-joined fixture contents
    and whose meta equals the expected subset of the fixture meta.
    """
    separator = "|"
    dm = DocumentMerger(separator=separator)
    result = dm.run(documents)

    assert len(result[0]["documents"]) == 1
    assert result[0]["documents"][0].content == separator.join([doc["content"] for doc in doc_dicts])
    # "name", "year", "month" and nested "a"/"d" are expected to be absent from the merged meta.
    assert result[0]["documents"][0].meta == {"flat_field": 1, "nested_field": {1: 2, "c": {"3": 3}}}
|
|
|
|
|
|
def test_document_merger_run_batch():
    """Check that ``DocumentMerger.run_batch`` merges each input list separately.

    Two copies of the fixture list are passed in; the test expects two
    entries under ``batch_result[0]["documents"]`` and verifies the content
    and meta of the first merged document of the first entry.
    """
    separator = "|"
    dm = DocumentMerger(separator=separator)
    batch_result = dm.run_batch([documents, documents])

    # One result entry per input list.
    assert len(batch_result[0]["documents"]) == 2
    assert batch_result[0]["documents"][0][0].content == separator.join([doc["content"] for doc in doc_dicts])
    # "name", "year", "month" and nested "a"/"d" are expected to be absent from the merged meta.
    assert batch_result[0]["documents"][0][0].meta == {"flat_field": 1, "nested_field": {1: 2, "c": {"3": 3}}}
|