mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-29 16:36:34 +00:00
feat: adding metadata grouper component (#8512)
* initial import * making tests more readable; adding docstring * adding release notes * adding LICENSE header * Update test/components/rankers/test_metadata_grouper.py Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com> * refactoring * fixing docstring * fixing types * test docstrings * renaming test * handling too-many-arguments * linting * Update haystack/components/rankers/metadata_grouper.py Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com> * changing name * Update haystack/components/rankers/metadata_grouper.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/components/rankers/metadata_grouper.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * assigning value inside function for re-use * improving docstring * updating name to MetaFieldGroupingRanker * adding to pydocs * fixing imports * adding output docstring * Update haystack/components/rankers/meta_field_grouper_ranker.py Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com> * Update haystack/components/rankers/__init__.py Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com> * Update releasenotes/notes/add-metadata-grouper-21ec05fd4a307425.yaml Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com> * Update test/components/rankers/test_metadata_grouper.py Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com> * update docstring tests * fixing imports * rename modules for consistency * fix pydocs * simplification + more tests --------- Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com> Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
This commit is contained in:
parent
fcdf392bfb
commit
e5a80722c2
@ -1,7 +1,7 @@
|
||||
loaders:
|
||||
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
|
||||
search_path: [../../../haystack/components/rankers]
|
||||
modules: ["lost_in_the_middle", "meta_field", "transformers_similarity", "sentence_transformers_diversity"]
|
||||
modules: ["lost_in_the_middle", "meta_field", "meta_field_grouping_ranker", "transformers_similarity", "sentence_transformers_diversity"]
|
||||
ignore_when_discovered: ["__init__"]
|
||||
processors:
|
||||
- type: filter
|
||||
|
||||
@ -4,12 +4,14 @@
|
||||
|
||||
from haystack.components.rankers.lost_in_the_middle import LostInTheMiddleRanker
|
||||
from haystack.components.rankers.meta_field import MetaFieldRanker
|
||||
from haystack.components.rankers.meta_field_grouping_ranker import MetaFieldGroupingRanker
|
||||
from haystack.components.rankers.sentence_transformers_diversity import SentenceTransformersDiversityRanker
|
||||
from haystack.components.rankers.transformers_similarity import TransformersSimilarityRanker
|
||||
|
||||
__all__ = [
|
||||
"LostInTheMiddleRanker",
|
||||
"MetaFieldRanker",
|
||||
"MetaFieldGroupingRanker",
|
||||
"SentenceTransformersDiversityRanker",
|
||||
"TransformersSimilarityRanker",
|
||||
]
|
||||
|
||||
118
haystack/components/rankers/meta_field_grouping_ranker.py
Normal file
118
haystack/components/rankers/meta_field_grouping_ranker.py
Normal file
@ -0,0 +1,118 @@
|
||||
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from collections import defaultdict
|
||||
from typing import Any, Dict, List, Optional, cast
|
||||
|
||||
from haystack import Document, component, logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@component
class MetaFieldGroupingRanker:
    """
    Reorders the documents by grouping them based on metadata keys.

    The MetaFieldGroupingRanker can group documents by a primary metadata key `group_by`, and subgroup them with an optional
    secondary key, `subgroup_by`.
    Within each group or subgroup, it can also sort documents by a metadata key `sort_docs_by`.

    The output is a flat list of documents ordered by `group_by` and `subgroup_by` values.
    Any documents without a group are placed at the end of the list.

    Group and subgroup values are compared by their string representation, so any metadata value
    (including unhashable ones such as lists) can be used for grouping.

    The proper organization of documents helps improve the efficiency and performance of subsequent processing by an LLM.

    ### Usage example

    ```python
    from haystack.components.rankers import MetaFieldGroupingRanker
    from haystack.dataclasses import Document


    docs = [
        Document(content="Javascript is a popular programming language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
        Document(content="Python is a popular programming language", meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
        Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
        Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
        Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"})
    ]

    ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
    result = ranker.run(documents=docs)
    print(result["documents"])

    # [
    #     Document(id=d665bbc83e52c08c3d8275bccf4f22bf2bfee21c6e77d78794627637355b8ebc,
    #             content: 'Java is a popular programming language', meta: {'group': '42', 'split_id': 3, 'subgroup': 'subB'}),
    #     Document(id=a20b326f07382b3cbf2ce156092f7c93e8788df5d48f2986957dce2adb5fe3c2,
    #             content: 'Python is a popular programming language', meta: {'group': '42', 'split_id': 4, 'subgroup': 'subB'}),
    #     Document(id=ce12919795d22f6ca214d0f161cf870993889dcb146f3bb1b3e1ffdc95be960f,
    #             content: 'Javascript is a popular programming language', meta: {'group': '42', 'split_id': 7, 'subgroup': 'subB'}),
    #     Document(id=d9fc857046c904e5cf790b3969b971b1bbdb1b3037d50a20728fdbf82991aa94,
    #             content: 'A chromosome is a package of DNA', meta: {'group': '314', 'split_id': 2, 'subgroup': 'subC'}),
    #     Document(id=6d3b7bdc13d09aa01216471eb5fb0bfdc53c5f2f3e98ad125ff6b85d3106c9a3,
    #             content: 'An octopus has three hearts', meta: {'group': '11', 'split_id': 2, 'subgroup': 'subD'})
    # ]
    ```
    """  # noqa: E501

    def __init__(self, group_by: str, subgroup_by: Optional[str] = None, sort_docs_by: Optional[str] = None):
        """
        Creates an instance of MetaFieldGroupingRanker.

        :param group_by: The metadata key to aggregate the documents by.
        :param subgroup_by: The metadata key to aggregate the documents within a group that was created by the
                            `group_by` key.
        :param sort_docs_by: Determines which metadata key is used to sort the documents. If not provided, the
                             documents within the groups or subgroups are not sorted and are kept in the same order as
                             they were inserted in the subgroups. Documents that lack this key are placed at the
                             end of their group or subgroup, in their original order.

        """
        self.group_by = group_by
        self.sort_docs_by = sort_docs_by
        self.subgroup_by = subgroup_by

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, Any]:
        """
        Groups the provided list of documents based on the `group_by` parameter and optionally the `subgroup_by`.

        The output is a list of documents reordered based on how they were grouped.

        :param documents: The list of documents to group.
        :returns:
            A dictionary with the following keys:
            - documents: The list of documents ordered by the `group_by` and `subgroup_by` metadata values.
        """

        if not documents:
            return {"documents": []}

        # Plain dicts keep insertion order, so groups and subgroups come out in first-seen order.
        document_groups: Dict[str, Dict[str, List[Document]]] = defaultdict(lambda: defaultdict(list))
        no_group_docs: List[Document] = []

        for doc in documents:
            group_value = str(doc.meta.get(self.group_by, ""))

            # A missing (or empty-string) group value means the document goes to the tail of the output.
            if not group_value:
                no_group_docs.append(doc)
                continue

            subgroup_value = "no_subgroup"
            if self.subgroup_by and self.subgroup_by in doc.meta:
                # Stringified like the group value: raw metadata may be unhashable (e.g. a list)
                # and could not be used as a dictionary key otherwise.
                subgroup_value = str(doc.meta[self.subgroup_by])

            document_groups[group_value][subgroup_value].append(doc)

        ordered_docs: List[Document] = []
        for subgroups in document_groups.values():
            for docs in subgroups.values():
                if self.sort_docs_by:
                    # Sort only the documents that actually carry the key and append the others afterwards.
                    # Substituting a numeric placeholder for missing values would raise a TypeError as soon
                    # as the real values are not numbers (e.g. strings).
                    with_key = [d for d in docs if self.sort_docs_by in d.meta]
                    without_key = [d for d in docs if self.sort_docs_by not in d.meta]
                    with_key.sort(key=lambda d: d.meta[cast(str, self.sort_docs_by)])
                    docs = with_key + without_key
                ordered_docs.extend(docs)

        ordered_docs.extend(no_group_docs)

        return {"documents": ordered_docs}
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
We have added a new MetaFieldGroupingRanker component that reorders documents by grouping them based on metadata keys. This can be useful for pre-processing Documents before feeding them to an LLM.
|
||||
@ -332,4 +332,4 @@ class TestOpenAIGenerator:
|
||||
"Can you explain the Pitagoras therom?",
|
||||
system_prompt="You answer in German, regardless of the language on which a question is asked.",
|
||||
)
|
||||
assert "pythagoras".lower() in result["replies"][0].lower()
|
||||
assert "pythagoras" in result["replies"][0].lower()
|
||||
|
||||
181
test/components/rankers/test_meta_field_grouping_ranker.py
Normal file
181
test/components/rankers/test_meta_field_grouping_ranker.py
Normal file
@ -0,0 +1,181 @@
|
||||
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import Any, Dict
|
||||
|
||||
from haystack import Pipeline
|
||||
from haystack.dataclasses import Document
|
||||
|
||||
from haystack.components.rankers.meta_field_grouping_ranker import MetaFieldGroupingRanker
|
||||
|
||||
# Shared fixture: the order of the entries matters, because the ranker keeps first-seen order
# for groups and subgroups.
DOC_LIST = [
    # documents with regular string/int metadata (one of them has no "group" key)
    Document(content="Javascript is a popular language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
    Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
    Document(content="DNA carries genetic information", meta={"group": "314", "split_id": 1, "subgroup": "subE"}),
    Document(content="Blue whales have a big heart", meta={"group": "11", "split_id": 8, "subgroup": "subF"}),
    Document(content="Python is a popular language", meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
    Document(content="bla bla bla bla", meta={"split_id": 8, "subgroup": "subG"}),
    Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"}),
    Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
    # documents without a "split_id" key
    Document(content="without split id", meta={"group": "11"}),
    Document(content="without split id2", meta={"group": "22", "subgroup": "subI"}),
    Document(content="without split id3", meta={"group": "11"}),
    # documents whose metadata values are lists
    Document(content="list values", meta={"value_list": ["11"], "split_id": 8, "sub_value_list": ["subF"]}),
    Document(content="list values2", meta={"value_list": ["12"], "split_id": 3, "sub_value_list": ["subX"]}),
    Document(content="list values3", meta={"value_list": ["12"], "split_id": 8, "sub_value_list": ["subX"]}),
]
|
||||
|
||||
|
||||
class TestMetaFieldGroupingRanker:
    """Unit tests for the MetaFieldGroupingRanker component."""

    def test_init_default(self) -> None:
        """
        Test the default initialization of the MetaFieldGroupingRanker component.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group", sort_docs_by=None)
        # The optional parameters default to None and the mandatory one is stored as given.
        assert sample_ranker.group_by == "group"
        assert sample_ranker.subgroup_by is None
        assert sample_ranker.sort_docs_by is None
        result = sample_ranker.run(documents=[])
        assert "documents" in result
        assert result["documents"] == []

    def test_run_group_by_only(self) -> None:
        """
        Test the MetaFieldGroupingRanker component with only the 'group_by' parameter. No subgroup or sorting is done.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group")
        result = sample_ranker.run(documents=DOC_LIST)
        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        assert result["documents"][0].meta["split_id"] == 7 and result["documents"][0].meta["group"] == "42"
        assert result["documents"][1].meta["split_id"] == 4 and result["documents"][1].meta["group"] == "42"
        assert result["documents"][2].meta["split_id"] == 3 and result["documents"][2].meta["group"] == "42"
        assert result["documents"][3].meta["split_id"] == 2 and result["documents"][3].meta["group"] == "314"
        assert result["documents"][4].meta["split_id"] == 1 and result["documents"][4].meta["group"] == "314"
        assert result["documents"][5].meta["split_id"] == 8 and result["documents"][5].meta["group"] == "11"
        assert result["documents"][6].meta["split_id"] == 2 and result["documents"][6].meta["group"] == "11"
        assert result["documents"][7].content == "without split id" and result["documents"][7].meta["group"] == "11"
        assert result["documents"][8].content == "without split id3" and result["documents"][8].meta["group"] == "11"
        assert result["documents"][9].content == "without split id2" and result["documents"][9].meta["group"] == "22"
        assert result["documents"][10].content == "bla bla bla bla"

    def test_with_group_subgroup_and_sorting(self) -> None:
        """
        Test the MetaFieldGroupingRanker component with all parameters set, i.e.: grouping by 'group', subgrouping by 'subgroup',
        and sorting by 'split_id'.
        """
        ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = ranker.run(documents=DOC_LIST)

        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        assert (
            result["documents"][0].meta["subgroup"] == "subB"
            and result["documents"][0].meta["group"] == "42"
            and result["documents"][0].meta["split_id"] == 3
        )
        assert (
            result["documents"][1].meta["subgroup"] == "subB"
            and result["documents"][1].meta["group"] == "42"
            and result["documents"][1].meta["split_id"] == 4
        )
        assert (
            result["documents"][2].meta["subgroup"] == "subB"
            and result["documents"][2].meta["group"] == "42"
            and result["documents"][2].meta["split_id"] == 7
        )
        assert result["documents"][3].meta["subgroup"] == "subC" and result["documents"][3].meta["group"] == "314"
        assert result["documents"][4].meta["subgroup"] == "subE" and result["documents"][4].meta["group"] == "314"
        # Each assertion checks subgroup and group of the SAME document.
        assert result["documents"][5].meta["subgroup"] == "subF" and result["documents"][5].meta["group"] == "11"
        assert result["documents"][6].meta["subgroup"] == "subD" and result["documents"][6].meta["group"] == "11"
        assert result["documents"][7].content == "without split id" and result["documents"][7].meta["group"] == "11"
        assert result["documents"][8].content == "without split id3" and result["documents"][8].meta["group"] == "11"
        assert result["documents"][9].content == "without split id2" and result["documents"][9].meta["group"] == "22"
        assert result["documents"][10].content == "bla bla bla bla"

    def test_run_with_lists(self) -> None:
        """
        Test if the MetaFieldGroupingRanker component can handle list values in the metadata.
        """
        # NOTE(review): "subvaluelist" does not match the "sub_value_list" key used in DOC_LIST, so no
        # subgrouping takes place here and only list-valued *group* keys are exercised.
        ranker = MetaFieldGroupingRanker(group_by="value_list", subgroup_by="subvaluelist", sort_docs_by="split_id")
        result = ranker.run(documents=DOC_LIST)
        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        assert result["documents"][0].content == "list values" and result["documents"][0].meta["value_list"] == ["11"]
        assert result["documents"][1].content == "list values2" and result["documents"][1].meta["value_list"] == ["12"]
        assert result["documents"][2].content == "list values3" and result["documents"][2].meta["value_list"] == ["12"]

    def test_run_empty_input(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component with an empty list of documents.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group")
        result = sample_ranker.run(documents=[])
        assert "documents" in result
        assert result["documents"] == []

    def test_run_missing_metadata_keys(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when some documents are missing the required metadata keys.
        """
        docs_with_missing_keys = [
            Document(content="Document without group", meta={"split_id": 1, "subgroup": "subA"}),
            Document(content="Document without subgroup", meta={"group": "42", "split_id": 2}),
            Document(content="Document with all keys", meta={"group": "42", "split_id": 3, "subgroup": "subB"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_missing_keys)
        assert "documents" in result
        assert len(result["documents"]) == 3
        assert result["documents"][0].meta["group"] == "42"
        assert result["documents"][1].meta["group"] == "42"
        # The document without a group value is moved to the end of the list.
        assert result["documents"][2].content == "Document without group"

    def test_run_metadata_with_different_data_types(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when the metadata values have different data types.
        """
        docs_with_mixed_data_types = [
            Document(content="Document with string group", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Document with number group", meta={"group": 42, "split_id": 2, "subgroup": "subB"}),
            Document(content="Document with boolean group", meta={"group": True, "split_id": 3, "subgroup": "subC"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_mixed_data_types)
        assert "documents" in result
        assert len(result["documents"]) == 3
        assert result["documents"][0].meta["group"] == "42"
        assert result["documents"][1].meta["group"] == 42
        assert result["documents"][2].meta["group"] is True

    def test_run_duplicate_documents(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when the input contains duplicate documents.
        """
        docs_with_duplicates = [
            Document(content="Duplicate 1", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Duplicate 1", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Unique document", meta={"group": "42", "split_id": 2, "subgroup": "subB"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_duplicates)
        assert "documents" in result
        assert len(result["documents"]) == 3
        assert result["documents"][0].content == "Duplicate 1"
        assert result["documents"][1].content == "Duplicate 1"
        assert result["documents"][2].content == "Unique document"

    def test_run_in_pipeline_dumps_and_loads(self) -> None:
        """
        Test if the MetaFieldGroupingRanker component can be dumped to a YAML string and reloaded from it.
        """
        ranker = MetaFieldGroupingRanker(group_by="group", sort_docs_by="split_id")
        result_single = ranker.run(documents=DOC_LIST)
        pipeline = Pipeline()
        pipeline.add_component("ranker", ranker)
        pipeline_yaml_str = pipeline.dumps()
        # Called on the class: no throwaway Pipeline instance is needed to deserialize.
        pipeline_reloaded = Pipeline.loads(pipeline_yaml_str)
        result: Dict[str, Any] = pipeline_reloaded.run(data={"documents": DOC_LIST})
        result = result["ranker"]
        assert result_single == result
|
||||
Loading…
x
Reference in New Issue
Block a user