feat: adding metadata grouper component (#8512)

* initial import

* making tests more readable; adding docstring

* adding release notes

* adding LICENSE header

* Update test/components/rankers/test_metadata_grouper.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* refactoring

* fixing docstring

* fixing types

* test docstrings

* renaming test

* handling too-many-arguments

* linting

* Update haystack/components/rankers/metadata_grouper.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* changing name

* Update haystack/components/rankers/metadata_grouper.py

Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>

* Update haystack/components/rankers/metadata_grouper.py

Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>

* assigning value inside function for re-use

* improving docstring

* updating name to MetaFieldGroupingRanker

* adding to pydocs

* fixing imports

* adding output docstring

* Update haystack/components/rankers/meta_field_grouper_ranker.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update haystack/components/rankers/__init__.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update releasenotes/notes/add-metadata-grouper-21ec05fd4a307425.yaml

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update test/components/rankers/test_metadata_grouper.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* update docstring tests

* fixing imports

* rename modules for consistency

* fix pydocs

* simplification + more tests

---------

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
This commit is contained in:
David S. Batista 2024-11-12 16:01:53 +01:00 committed by GitHub
parent fcdf392bfb
commit e5a80722c2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 307 additions and 2 deletions

View File

@ -1,7 +1,7 @@
loaders:
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
search_path: [../../../haystack/components/rankers]
modules: ["lost_in_the_middle", "meta_field", "transformers_similarity", "sentence_transformers_diversity"]
modules: ["lost_in_the_middle", "meta_field", "meta_field_grouping_ranker", "transformers_similarity", "sentence_transformers_diversity"]
ignore_when_discovered: ["__init__"]
processors:
- type: filter

View File

@ -4,12 +4,14 @@
from haystack.components.rankers.lost_in_the_middle import LostInTheMiddleRanker
from haystack.components.rankers.meta_field import MetaFieldRanker
from haystack.components.rankers.meta_field_grouping_ranker import MetaFieldGroupingRanker
from haystack.components.rankers.sentence_transformers_diversity import SentenceTransformersDiversityRanker
from haystack.components.rankers.transformers_similarity import TransformersSimilarityRanker
# Names exported by this module on `from ... import *`; one entry per ranker class imported above.
__all__ = [
    "LostInTheMiddleRanker",
    "MetaFieldRanker",
    "MetaFieldGroupingRanker",
    "SentenceTransformersDiversityRanker",
    "TransformersSimilarityRanker",
]

View File

@ -0,0 +1,118 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from collections import defaultdict
from typing import Any, Dict, List, Optional, cast
from haystack import Document, component, logging
logger = logging.getLogger(__name__)
@component
class MetaFieldGroupingRanker:
    """
    Reorders the documents by grouping them based on metadata keys.

    The MetaFieldGroupingRanker can group documents by a primary metadata key `group_by`, and subgroup them with an optional
    secondary key, `subgroup_by`.
    Within each group or subgroup, it can also sort documents by a metadata key `sort_docs_by`.

    The output is a flat list of documents in which all documents sharing the same `group_by` (and `subgroup_by`) value
    are adjacent. Groups and subgroups appear in the order in which their values are first encountered in the input.
    Group and subgroup values are compared by their string representation.
    Any documents without a group are placed at the end of the list.

    The proper organization of documents helps improve the efficiency and performance of subsequent processing by an LLM.

    ### Usage example

    ```python
    from haystack.components.rankers import MetaFieldGroupingRanker
    from haystack.dataclasses import Document


    docs = [
        Document(content="Javascript is a popular programming language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
        Document(content="Python is a popular programming language", meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
        Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
        Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
        Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"})
    ]

    ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
    result = ranker.run(documents=docs)
    print(result["documents"])

    # [
    #     Document(id=d665bbc83e52c08c3d8275bccf4f22bf2bfee21c6e77d78794627637355b8ebc,
    #             content: 'Java is a popular programming language', meta: {'group': '42', 'split_id': 3, 'subgroup': 'subB'}),
    #     Document(id=a20b326f07382b3cbf2ce156092f7c93e8788df5d48f2986957dce2adb5fe3c2,
    #             content: 'Python is a popular programming language', meta: {'group': '42', 'split_id': 4, 'subgroup': 'subB'}),
    #     Document(id=ce12919795d22f6ca214d0f161cf870993889dcb146f3bb1b3e1ffdc95be960f,
    #             content: 'Javascript is a popular programming language', meta: {'group': '42', 'split_id': 7, 'subgroup': 'subB'}),
    #     Document(id=d9fc857046c904e5cf790b3969b971b1bbdb1b3037d50a20728fdbf82991aa94,
    #             content: 'A chromosome is a package of DNA', meta: {'group': '314', 'split_id': 2, 'subgroup': 'subC'}),
    #     Document(id=6d3b7bdc13d09aa01216471eb5fb0bfdc53c5f2f3e98ad125ff6b85d3106c9a3,
    #             content: 'An octopus has three hearts', meta: {'group': '11', 'split_id': 2, 'subgroup': 'subD'})
    # ]
    ```
    """  # noqa: E501

    def __init__(self, group_by: str, subgroup_by: Optional[str] = None, sort_docs_by: Optional[str] = None):
        """
        Creates an instance of MetaFieldGroupingRanker.

        :param group_by: The metadata key to aggregate the documents by.
        :param subgroup_by: The metadata key to aggregate the documents within a group that was created by the
                            `group_by` key.
        :param sort_docs_by: Determines which metadata key is used to sort the documents. If not provided, the
                             documents within the groups or subgroups are not sorted and are kept in the same order as
                             they were inserted in the subgroups.
        """
        self.group_by = group_by
        self.sort_docs_by = sort_docs_by
        self.subgroup_by = subgroup_by

    def _sort_key(self, doc: Document) -> tuple:
        """
        Builds the key used to sort a document by the `sort_docs_by` metadata key.

        The key is an `(is_missing, value)` pair: documents that lack the metadata key (or store `None` under it)
        sort after all others, and their placeholder is never compared against a real value. This keeps sorting
        working for non-numeric sort values such as strings.

        :param doc: The document to build the sort key for.
        :returns:
            A tuple `(is_missing, value)`.
        """
        value = doc.meta.get(cast(str, self.sort_docs_by))
        return (value is None, value)

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, Any]:
        """
        Groups the provided list of documents based on the `group_by` parameter and optionally the `subgroup_by`.

        The output is a list of documents reordered based on how they were grouped.

        :param documents: The list of documents to group.
        :returns:
            A dictionary with the following keys:
            - documents: The list of documents ordered by the `group_by` and `subgroup_by` metadata values.
        """
        if not documents:
            return {"documents": []}

        # Plain dicts keep insertion order, so groups/subgroups are emitted in first-seen order.
        document_groups: Dict[str, Dict[str, List[Document]]] = defaultdict(lambda: defaultdict(list))
        no_group_docs: List[Document] = []

        for doc in documents:
            # Stringify so that unhashable values (e.g. lists) can still be used as grouping keys.
            group_value = str(doc.meta.get(self.group_by, ""))
            if not group_value:
                no_group_docs.append(doc)
                continue

            subgroup_value = "no_subgroup"
            if self.subgroup_by and self.subgroup_by in doc.meta:
                # Same treatment as the group value: without str() a list-valued subgroup raises TypeError.
                subgroup_value = str(doc.meta[self.subgroup_by])

            document_groups[group_value][subgroup_value].append(doc)

        ordered_docs: List[Document] = []
        for subgroups in document_groups.values():
            for docs in subgroups.values():
                if self.sort_docs_by:
                    # list.sort is stable: documents with equal (or missing) sort values keep their input order.
                    docs.sort(key=self._sort_key)
                ordered_docs.extend(docs)

        ordered_docs.extend(no_group_docs)
        return {"documents": ordered_docs}

View File

@ -0,0 +1,4 @@
---
features:
- |
We have added a new MetaFieldGroupingRanker component that reorders documents by grouping them based on metadata keys. This can be useful for pre-processing Documents before feeding them to an LLM.

View File

@ -332,4 +332,4 @@ class TestOpenAIGenerator:
"Can you explain the Pitagoras therom?",
system_prompt="You answer in German, regardless of the language on which a question is asked.",
)
assert "pythagoras".lower() in result["replies"][0].lower()
assert "pythagoras" in result["replies"][0].lower()

View File

@ -0,0 +1,181 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict
from haystack import Pipeline
from haystack.dataclasses import Document
from haystack.components.rankers.meta_field_grouping_ranker import MetaFieldGroupingRanker
# Shared fixture for the tests below. The order of the entries matters: the tests assert on output positions.
DOC_LIST = [
    # documents keyed by "group" / "subgroup" / "split_id" (the "bla bla bla bla" one has no "group" key)
    Document(content="Javascript is a popular language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
    Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
    Document(content="DNA carries genetic information", meta={"group": "314", "split_id": 1, "subgroup": "subE"}),
    Document(content="Blue whales have a big heart", meta={"group": "11", "split_id": 8, "subgroup": "subF"}),
    Document(content="Python is a popular language", meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
    Document(content="bla bla bla bla", meta={"split_id": 8, "subgroup": "subG"}),
    Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"}),
    Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
    # documents without a "split_id" (two of them also lack a "subgroup")
    Document(content="without split id", meta={"group": "11"}),
    Document(content="without split id2", meta={"group": "22", "subgroup": "subI"}),
    Document(content="without split id3", meta={"group": "11"}),
    # documents whose grouping metadata ("value_list" / "sub_value_list") holds lists; they have no "group" key
    Document(content="list values", meta={"value_list": ["11"], "split_id": 8, "sub_value_list": ["subF"]}),
    Document(content="list values2", meta={"value_list": ["12"], "split_id": 3, "sub_value_list": ["subX"]}),
    Document(content="list values3", meta={"value_list": ["12"], "split_id": 8, "sub_value_list": ["subX"]}),
]
class TestMetaFieldGroupingRanker:
    """Unit tests for the MetaFieldGroupingRanker component."""

    def test_init_default(self) -> None:
        """
        Test the default initialization of the MetaFieldGroupingRanker component.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group")
        # Only `group_by` is mandatory; the optional keys must default to None.
        assert sample_ranker.group_by == "group"
        assert sample_ranker.subgroup_by is None
        assert sample_ranker.sort_docs_by is None

        result = sample_ranker.run(documents=[])
        assert "documents" in result
        assert result["documents"] == []

    def test_run_group_by_only(self) -> None:
        """
        Test the MetaFieldGroupingRanker component with only the 'group_by' parameter. No subgroup or sorting is done.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group")
        result = sample_ranker.run(documents=DOC_LIST)
        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        docs = result["documents"]
        # Groups appear in first-seen order and keep the input order inside each group.
        assert docs[0].meta["split_id"] == 7 and docs[0].meta["group"] == "42"
        assert docs[1].meta["split_id"] == 4 and docs[1].meta["group"] == "42"
        assert docs[2].meta["split_id"] == 3 and docs[2].meta["group"] == "42"
        assert docs[3].meta["split_id"] == 2 and docs[3].meta["group"] == "314"
        assert docs[4].meta["split_id"] == 1 and docs[4].meta["group"] == "314"
        assert docs[5].meta["split_id"] == 8 and docs[5].meta["group"] == "11"
        assert docs[6].meta["split_id"] == 2 and docs[6].meta["group"] == "11"
        assert docs[7].content == "without split id" and docs[7].meta["group"] == "11"
        assert docs[8].content == "without split id3" and docs[8].meta["group"] == "11"
        assert docs[9].content == "without split id2" and docs[9].meta["group"] == "22"
        # Documents without the group key come last.
        assert docs[10].content == "bla bla bla bla"

    def test_with_group_subgroup_and_sorting(self) -> None:
        """
        Test the MetaFieldGroupingRanker component with all parameters set, i.e.: grouping by 'group', subgrouping by 'subgroup',
        and sorting by 'split_id'.
        """
        ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = ranker.run(documents=DOC_LIST)

        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        docs = result["documents"]
        assert (
            docs[0].meta["subgroup"] == "subB"
            and docs[0].meta["group"] == "42"
            and docs[0].meta["split_id"] == 3
        )
        assert (
            docs[1].meta["subgroup"] == "subB"
            and docs[1].meta["group"] == "42"
            and docs[1].meta["split_id"] == 4
        )
        assert (
            docs[2].meta["subgroup"] == "subB"
            and docs[2].meta["group"] == "42"
            and docs[2].meta["split_id"] == 7
        )
        assert docs[3].meta["subgroup"] == "subC" and docs[3].meta["group"] == "314"
        assert docs[4].meta["subgroup"] == "subE" and docs[4].meta["group"] == "314"
        # Each assertion checks subgroup and group of the SAME document.
        assert docs[5].meta["subgroup"] == "subF" and docs[5].meta["group"] == "11"
        assert docs[6].meta["subgroup"] == "subD" and docs[6].meta["group"] == "11"
        assert docs[7].content == "without split id" and docs[7].meta["group"] == "11"
        assert docs[8].content == "without split id3" and docs[8].meta["group"] == "11"
        assert docs[9].content == "without split id2" and docs[9].meta["group"] == "22"
        assert docs[10].content == "bla bla bla bla"

    def test_run_with_lists(self) -> None:
        """
        Test if the MetaFieldGroupingRanker component can handle list values in the metadata.
        """
        # NOTE(review): "subvaluelist" does not match the "sub_value_list" key used in DOC_LIST, so every grouped
        # document falls back to the default subgroup and list-valued subgroups are not exercised here.
        ranker = MetaFieldGroupingRanker(group_by="value_list", subgroup_by="subvaluelist", sort_docs_by="split_id")
        result = ranker.run(documents=DOC_LIST)
        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        docs = result["documents"]
        assert docs[0].content == "list values" and docs[0].meta["value_list"] == ["11"]
        assert docs[1].content == "list values2" and docs[1].meta["value_list"] == ["12"]
        assert docs[2].content == "list values3" and docs[2].meta["value_list"] == ["12"]

    def test_run_empty_input(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component with an empty list of documents.
        """
        sample_ranker = MetaFieldGroupingRanker(group_by="group")
        result = sample_ranker.run(documents=[])
        assert "documents" in result
        assert result["documents"] == []

    def test_run_missing_metadata_keys(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when some documents are missing the required metadata keys.
        """
        docs_with_missing_keys = [
            Document(content="Document without group", meta={"split_id": 1, "subgroup": "subA"}),
            Document(content="Document without subgroup", meta={"group": "42", "split_id": 2}),
            Document(content="Document with all keys", meta={"group": "42", "split_id": 3, "subgroup": "subB"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_missing_keys)
        assert "documents" in result
        assert len(result["documents"]) == 3
        assert result["documents"][0].meta["group"] == "42"
        assert result["documents"][1].meta["group"] == "42"
        assert result["documents"][2].content == "Document without group"

    def test_run_metadata_with_different_data_types(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when the metadata values have different data types.
        """
        docs_with_mixed_data_types = [
            Document(content="Document with string group", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Document with number group", meta={"group": 42, "split_id": 2, "subgroup": "subB"}),
            Document(content="Document with boolean group", meta={"group": True, "split_id": 3, "subgroup": "subC"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_mixed_data_types)
        assert "documents" in result
        assert len(result["documents"]) == 3
        assert result["documents"][0].meta["group"] == "42"
        assert result["documents"][1].meta["group"] == 42
        assert result["documents"][2].meta["group"] is True

    def test_run_duplicate_documents(self) -> None:
        """
        Test the behavior of the MetaFieldGroupingRanker component when the input contains duplicate documents.
        """
        docs_with_duplicates = [
            Document(content="Duplicate 1", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Duplicate 1", meta={"group": "42", "split_id": 1, "subgroup": "subA"}),
            Document(content="Unique document", meta={"group": "42", "split_id": 2, "subgroup": "subB"}),
        ]
        sample_ranker = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = sample_ranker.run(documents=docs_with_duplicates)
        assert "documents" in result
        assert len(result["documents"]) == 3
        assert result["documents"][0].content == "Duplicate 1"
        assert result["documents"][1].content == "Duplicate 1"
        assert result["documents"][2].content == "Unique document"

    def test_run_in_pipeline_dumps_and_loads(self) -> None:
        """
        Test if the MetaFieldGroupingRanker component can be dumped to a YAML string and reloaded from it.
        """
        ranker = MetaFieldGroupingRanker(group_by="group", sort_docs_by="split_id")
        result_single = ranker.run(documents=DOC_LIST)
        pipeline = Pipeline()
        pipeline.add_component("ranker", ranker)
        pipeline_yaml_str = pipeline.dumps()
        # `loads` is a classmethod: call it on the class instead of on a throwaway instance.
        pipeline_reloaded = Pipeline.loads(pipeline_yaml_str)
        result: Dict[str, Any] = pipeline_reloaded.run(data={"documents": DOC_LIST})
        result = result["ranker"]
        assert result_single == result