docs: Added LostInTheMiddleRanker usage example and updated docstrings (#7294)

* docs: Added LostInTheMiddleRanker usage example

* remove to_dict test

* explain LITM in more detail
This commit is contained in:
Julian Risch 2024-03-04 15:42:51 +01:00 committed by GitHub
parent 0e7c41be5e
commit 9a0e2e58fd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 34 additions and 26 deletions

View File

@ -1,17 +1,34 @@
from typing import Any, Dict, List, Optional
from typing import Dict, List, Optional
from haystack import Document, component, default_to_dict
from haystack import Document, component
@component
class LostInTheMiddleRanker:
"""
The LostInTheMiddleRanker implements a ranker that reorders documents based on the "lost in the middle" order.
Following the paper "Lost in the Middle: How Language Models Use Long Contexts" by Liu et al., it lays out paragraphs
in the LLM context so that the most relevant paragraphs are at the beginning or end of the input context, while the
least relevant information is in the middle of the context.
Ranks documents based on the 'lost in the middle' order so that the most relevant documents are either at the
beginning or end, while the least relevant are in the middle.
See https://arxiv.org/abs/2307.03172 for more details.
LostInTheMiddleRanker assumes that some prior component in the pipeline has already ranked documents by relevance
and requires no query as input but only documents. It is typically used as the last component before building a
prompt for an LLM to prepare the input context for the LLM.
Lost in the Middle ranking lays out document contents in the LLM context so that the most relevant contents are at
the beginning or end of the input context, while the least relevant are in the middle of the context. See the
paper ["Lost in the Middle: How Language Models Use Long Contexts"](https://arxiv.org/abs/2307.03172) for more
details.
Usage example:
```python
from haystack.components.rankers import LostInTheMiddleRanker
from haystack import Document
ranker = LostInTheMiddleRanker()
docs = [Document(content="Paris"), Document(content="Berlin"), Document(content="Madrid")]
result = ranker.run(documents=docs)
for doc in result["documents"]:
print(doc.content)
```
"""
def __init__(self, word_count_threshold: Optional[int] = None, top_k: Optional[int] = None):
@ -34,23 +51,22 @@ class LostInTheMiddleRanker:
self.word_count_threshold = word_count_threshold
self.top_k = top_k
def to_dict(self) -> Dict[str, Any]:
    """
    Serialize this component to a dictionary.

    :returns:
        The dictionary produced by `default_to_dict` for this instance, built from its
        `word_count_threshold` and `top_k` init parameters.
    """
    threshold = self.word_count_threshold
    limit = self.top_k
    return default_to_dict(self, word_count_threshold=threshold, top_k=limit)
@component.output_types(documents=List[Document])
def run(
self, documents: List[Document], top_k: Optional[int] = None, word_count_threshold: Optional[int] = None
) -> Dict[str, List[Document]]:
"""
Reranks documents based on the "lost in the middle" order.
Returns a dictionary whose `documents` entry holds the reordered list of Documents; no query is required.
:param documents: List of Documents to reorder.
:param top_k: The number of documents to return.
:param word_count_threshold: The maximum total number of words across all documents selected by the ranker.
:return: The reordered documents.
:param documents: List of Documents to reorder.
:param top_k: The maximum number of documents to return.
:param word_count_threshold: The maximum total number of words across all documents selected by the ranker.
:returns:
A dictionary with the following keys:
- `documents`: Reranked list of Documents
:raises ValueError:
If any of the documents is not textual, or if `word_count_threshold` is an integer less than or equal to zero.
"""
if isinstance(word_count_threshold, int) and word_count_threshold <= 0:
raise ValueError(

View File

@ -94,11 +94,3 @@ class TestLostInTheMiddleRanker:
# top_k is greater than the number of documents, so all documents should be returned in LITM order
assert len(result["documents"]) == len(docs)
assert result == ranker.run(documents=docs)
def test_to_dict(self):
    # A ranker built with default arguments must serialize with both init parameters left as None.
    expected = {
        "type": "haystack.components.rankers.lost_in_the_middle.LostInTheMiddleRanker",
        "init_parameters": {"word_count_threshold": None, "top_k": None},
    }
    ranker = LostInTheMiddleRanker()
    assert ranker.to_dict() == expected