mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-07 04:27:15 +00:00
389 lines
13 KiB
Markdown
389 lines
13 KiB
Markdown
|
|
---
|
||
|
|
title: "Retrievers"
|
||
|
|
id: experimental-retrievers-api
|
||
|
|
description: "Sweep through Document Stores and return a set of candidate documents that are relevant to the query."
|
||
|
|
slug: "/experimental-retrievers-api"
|
||
|
|
---
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.chat_message_retriever"></a>
|
||
|
|
|
||
|
|
## Module haystack\_experimental.components.retrievers.chat\_message\_retriever
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.chat_message_retriever.ChatMessageRetriever"></a>
|
||
|
|
|
||
|
|
### ChatMessageRetriever
|
||
|
|
|
||
|
|
Retrieves chat messages from the underlying ChatMessageStore.
|
||
|
|
|
||
|
|
Usage example:
|
||
|
|
```python
|
||
|
|
from haystack.dataclasses import ChatMessage
|
||
|
|
from haystack_experimental.components.retrievers import ChatMessageRetriever
|
||
|
|
from haystack_experimental.chat_message_stores.in_memory import InMemoryChatMessageStore
|
||
|
|
|
||
|
|
messages = [
|
||
|
|
ChatMessage.from_assistant("Hello, how can I help you?"),
|
||
|
|
ChatMessage.from_user("Hi, I have a question about Python. What is a Protocol?"),
|
||
|
|
]
|
||
|
|
|
||
|
|
message_store = InMemoryChatMessageStore()
|
||
|
|
message_store.write_messages(messages)
|
||
|
|
retriever = ChatMessageRetriever(message_store)
|
||
|
|
|
||
|
|
result = retriever.run()
|
||
|
|
|
||
|
|
print(result["messages"])
|
||
|
|
```
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.chat_message_retriever.ChatMessageRetriever.__init__"></a>
|
||
|
|
|
||
|
|
#### ChatMessageRetriever.\_\_init\_\_
|
||
|
|
|
||
|
|
```python
|
||
|
|
def __init__(message_store: ChatMessageStore, last_k: int = 10)
|
||
|
|
```
|
||
|
|
|
||
|
|
Create the ChatMessageRetriever component.
|
||
|
|
|
||
|
|
**Arguments**:
|
||
|
|
|
||
|
|
- `message_store`: An instance of a ChatMessageStore.
|
||
|
|
- `last_k`: The number of last messages to retrieve. Defaults to 10 messages if not specified.
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.chat_message_retriever.ChatMessageRetriever.to_dict"></a>
|
||
|
|
|
||
|
|
#### ChatMessageRetriever.to\_dict
|
||
|
|
|
||
|
|
```python
|
||
|
|
def to_dict() -> Dict[str, Any]
|
||
|
|
```
|
||
|
|
|
||
|
|
Serializes the component to a dictionary.
|
||
|
|
|
||
|
|
**Returns**:
|
||
|
|
|
||
|
|
Dictionary with serialized data.
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.chat_message_retriever.ChatMessageRetriever.from_dict"></a>
|
||
|
|
|
||
|
|
#### ChatMessageRetriever.from\_dict
|
||
|
|
|
||
|
|
```python
|
||
|
|
@classmethod
|
||
|
|
def from_dict(cls, data: Dict[str, Any]) -> "ChatMessageRetriever"
|
||
|
|
```
|
||
|
|
|
||
|
|
Deserializes the component from a dictionary.
|
||
|
|
|
||
|
|
**Arguments**:
|
||
|
|
|
||
|
|
- `data`: The dictionary to deserialize from.
|
||
|
|
|
||
|
|
**Returns**:
|
||
|
|
|
||
|
|
The deserialized component.
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.chat_message_retriever.ChatMessageRetriever.run"></a>
|
||
|
|
|
||
|
|
#### ChatMessageRetriever.run
|
||
|
|
|
||
|
|
```python
|
||
|
|
@component.output_types(messages=List[ChatMessage])
|
||
|
|
def run(last_k: Optional[int] = None) -> Dict[str, List[ChatMessage]]
|
||
|
|
```
|
||
|
|
|
||
|
|
Run the ChatMessageRetriever.
|
||
|
|
|
||
|
|
**Arguments**:
|
||
|
|
|
||
|
|
- `last_k`: The number of last messages to retrieve. This parameter takes precedence over the last_k
|
||
|
|
parameter passed to the ChatMessageRetriever constructor. If unspecified, the last_k parameter passed
|
||
|
|
to the constructor will be used.
|
||
|
|
|
||
|
|
**Raises**:
|
||
|
|
|
||
|
|
- `ValueError`: If `last_k` is not None and is less than 1.
|
||
|
|
|
||
|
|
**Returns**:
|
||
|
|
|
||
|
|
- `messages` - The retrieved chat messages.
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.multi_query_embedding_retriever"></a>
|
||
|
|
|
||
|
|
## Module haystack\_experimental.components.retrievers.multi\_query\_embedding\_retriever
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.multi_query_embedding_retriever.MultiQueryEmbeddingRetriever"></a>
|
||
|
|
|
||
|
|
### MultiQueryEmbeddingRetriever
|
||
|
|
|
||
|
|
A component that retrieves documents using multiple queries in parallel with an embedding-based retriever.
|
||
|
|
|
||
|
|
This component takes a list of text queries, converts them to embeddings using a query embedder,
|
||
|
|
and then uses an embedding-based retriever to find relevant documents for each query in parallel.
|
||
|
|
The results are combined and sorted by relevance score.
|
||
|
|
|
||
|
|
### Usage example
|
||
|
|
|
||
|
|
```python
|
||
|
|
from haystack import Document
|
||
|
|
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
||
|
|
from haystack.document_stores.types import DuplicatePolicy
|
||
|
|
from haystack.components.embedders import SentenceTransformersTextEmbedder
|
||
|
|
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
|
||
|
|
from haystack.components.retrievers import InMemoryEmbeddingRetriever
|
||
|
|
from haystack.components.writers import DocumentWriter
|
||
|
|
from haystack_experimental.components.retrievers import MultiQueryEmbeddingRetriever
|
||
|
|
|
||
|
|
documents = [
|
||
|
|
Document(content="Renewable energy is energy that is collected from renewable resources."),
|
||
|
|
Document(content="Solar energy is a type of green energy that is harnessed from the sun."),
|
||
|
|
Document(content="Wind energy is another type of green energy that is generated by wind turbines."),
|
||
|
|
Document(content="Geothermal energy is heat that comes from the sub-surface of the earth."),
|
||
|
|
Document(content="Biomass energy is produced from organic materials, such as plant and animal waste."),
|
||
|
|
Document(content="Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources."),
|
||
|
|
]
|
||
|
|
|
||
|
|
# Populate the document store
|
||
|
|
doc_store = InMemoryDocumentStore()
|
||
|
|
doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
|
||
|
|
doc_embedder.warm_up()
|
||
|
|
doc_writer = DocumentWriter(document_store=doc_store, policy=DuplicatePolicy.SKIP)
|
||
|
|
documents = doc_embedder.run(documents)["documents"]
|
||
|
|
doc_writer.run(documents=documents)
|
||
|
|
|
||
|
|
# Run the multi-query retriever
|
||
|
|
in_memory_retriever = InMemoryEmbeddingRetriever(document_store=doc_store, top_k=1)
|
||
|
|
query_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
|
||
|
|
|
||
|
|
multi_query_retriever = MultiQueryEmbeddingRetriever(
|
||
|
|
retriever=in_memory_retriever,
|
||
|
|
query_embedder=query_embedder,
|
||
|
|
max_workers=3
|
||
|
|
)
|
||
|
|
|
||
|
|
queries = ["Geothermal energy", "natural gas", "turbines"]
|
||
|
|
result = multi_query_retriever.run(queries=queries)
|
||
|
|
for doc in result["documents"]:
|
||
|
|
print(f"Content: {doc.content}, Score: {doc.score}")
|
||
|
|
>> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 0.8509603046266574
|
||
|
|
>> Content: Renewable energy is energy that is collected from renewable resources., Score: 0.42763211298893034
|
||
|
|
>> Content: Solar energy is a type of green energy that is harnessed from the sun., Score: 0.40077417016494354
|
||
|
|
>> Content: Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources., Score: 0.3774863680995796
|
||
|
|
>> Content: Wind energy is another type of green energy that is generated by wind turbines., Score: 0.3091423972562246
|
||
|
|
>> Content: Biomass energy is produced from organic materials, such as plant and animal waste., Score: 0.25173074243668087
|
||
|
|
```
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.multi_query_embedding_retriever.MultiQueryEmbeddingRetriever.__init__"></a>
|
||
|
|
|
||
|
|
#### MultiQueryEmbeddingRetriever.\_\_init\_\_
|
||
|
|
|
||
|
|
```python
|
||
|
|
def __init__(*,
|
||
|
|
retriever: EmbeddingRetriever,
|
||
|
|
query_embedder: TextEmbedder,
|
||
|
|
max_workers: int = 3)
|
||
|
|
```
|
||
|
|
|
||
|
|
Initialize MultiQueryEmbeddingRetriever.
|
||
|
|
|
||
|
|
**Arguments**:
|
||
|
|
|
||
|
|
- `retriever`: The embedding-based retriever to use for document retrieval.
|
||
|
|
- `query_embedder`: The query embedder to convert text queries to embeddings.
|
||
|
|
- `max_workers`: Maximum number of worker threads for parallel processing. Default is 3.
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.multi_query_embedding_retriever.MultiQueryEmbeddingRetriever.warm_up"></a>
|
||
|
|
|
||
|
|
#### MultiQueryEmbeddingRetriever.warm\_up
|
||
|
|
|
||
|
|
```python
|
||
|
|
def warm_up() -> None
|
||
|
|
```
|
||
|
|
|
||
|
|
Warm up the query embedder and the retriever if either has a warm_up method.
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.multi_query_embedding_retriever.MultiQueryEmbeddingRetriever.run"></a>
|
||
|
|
|
||
|
|
#### MultiQueryEmbeddingRetriever.run
|
||
|
|
|
||
|
|
```python
|
||
|
|
@component.output_types(documents=List[Document])
|
||
|
|
def run(queries: List[str],
|
||
|
|
retriever_kwargs: Optional[dict[str, Any]] = None) -> dict[str, Any]
|
||
|
|
```
|
||
|
|
|
||
|
|
Retrieve documents using multiple queries in parallel.
|
||
|
|
|
||
|
|
**Arguments**:
|
||
|
|
|
||
|
|
- `queries`: List of text queries to process.
|
||
|
|
- `retriever_kwargs`: Optional dictionary of arguments to pass to the retriever's run method.
|
||
|
|
|
||
|
|
**Returns**:
|
||
|
|
|
||
|
|
A dictionary containing:
|
||
|
|
- `documents`: List of retrieved documents sorted by relevance score.
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.multi_query_embedding_retriever.MultiQueryEmbeddingRetriever.to_dict"></a>
|
||
|
|
|
||
|
|
#### MultiQueryEmbeddingRetriever.to\_dict
|
||
|
|
|
||
|
|
```python
|
||
|
|
def to_dict() -> dict[str, Any]
|
||
|
|
```
|
||
|
|
|
||
|
|
Serializes the component to a dictionary.
|
||
|
|
|
||
|
|
**Returns**:
|
||
|
|
|
||
|
|
A dictionary representing the serialized component.
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.multi_query_embedding_retriever.MultiQueryEmbeddingRetriever.from_dict"></a>
|
||
|
|
|
||
|
|
#### MultiQueryEmbeddingRetriever.from\_dict
|
||
|
|
|
||
|
|
```python
|
||
|
|
@classmethod
|
||
|
|
def from_dict(cls, data: dict[str, Any]) -> "MultiQueryEmbeddingRetriever"
|
||
|
|
```
|
||
|
|
|
||
|
|
Deserializes the component from a dictionary.
|
||
|
|
|
||
|
|
**Arguments**:
|
||
|
|
|
||
|
|
- `data`: The dictionary to deserialize from.
|
||
|
|
|
||
|
|
**Returns**:
|
||
|
|
|
||
|
|
The deserialized component.
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.multi_query_text_retriever"></a>
|
||
|
|
|
||
|
|
## Module haystack\_experimental.components.retrievers.multi\_query\_text\_retriever
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.multi_query_text_retriever.MultiQueryTextRetriever"></a>
|
||
|
|
|
||
|
|
### MultiQueryTextRetriever
|
||
|
|
|
||
|
|
A component that retrieves documents using multiple queries in parallel with a text-based retriever.
|
||
|
|
|
||
|
|
This component takes a list of text queries and uses a text-based retriever to find relevant documents for each
|
||
|
|
query in parallel, using a thread pool to manage concurrent execution. The results are combined and sorted by
|
||
|
|
relevance score.
|
||
|
|
|
||
|
|
You can use this component in combination with the QueryExpander component to enhance the retrieval process.
|
||
|
|
|
||
|
|
### Usage example
|
||
|
|
```python
|
||
|
|
from haystack import Document
|
||
|
|
from haystack.components.writers import DocumentWriter
|
||
|
|
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
||
|
|
from haystack.document_stores.types import DuplicatePolicy
|
||
|
|
from haystack.components.retrievers import InMemoryBM25Retriever
|
||
|
|
from haystack_experimental.components.query import QueryExpander
|
||
|
|
from haystack_experimental.components.retrievers.multi_query_text_retriever import MultiQueryTextRetriever
|
||
|
|
|
||
|
|
documents = [
|
||
|
|
Document(content="Renewable energy is energy that is collected from renewable resources."),
|
||
|
|
Document(content="Solar energy is a type of green energy that is harnessed from the sun."),
|
||
|
|
Document(content="Wind energy is another type of green energy that is generated by wind turbines."),
|
||
|
|
Document(content="Hydropower is a form of renewable energy using the flow of water to generate electricity."),
|
||
|
|
Document(content="Geothermal energy is heat that comes from the sub-surface of the earth.")
|
||
|
|
]
|
||
|
|
|
||
|
|
document_store = InMemoryDocumentStore()
|
||
|
|
doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)
|
||
|
|
doc_writer.run(documents=documents)
|
||
|
|
|
||
|
|
in_memory_retriever = InMemoryBM25Retriever(document_store=document_store, top_k=1)
|
||
|
|
multiquery_retriever = MultiQueryTextRetriever(retriever=in_memory_retriever)
|
||
|
|
results = multiquery_retriever.run(queries=["renewable energy?", "Geothermal", "Hydropower"])
|
||
|
|
for doc in results["documents"]:
|
||
|
|
print(f"Content: {doc.content}, Score: {doc.score}")
|
||
|
|
>>
|
||
|
|
>> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 1.6474448833731097
|
||
|
|
>> Content: Hydropower is a form of renewable energy using the flow of water to generate electricity., Score: 1.6157822790079805
|
||
|
|
>> Content: Renewable energy is energy that is collected from renewable resources., Score: 1.5255309812344944
|
||
|
|
```
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.multi_query_text_retriever.MultiQueryTextRetriever.__init__"></a>
|
||
|
|
|
||
|
|
#### MultiQueryTextRetriever.\_\_init\_\_
|
||
|
|
|
||
|
|
```python
|
||
|
|
def __init__(retriever: TextRetriever, max_workers: int = 3)
|
||
|
|
```
|
||
|
|
|
||
|
|
Initialize MultiQueryTextRetriever.
|
||
|
|
|
||
|
|
**Arguments**:
|
||
|
|
|
||
|
|
- `retriever`: The text-based retriever to use for document retrieval.
|
||
|
|
- `max_workers`: Maximum number of worker threads for parallel processing. Default is 3.
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.multi_query_text_retriever.MultiQueryTextRetriever.warm_up"></a>
|
||
|
|
|
||
|
|
#### MultiQueryTextRetriever.warm\_up
|
||
|
|
|
||
|
|
```python
|
||
|
|
def warm_up() -> None
|
||
|
|
```
|
||
|
|
|
||
|
|
Warm up the retriever if it has a warm_up method.
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.multi_query_text_retriever.MultiQueryTextRetriever.run"></a>
|
||
|
|
|
||
|
|
#### MultiQueryTextRetriever.run
|
||
|
|
|
||
|
|
```python
|
||
|
|
@component.output_types(documents=list[Document])
|
||
|
|
def run(queries: List[str],
|
||
|
|
retriever_kwargs: Optional[dict[str, Any]] = None) -> dict[str, Any]
|
||
|
|
```
|
||
|
|
|
||
|
|
Retrieve documents using multiple queries in parallel.
|
||
|
|
|
||
|
|
**Arguments**:
|
||
|
|
|
||
|
|
- `queries`: List of text queries to process.
|
||
|
|
- `retriever_kwargs`: Optional dictionary of arguments to pass to the retriever's run method.
|
||
|
|
|
||
|
|
**Returns**:
|
||
|
|
|
||
|
|
A dictionary containing:
|
||
|
|
- `documents`: List of retrieved documents sorted by relevance score.
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.multi_query_text_retriever.MultiQueryTextRetriever.to_dict"></a>
|
||
|
|
|
||
|
|
#### MultiQueryTextRetriever.to\_dict
|
||
|
|
|
||
|
|
```python
|
||
|
|
def to_dict() -> dict[str, Any]
|
||
|
|
```
|
||
|
|
|
||
|
|
Serializes the component to a dictionary.
|
||
|
|
|
||
|
|
**Returns**:
|
||
|
|
|
||
|
|
The serialized component as a dictionary.
|
||
|
|
|
||
|
|
<a id="haystack_experimental.components.retrievers.multi_query_text_retriever.MultiQueryTextRetriever.from_dict"></a>
|
||
|
|
|
||
|
|
#### MultiQueryTextRetriever.from\_dict
|
||
|
|
|
||
|
|
```python
|
||
|
|
@classmethod
|
||
|
|
def from_dict(cls, data: dict[str, Any]) -> "MultiQueryTextRetriever"
|
||
|
|
```
|
||
|
|
|
||
|
|
Deserializes the component from a dictionary.
|
||
|
|
|
||
|
|
**Arguments**:
|
||
|
|
|
||
|
|
- `data`: The dictionary to deserialize from.
|
||
|
|
|
||
|
|
**Returns**:
|
||
|
|
|
||
|
|
The deserialized component.
|
||
|
|
|