docs: fixing AutoMergingRetriever docstring code (#10231)

* initial import

* cleaning leftover

* removing policy

* fixing output

* Apply suggestion from @anakin87

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update other retrievers causing issues in pydoc snippet runs

* Update haystack/components/retrievers/auto_merging_retriever.py

---------

Co-authored-by: Vladimir Blagojevic <dovlex@gmail.com>
Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
This commit is contained in:
David S. Batista 2025-12-12 15:41:32 +01:00 committed by GitHub
parent 2cd21982df
commit a6eb23c451
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 37 additions and 35 deletions

View File

@ -42,24 +42,26 @@ class AutoMergingRetriever:
# create a hierarchical document structure with 3 levels, where the parent document has 3 children
text = "The sun rose early in the morning. It cast a warm glow over the trees. Birds began to sing."
original_document = Document(content=text)
builder = HierarchicalDocumentSplitter(block_sizes=[10, 3], split_overlap=0, split_by="word")
builder = HierarchicalDocumentSplitter(block_sizes={10, 3}, split_overlap=0, split_by="word")
docs = builder.run([original_document])["documents"]
# store level-1 parent documents and initialize the retriever
doc_store_parents = InMemoryDocumentStore()
for doc in docs["documents"]:
if doc.meta["children_ids"] and doc.meta["level"] == 1:
for doc in docs:
if doc.meta["__children_ids"] and doc.meta["__level"] in [0,1]: # store the root document and level 1 documents
doc_store_parents.write_documents([doc])
retriever = AutoMergingRetriever(doc_store_parents, threshold=0.5)
# assume we retrieved 2 leaf docs from the same parent, the parent document should be returned,
# since it has 3 children and the threshold=0.5, and we retrieved 2 children (2/3 = 0.66(6) > 0.5)
leaf_docs = [doc for doc in docs["documents"] if not doc.meta["children_ids"]]
docs = retriever.run(leaf_docs[4:6])
>> {'documents': [Document(id=538..),
>> content: 'warm glow over the trees. Birds began to sing.',
>> meta: {'block_size': 10, 'parent_id': '835..', 'children_ids': ['c17...', '3ff...', '352...'], 'level': 1, 'source_id': '835...',
>> 'page_number': 1, 'split_id': 1, 'split_idx_start': 45})]}
leaf_docs = [doc for doc in docs if not doc.meta["__children_ids"]]
retrieved_docs = retriever.run(leaf_docs[4:6])
print(retrieved_docs["documents"])
# [Document(id=538..,
# content: 'warm glow over the trees. Birds began to sing.',
# meta: {'__block_size': 10, '__parent_id': '835..', '__children_ids': ['c17...', '3ff...', '352...'], '__level': 1, 'source_id': '835...',
# 'page_number': 1, 'split_id': 1, 'split_idx_start': 45})]
```
""" # noqa: E501

View File

@ -64,12 +64,12 @@ class MultiQueryEmbeddingRetriever:
result = multi_query_retriever.run(queries=queries)
for doc in result["documents"]:
print(f"Content: {doc.content}, Score: {doc.score}")
>> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 0.8509603046266574
>> Content: Renewable energy is energy that is collected from renewable resources., Score: 0.42763211298893034
>> Content: Solar energy is a type of green energy that is harnessed from the sun., Score: 0.40077417016494354
>> Content: Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources., Score: 0.3774863680
>> Content: Wind energy is another type of green energy that is generated by wind turbines., Score: 0.30914239725622
>> Content: Biomass energy is produced from organic materials, such as plant and animal waste., Score: 0.25173074243
# >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 0.8509603046266574
# >> Content: Renewable energy is energy that is collected from renewable resources., Score: 0.42763211298893034
# >> Content: Solar energy is a type of green energy that is harnessed from the sun., Score: 0.40077417016494354
# >> Content: Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources., Score: 0.3774863680
# >> Content: Wind energy is another type of green energy that is generated by wind turbines., Score: 0.30914239725622
# >> Content: Biomass energy is produced from organic materials, such as plant and animal waste., Score: 0.25173074243
```
""" # noqa: E501

View File

@ -49,10 +49,10 @@ class MultiQueryTextRetriever:
results = multiquery_retriever.run(queries=["renewable energy?", "Geothermal", "Hydropower"])
for doc in results["documents"]:
print(f"Content: {doc.content}, Score: {doc.score}")
>>
>> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 1.6474448833731097
>> Content: Hydropower is a form of renewable energy using the flow of water to generate electricity., Score: 1.615
>> Content: Renewable energy is energy that is collected from renewable resources., Score: 1.5255309812344944
# >>
# >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 1.6474448833731097
# >> Content: Hydropower is a form of renewable energy using the flow of water to generate electricity., Score: 1.615
# >> Content: Renewable energy is energy that is collected from renewable resources., Score: 1.5255309812344944
```
""" # noqa: E501

View File

@ -63,22 +63,22 @@ class SentenceWindowRetriever:
rag.run({'bm25_retriever': {"query":"third"}})
>> {'sentence_window_retriever': {'context_windows': ['some words. There is a second sentence.
>> And there is also a third sentence. It also contains a fourth sentence. And a fifth sentence. And a sixth
>> sentence. And a'], 'context_documents': [[Document(id=..., content: 'some words. There is a second sentence.
>> And there is ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 20,
>> '_split_overlap': [{'doc_id': '...', 'range': (20, 43)}, {'doc_id': '...', 'range': (0, 30)}]}),
>> Document(id=..., content: 'second sentence. And there is also a third sentence. It ',
>> meta: {'source_id': '74ea87deb38012873cf8c07e...f19d01a26a098447113e1d7b83efd30c02987114', 'page_number': 1,
>> 'split_id': 2, 'split_idx_start': 43, '_split_overlap': [{'doc_id': '...', 'range': (23, 53)}, {'doc_id': '...',
>> 'range': (0, 26)}]}), Document(id=..., content: 'also a third sentence. It also contains a fourth sentence. ',
>> meta: {'source_id': '...', 'page_number': 1, 'split_id': 3, 'split_idx_start': 73, '_split_overlap':
>> [{'doc_id': '...', 'range': (30, 56)}, {'doc_id': '...', 'range': (0, 33)}]}), Document(id=..., content:
>> 'also contains a fourth sentence. And a fifth sentence. And ', meta: {'source_id': '...', 'page_number': 1,
>> 'split_id': 4, 'split_idx_start': 99, '_split_overlap': [{'doc_id': '...', 'range': (26, 59)},
>> {'doc_id': '...', 'range': (0, 26)}]}), Document(id=..., content: 'And a fifth sentence. And a sixth sentence.
>> And a ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 5, 'split_idx_start': 132,
>> '_split_overlap': [{'doc_id': '...', 'range': (33, 59)}, {'doc_id': '...', 'range': (0, 24)}]})]]}}}}
# >> {'sentence_window_retriever': {'context_windows': ['some words. There is a second sentence.
# >> And there is also a third sentence. It also contains a fourth sentence. And a fifth sentence. And a sixth
# >> sentence. And a'], 'context_documents': [[Document(id=..., content: 'some words. There is a second sentence.
# >> And there is ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 20,
# >> '_split_overlap': [{'doc_id': '...', 'range': (20, 43)}, {'doc_id': '...', 'range': (0, 30)}]}),
# >> Document(id=..., content: 'second sentence. And there is also a third sentence. It ',
# >> meta: {'source_id': '74ea87deb38012873cf8c07e...f19d01a26a098447113e1d7b83efd30c02987114', 'page_number': 1,
# >> 'split_id': 2, 'split_idx_start': 43, '_split_overlap': [{'doc_id': '...', 'range': (23, 53)}, {'doc_id': '.',
# >> 'range': (0, 26)}]}), Document(id=..., content: 'also a third sentence. It also contains a fourth sentence. ',
# >> meta: {'source_id': '...', 'page_number': 1, 'split_id': 3, 'split_idx_start': 73, '_split_overlap':
# >> [{'doc_id': '...', 'range': (30, 56)}, {'doc_id': '...', 'range': (0, 33)}]}), Document(id=..., content:
# >> 'also contains a fourth sentence. And a fifth sentence. And ', meta: {'source_id': '...', 'page_number': 1,
# >> 'split_id': 4, 'split_idx_start': 99, '_split_overlap': [{'doc_id': '...', 'range': (26, 59)},
# >> {'doc_id': '...', 'range': (0, 26)}]}), Document(id=..., content: 'And a fifth sentence. And a sixth sentence.
# >> And a ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 5, 'split_idx_start': 132,
# >> '_split_overlap': [{'doc_id': '...', 'range': (33, 59)}, {'doc_id': '...', 'range': (0, 24)}]})]]}}}}
```
"""