docs: fixing AutoMergingRetriever docstring code (#10231)

* initial import * cleaning leftover * removing policy * fixing output * Apply suggestion from @anakin87 Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com> * Update other retrievers causing issues in pydoc snippet runs * Update haystack/components/retrievers/auto_merging_retriever.py --------- Co-authored-by: Vladimir Blagojevic <dovlex@gmail.com> Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
2026-01-07 04:27:15 +00:00 · 2025-12-12 15:41:32 +01:00 · 2025-12-12 15:41:32 +01:00 · a6eb23c451
commit a6eb23c451
parent 2cd21982df
4 changed files with 37 additions and 35 deletions
--- a/haystack/components/retrievers/auto_merging_retriever.py
+++ b/haystack/components/retrievers/auto_merging_retriever.py
@ -42,24 +42,26 @@ class AutoMergingRetriever:
    # create a hierarchical document structure with 3 levels, where the parent document has 3 children
    text = "The sun rose early in the morning. It cast a warm glow over the trees. Birds began to sing."
    original_document = Document(content=text)
-    builder = HierarchicalDocumentSplitter(block_sizes=[10, 3], split_overlap=0, split_by="word")
+    builder = HierarchicalDocumentSplitter(block_sizes={10, 3}, split_overlap=0, split_by="word")
    docs = builder.run([original_document])["documents"]

    # store level-1 parent documents and initialize the retriever
    doc_store_parents = InMemoryDocumentStore()
-    for doc in docs["documents"]:
-        if doc.meta["children_ids"] and doc.meta["level"] == 1:
+    for doc in docs:
+        if doc.meta["__children_ids"] and doc.meta["__level"] in [0,1]:  # store the root document and level 1 documents
            doc_store_parents.write_documents([doc])
+
    retriever = AutoMergingRetriever(doc_store_parents, threshold=0.5)

    # assume we retrieved 2 leaf docs from the same parent, the parent document should be returned,
    # since it has 3 children and the threshold=0.5, and we retrieved 2 children (2/3 = 0.66(6) > 0.5)
-    leaf_docs = [doc for doc in docs["documents"] if not doc.meta["children_ids"]]
-    docs = retriever.run(leaf_docs[4:6])
-    >> {'documents': [Document(id=538..),
-    >> content: 'warm glow over the trees. Birds began to sing.',
-    >> meta: {'block_size': 10, 'parent_id': '835..', 'children_ids': ['c17...', '3ff...', '352...'], 'level': 1, 'source_id': '835...',
-    >> 'page_number': 1, 'split_id': 1, 'split_idx_start': 45})]}
+    leaf_docs = [doc for doc in docs if not doc.meta["__children_ids"]]
+    retrieved_docs = retriever.run(leaf_docs[4:6])
+    print(retrieved_docs["documents"])
+    # [Document(id=538..,
+    # content: 'warm glow over the trees. Birds began to sing.',
+    # meta: {'__block_size': 10, '__parent_id': '835..', '__children_ids': ['c17...', '3ff...', '352...'], '__level': 1, 'source_id': '835...',
+    # 'page_number': 1, 'split_id': 1, 'split_idx_start': 45})]
    ```
    """  # noqa: E501

--- a/haystack/components/retrievers/multi_query_embedding_retriever.py
+++ b/haystack/components/retrievers/multi_query_embedding_retriever.py
@ -64,12 +64,12 @@ class MultiQueryEmbeddingRetriever:
    result = multi_query_retriever.run(queries=queries)
    for doc in result["documents"]:
        print(f"Content: {doc.content}, Score: {doc.score}")
-    >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 0.8509603046266574
-    >> Content: Renewable energy is energy that is collected from renewable resources., Score: 0.42763211298893034
-    >> Content: Solar energy is a type of green energy that is harnessed from the sun., Score: 0.40077417016494354
-    >> Content: Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources., Score: 0.3774863680
-    >> Content: Wind energy is another type of green energy that is generated by wind turbines., Score: 0.30914239725622
-    >> Content: Biomass energy is produced from organic materials, such as plant and animal waste., Score: 0.25173074243
+    # >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 0.8509603046266574
+    # >> Content: Renewable energy is energy that is collected from renewable resources., Score: 0.42763211298893034
+    # >> Content: Solar energy is a type of green energy that is harnessed from the sun., Score: 0.40077417016494354
+    # >> Content: Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources., Score: 0.3774863680
+    # >> Content: Wind energy is another type of green energy that is generated by wind turbines., Score: 0.30914239725622
+    # >> Content: Biomass energy is produced from organic materials, such as plant and animal waste., Score: 0.25173074243
    ```
    """  # noqa E501

--- a/haystack/components/retrievers/multi_query_text_retriever.py
+++ b/haystack/components/retrievers/multi_query_text_retriever.py
@ -49,10 +49,10 @@ class MultiQueryTextRetriever:
    results = multiquery_retriever.run(queries=["renewable energy?", "Geothermal", "Hydropower"])
    for doc in results["documents"]:
        print(f"Content: {doc.content}, Score: {doc.score}")
-    >>
-    >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 1.6474448833731097
-    >> Content: Hydropower is a form of renewable energy using the flow of water to generate electricity., Score: 1.615
-    >> Content: Renewable energy is energy that is collected from renewable resources., Score: 1.5255309812344944
+    # >>
+    # >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 1.6474448833731097
+    # >> Content: Hydropower is a form of renewable energy using the flow of water to generate electricity., Score: 1.615
+    # >> Content: Renewable energy is energy that is collected from renewable resources., Score: 1.5255309812344944
    ```
    """  # noqa E501

--- a/haystack/components/retrievers/sentence_window_retriever.py
+++ b/haystack/components/retrievers/sentence_window_retriever.py
@ -63,22 +63,22 @@ class SentenceWindowRetriever:

    rag.run({'bm25_retriever': {"query":"third"}})

-    >> {'sentence_window_retriever': {'context_windows': ['some words. There is a second sentence.
-    >> And there is also a third sentence. It also contains a fourth sentence. And a fifth sentence. And a sixth
-    >> sentence. And a'], 'context_documents': [[Document(id=..., content: 'some words. There is a second sentence.
-    >> And there is ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 20,
-    >> '_split_overlap': [{'doc_id': '...', 'range': (20, 43)}, {'doc_id': '...', 'range': (0, 30)}]}),
-    >> Document(id=..., content: 'second sentence. And there is also a third sentence. It ',
-    >> meta: {'source_id': '74ea87deb38012873cf8c07e...f19d01a26a098447113e1d7b83efd30c02987114', 'page_number': 1,
-    >> 'split_id': 2, 'split_idx_start': 43, '_split_overlap': [{'doc_id': '...', 'range': (23, 53)}, {'doc_id': '...',
-    >> 'range': (0, 26)}]}), Document(id=..., content: 'also a third sentence. It also contains a fourth sentence. ',
-    >> meta: {'source_id': '...', 'page_number': 1, 'split_id': 3, 'split_idx_start': 73, '_split_overlap':
-    >> [{'doc_id': '...', 'range': (30, 56)}, {'doc_id': '...', 'range': (0, 33)}]}), Document(id=..., content:
-    >> 'also contains a fourth sentence. And a fifth sentence. And ', meta: {'source_id': '...', 'page_number': 1,
-    >> 'split_id': 4, 'split_idx_start': 99, '_split_overlap': [{'doc_id': '...', 'range': (26, 59)},
-    >> {'doc_id': '...', 'range': (0, 26)}]}), Document(id=..., content: 'And a fifth sentence. And a sixth sentence.
-    >> And a ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 5, 'split_idx_start': 132,
-    >> '_split_overlap': [{'doc_id': '...', 'range': (33, 59)}, {'doc_id': '...', 'range': (0, 24)}]})]]}}}}
+    # >> {'sentence_window_retriever': {'context_windows': ['some words. There is a second sentence.
+    # >> And there is also a third sentence. It also contains a fourth sentence. And a fifth sentence. And a sixth
+    # >> sentence. And a'], 'context_documents': [[Document(id=..., content: 'some words. There is a second sentence.
+    # >> And there is ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 20,
+    # >> '_split_overlap': [{'doc_id': '...', 'range': (20, 43)}, {'doc_id': '...', 'range': (0, 30)}]}),
+    # >> Document(id=..., content: 'second sentence. And there is also a third sentence. It ',
+    # >> meta: {'source_id': '74ea87deb38012873cf8c07e...f19d01a26a098447113e1d7b83efd30c02987114', 'page_number': 1,
+    # >> 'split_id': 2, 'split_idx_start': 43, '_split_overlap': [{'doc_id': '...', 'range': (23, 53)}, {'doc_id': '.',
+    # >> 'range': (0, 26)}]}), Document(id=..., content: 'also a third sentence. It also contains a fourth sentence. ',
+    # >> meta: {'source_id': '...', 'page_number': 1, 'split_id': 3, 'split_idx_start': 73, '_split_overlap':
+    # >> [{'doc_id': '...', 'range': (30, 56)}, {'doc_id': '...', 'range': (0, 33)}]}), Document(id=..., content:
+    # >> 'also contains a fourth sentence. And a fifth sentence. And ', meta: {'source_id': '...', 'page_number': 1,
+    # >> 'split_id': 4, 'split_idx_start': 99, '_split_overlap': [{'doc_id': '...', 'range': (26, 59)},
+    # >> {'doc_id': '...', 'range': (0, 26)}]}), Document(id=..., content: 'And a fifth sentence. And a sixth sentence.
+    # >> And a ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 5, 'split_idx_start': 132,
+    # >> '_split_overlap': [{'doc_id': '...', 'range': (33, 59)}, {'doc_id': '...', 'range': (0, 24)}]})]]}}}}
    ```
    """