mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-07 04:27:15 +00:00
docs: fixing AutoMergingRetriever docstring code (#10231)
* initial import * cleaning leftover * removing policy * fixing output * Apply suggestion from @anakin87 Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com> * Update other retrievers causing issues in pydoc snippet runs * Update haystack/components/retrievers/auto_merging_retriever.py --------- Co-authored-by: Vladimir Blagojevic <dovlex@gmail.com> Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
This commit is contained in:
parent
2cd21982df
commit
a6eb23c451
@ -42,24 +42,26 @@ class AutoMergingRetriever:
|
||||
# create a hierarchical document structure with 3 levels, where the parent document has 3 children
|
||||
text = "The sun rose early in the morning. It cast a warm glow over the trees. Birds began to sing."
|
||||
original_document = Document(content=text)
|
||||
builder = HierarchicalDocumentSplitter(block_sizes=[10, 3], split_overlap=0, split_by="word")
|
||||
builder = HierarchicalDocumentSplitter(block_sizes={10, 3}, split_overlap=0, split_by="word")
|
||||
docs = builder.run([original_document])["documents"]
|
||||
|
||||
# store level-1 parent documents and initialize the retriever
|
||||
doc_store_parents = InMemoryDocumentStore()
|
||||
for doc in docs["documents"]:
|
||||
if doc.meta["children_ids"] and doc.meta["level"] == 1:
|
||||
for doc in docs:
|
||||
if doc.meta["__children_ids"] and doc.meta["__level"] in [0,1]: # store the root document and level 1 documents
|
||||
doc_store_parents.write_documents([doc])
|
||||
|
||||
retriever = AutoMergingRetriever(doc_store_parents, threshold=0.5)
|
||||
|
||||
# assume we retrieved 2 leaf docs from the same parent, the parent document should be returned,
|
||||
# since it has 3 children and the threshold=0.5, and we retrieved 2 children (2/3 = 0.66(6) > 0.5)
|
||||
leaf_docs = [doc for doc in docs["documents"] if not doc.meta["children_ids"]]
|
||||
docs = retriever.run(leaf_docs[4:6])
|
||||
>> {'documents': [Document(id=538..),
|
||||
>> content: 'warm glow over the trees. Birds began to sing.',
|
||||
>> meta: {'block_size': 10, 'parent_id': '835..', 'children_ids': ['c17...', '3ff...', '352...'], 'level': 1, 'source_id': '835...',
|
||||
>> 'page_number': 1, 'split_id': 1, 'split_idx_start': 45})]}
|
||||
leaf_docs = [doc for doc in docs if not doc.meta["__children_ids"]]
|
||||
retrieved_docs = retriever.run(leaf_docs[4:6])
|
||||
print(retrieved_docs["documents"])
|
||||
# [Document(id=538..),
|
||||
# content: 'warm glow over the trees. Birds began to sing.',
|
||||
# meta: {'__block_size': 10, '__parent_id': '835..', '__children_ids': ['c17...', '3ff...', '352...'], '__level': 1, 'source_id': '835...',
|
||||
# 'page_number': 1, 'split_id': 1, 'split_idx_start': 45})]
|
||||
```
|
||||
""" # noqa: E501
|
||||
|
||||
|
||||
@ -64,12 +64,12 @@ class MultiQueryEmbeddingRetriever:
|
||||
result = multi_query_retriever.run(queries=queries)
|
||||
for doc in result["documents"]:
|
||||
print(f"Content: {doc.content}, Score: {doc.score}")
|
||||
>> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 0.8509603046266574
|
||||
>> Content: Renewable energy is energy that is collected from renewable resources., Score: 0.42763211298893034
|
||||
>> Content: Solar energy is a type of green energy that is harnessed from the sun., Score: 0.40077417016494354
|
||||
>> Content: Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources., Score: 0.3774863680
|
||||
>> Content: Wind energy is another type of green energy that is generated by wind turbines., Score: 0.30914239725622
|
||||
>> Content: Biomass energy is produced from organic materials, such as plant and animal waste., Score: 0.25173074243
|
||||
# >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 0.8509603046266574
|
||||
# >> Content: Renewable energy is energy that is collected from renewable resources., Score: 0.42763211298893034
|
||||
# >> Content: Solar energy is a type of green energy that is harnessed from the sun., Score: 0.40077417016494354
|
||||
# >> Content: Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources., Score: 0.3774863680
|
||||
# >> Content: Wind energy is another type of green energy that is generated by wind turbines., Score: 0.30914239725622
|
||||
# >> Content: Biomass energy is produced from organic materials, such as plant and animal waste., Score: 0.25173074243
|
||||
```
|
||||
""" # noqa E501
|
||||
|
||||
|
||||
@ -49,10 +49,10 @@ class MultiQueryTextRetriever:
|
||||
results = multiquery_retriever.run(queries=["renewable energy?", "Geothermal", "Hydropower"])
|
||||
for doc in results["documents"]:
|
||||
print(f"Content: {doc.content}, Score: {doc.score}")
|
||||
>>
|
||||
>> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 1.6474448833731097
|
||||
>> Content: Hydropower is a form of renewable energy using the flow of water to generate electricity., Score: 1.615
|
||||
>> Content: Renewable energy is energy that is collected from renewable resources., Score: 1.5255309812344944
|
||||
# >>
|
||||
# >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 1.6474448833731097
|
||||
# >> Content: Hydropower is a form of renewable energy using the flow of water to generate electricity., Score: 1.615
|
||||
# >> Content: Renewable energy is energy that is collected from renewable resources., Score: 1.5255309812344944
|
||||
```
|
||||
""" # noqa E501
|
||||
|
||||
|
||||
@ -63,22 +63,22 @@ class SentenceWindowRetriever:
|
||||
|
||||
rag.run({'bm25_retriever': {"query":"third"}})
|
||||
|
||||
>> {'sentence_window_retriever': {'context_windows': ['some words. There is a second sentence.
|
||||
>> And there is also a third sentence. It also contains a fourth sentence. And a fifth sentence. And a sixth
|
||||
>> sentence. And a'], 'context_documents': [[Document(id=..., content: 'some words. There is a second sentence.
|
||||
>> And there is ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 20,
|
||||
>> '_split_overlap': [{'doc_id': '...', 'range': (20, 43)}, {'doc_id': '...', 'range': (0, 30)}]}),
|
||||
>> Document(id=..., content: 'second sentence. And there is also a third sentence. It ',
|
||||
>> meta: {'source_id': '74ea87deb38012873cf8c07e...f19d01a26a098447113e1d7b83efd30c02987114', 'page_number': 1,
|
||||
>> 'split_id': 2, 'split_idx_start': 43, '_split_overlap': [{'doc_id': '...', 'range': (23, 53)}, {'doc_id': '...',
|
||||
>> 'range': (0, 26)}]}), Document(id=..., content: 'also a third sentence. It also contains a fourth sentence. ',
|
||||
>> meta: {'source_id': '...', 'page_number': 1, 'split_id': 3, 'split_idx_start': 73, '_split_overlap':
|
||||
>> [{'doc_id': '...', 'range': (30, 56)}, {'doc_id': '...', 'range': (0, 33)}]}), Document(id=..., content:
|
||||
>> 'also contains a fourth sentence. And a fifth sentence. And ', meta: {'source_id': '...', 'page_number': 1,
|
||||
>> 'split_id': 4, 'split_idx_start': 99, '_split_overlap': [{'doc_id': '...', 'range': (26, 59)},
|
||||
>> {'doc_id': '...', 'range': (0, 26)}]}), Document(id=..., content: 'And a fifth sentence. And a sixth sentence.
|
||||
>> And a ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 5, 'split_idx_start': 132,
|
||||
>> '_split_overlap': [{'doc_id': '...', 'range': (33, 59)}, {'doc_id': '...', 'range': (0, 24)}]})]]}}}}
|
||||
# >> {'sentence_window_retriever': {'context_windows': ['some words. There is a second sentence.
|
||||
# >> And there is also a third sentence. It also contains a fourth sentence. And a fifth sentence. And a sixth
|
||||
# >> sentence. And a'], 'context_documents': [[Document(id=..., content: 'some words. There is a second sentence.
|
||||
# >> And there is ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 20,
|
||||
# >> '_split_overlap': [{'doc_id': '...', 'range': (20, 43)}, {'doc_id': '...', 'range': (0, 30)}]}),
|
||||
# >> Document(id=..., content: 'second sentence. And there is also a third sentence. It ',
|
||||
# >> meta: {'source_id': '74ea87deb38012873cf8c07e...f19d01a26a098447113e1d7b83efd30c02987114', 'page_number': 1,
|
||||
# >> 'split_id': 2, 'split_idx_start': 43, '_split_overlap': [{'doc_id': '...', 'range': (23, 53)}, {'doc_id': '.',
|
||||
# >> 'range': (0, 26)}]}), Document(id=..., content: 'also a third sentence. It also contains a fourth sentence. ',
|
||||
# >> meta: {'source_id': '...', 'page_number': 1, 'split_id': 3, 'split_idx_start': 73, '_split_overlap':
|
||||
# >> [{'doc_id': '...', 'range': (30, 56)}, {'doc_id': '...', 'range': (0, 33)}]}), Document(id=..., content:
|
||||
# >> 'also contains a fourth sentence. And a fifth sentence. And ', meta: {'source_id': '...', 'page_number': 1,
|
||||
# >> 'split_id': 4, 'split_idx_start': 99, '_split_overlap': [{'doc_id': '...', 'range': (26, 59)},
|
||||
# >> {'doc_id': '...', 'range': (0, 26)}]}), Document(id=..., content: 'And a fifth sentence. And a sixth sentence.
|
||||
# >> And a ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 5, 'split_idx_start': 132,
|
||||
# >> '_split_overlap': [{'doc_id': '...', 'range': (33, 59)}, {'doc_id': '...', 'range': (0, 24)}]})]]}}}}
|
||||
```
|
||||
"""
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user