diff --git a/haystack/components/retrievers/auto_merging_retriever.py b/haystack/components/retrievers/auto_merging_retriever.py index a9b229d6b..7ac4220e4 100644 --- a/haystack/components/retrievers/auto_merging_retriever.py +++ b/haystack/components/retrievers/auto_merging_retriever.py @@ -42,24 +42,26 @@ class AutoMergingRetriever: # create a hierarchical document structure with 3 levels, where the parent document has 3 children text = "The sun rose early in the morning. It cast a warm glow over the trees. Birds began to sing." original_document = Document(content=text) - builder = HierarchicalDocumentSplitter(block_sizes=[10, 3], split_overlap=0, split_by="word") + builder = HierarchicalDocumentSplitter(block_sizes={10, 3}, split_overlap=0, split_by="word") docs = builder.run([original_document])["documents"] # store level-1 parent documents and initialize the retriever doc_store_parents = InMemoryDocumentStore() - for doc in docs["documents"]: - if doc.meta["children_ids"] and doc.meta["level"] == 1: + for doc in docs: + if doc.meta["__children_ids"] and doc.meta["__level"] in [0,1]: # store the root document and level 1 documents doc_store_parents.write_documents([doc]) + retriever = AutoMergingRetriever(doc_store_parents, threshold=0.5) # assume we retrieved 2 leaf docs from the same parent, the parent document should be returned, # since it has 3 children and the threshold=0.5, and we retrieved 2 children (2/3 > 0.66(6)) - leaf_docs = [doc for doc in docs["documents"] if not doc.meta["children_ids"]] - docs = retriever.run(leaf_docs[4:6]) - >> {'documents': [Document(id=538..), - >> content: 'warm glow over the trees. 
Birds began to sing.', - >> meta: {'block_size': 10, 'parent_id': '835..', 'children_ids': ['c17...', '3ff...', '352...'], 'level': 1, 'source_id': '835...', - >> 'page_number': 1, 'split_id': 1, 'split_idx_start': 45})]} + leaf_docs = [doc for doc in docs if not doc.meta["__children_ids"]] + retrieved_docs = retriever.run(leaf_docs[4:6]) + print(retrieved_docs["documents"]) + # [Document(id=538..), + # content: 'warm glow over the trees. Birds began to sing.', + # meta: {'__block_size': 10, '__parent_id': '835..', '__children_ids': ['c17...', '3ff...', '352...'], '__level': 1, 'source_id': '835...', + # 'page_number': 1, 'split_id': 1, 'split_idx_start': 45})]} ``` """ # noqa: E501 diff --git a/haystack/components/retrievers/multi_query_embedding_retriever.py b/haystack/components/retrievers/multi_query_embedding_retriever.py index ac3b73267..fdde2e1d5 100644 --- a/haystack/components/retrievers/multi_query_embedding_retriever.py +++ b/haystack/components/retrievers/multi_query_embedding_retriever.py @@ -64,12 +64,12 @@ class MultiQueryEmbeddingRetriever: result = multi_query_retriever.run(queries=queries) for doc in result["documents"]: print(f"Content: {doc.content}, Score: {doc.score}") - >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 0.8509603046266574 - >> Content: Renewable energy is energy that is collected from renewable resources., Score: 0.42763211298893034 - >> Content: Solar energy is a type of green energy that is harnessed from the sun., Score: 0.40077417016494354 - >> Content: Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources., Score: 0.3774863680 - >> Content: Wind energy is another type of green energy that is generated by wind turbines., Score: 0.30914239725622 - >> Content: Biomass energy is produced from organic materials, such as plant and animal waste., Score: 0.25173074243 + # >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 
0.8509603046266574 + # >> Content: Renewable energy is energy that is collected from renewable resources., Score: 0.42763211298893034 + # >> Content: Solar energy is a type of green energy that is harnessed from the sun., Score: 0.40077417016494354 + # >> Content: Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources., Score: 0.3774863680 + # >> Content: Wind energy is another type of green energy that is generated by wind turbines., Score: 0.30914239725622 + # >> Content: Biomass energy is produced from organic materials, such as plant and animal waste., Score: 0.25173074243 ``` """ # noqa E501 diff --git a/haystack/components/retrievers/multi_query_text_retriever.py b/haystack/components/retrievers/multi_query_text_retriever.py index 84557aeb6..cd4282a51 100644 --- a/haystack/components/retrievers/multi_query_text_retriever.py +++ b/haystack/components/retrievers/multi_query_text_retriever.py @@ -49,10 +49,10 @@ class MultiQueryTextRetriever: results = multiquery_retriever.run(queries=["renewable energy?", "Geothermal", "Hydropower"]) for doc in results["documents"]: print(f"Content: {doc.content}, Score: {doc.score}") - >> - >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 1.6474448833731097 - >> Content: Hydropower is a form of renewable energy using the flow of water to generate electricity., Score: 1.615 - >> Content: Renewable energy is energy that is collected from renewable resources., Score: 1.5255309812344944 + # >> + # >> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 1.6474448833731097 + # >> Content: Hydropower is a form of renewable energy using the flow of water to generate electricity., Score: 1.615 + # >> Content: Renewable energy is energy that is collected from renewable resources., Score: 1.5255309812344944 ``` """ # noqa E501 diff --git a/haystack/components/retrievers/sentence_window_retriever.py 
b/haystack/components/retrievers/sentence_window_retriever.py index b3760658f..775905d45 100644 --- a/haystack/components/retrievers/sentence_window_retriever.py +++ b/haystack/components/retrievers/sentence_window_retriever.py @@ -63,22 +63,22 @@ class SentenceWindowRetriever: rag.run({'bm25_retriever': {"query":"third"}}) - >> {'sentence_window_retriever': {'context_windows': ['some words. There is a second sentence. - >> And there is also a third sentence. It also contains a fourth sentence. And a fifth sentence. And a sixth - >> sentence. And a'], 'context_documents': [[Document(id=..., content: 'some words. There is a second sentence. - >> And there is ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 20, - >> '_split_overlap': [{'doc_id': '...', 'range': (20, 43)}, {'doc_id': '...', 'range': (0, 30)}]}), - >> Document(id=..., content: 'second sentence. And there is also a third sentence. It ', - >> meta: {'source_id': '74ea87deb38012873cf8c07e...f19d01a26a098447113e1d7b83efd30c02987114', 'page_number': 1, - >> 'split_id': 2, 'split_idx_start': 43, '_split_overlap': [{'doc_id': '...', 'range': (23, 53)}, {'doc_id': '...', - >> 'range': (0, 26)}]}), Document(id=..., content: 'also a third sentence. It also contains a fourth sentence. ', - >> meta: {'source_id': '...', 'page_number': 1, 'split_id': 3, 'split_idx_start': 73, '_split_overlap': - >> [{'doc_id': '...', 'range': (30, 56)}, {'doc_id': '...', 'range': (0, 33)}]}), Document(id=..., content: - >> 'also contains a fourth sentence. And a fifth sentence. And ', meta: {'source_id': '...', 'page_number': 1, - >> 'split_id': 4, 'split_idx_start': 99, '_split_overlap': [{'doc_id': '...', 'range': (26, 59)}, - >> {'doc_id': '...', 'range': (0, 26)}]}), Document(id=..., content: 'And a fifth sentence. And a sixth sentence. 
- >> And a ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 5, 'split_idx_start': 132, - >> '_split_overlap': [{'doc_id': '...', 'range': (33, 59)}, {'doc_id': '...', 'range': (0, 24)}]})]]}}}} + # >> {'sentence_window_retriever': {'context_windows': ['some words. There is a second sentence. + # >> And there is also a third sentence. It also contains a fourth sentence. And a fifth sentence. And a sixth + # >> sentence. And a'], 'context_documents': [[Document(id=..., content: 'some words. There is a second sentence. + # >> And there is ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 20, + # >> '_split_overlap': [{'doc_id': '...', 'range': (20, 43)}, {'doc_id': '...', 'range': (0, 30)}]}), + # >> Document(id=..., content: 'second sentence. And there is also a third sentence. It ', + # >> meta: {'source_id': '74ea87deb38012873cf8c07e...f19d01a26a098447113e1d7b83efd30c02987114', 'page_number': 1, + # >> 'split_id': 2, 'split_idx_start': 43, '_split_overlap': [{'doc_id': '...', 'range': (23, 53)}, {'doc_id': '.', + # >> 'range': (0, 26)}]}), Document(id=..., content: 'also a third sentence. It also contains a fourth sentence. ', + # >> meta: {'source_id': '...', 'page_number': 1, 'split_id': 3, 'split_idx_start': 73, '_split_overlap': + # >> [{'doc_id': '...', 'range': (30, 56)}, {'doc_id': '...', 'range': (0, 33)}]}), Document(id=..., content: + # >> 'also contains a fourth sentence. And a fifth sentence. And ', meta: {'source_id': '...', 'page_number': 1, + # >> 'split_id': 4, 'split_idx_start': 99, '_split_overlap': [{'doc_id': '...', 'range': (26, 59)}, + # >> {'doc_id': '...', 'range': (0, 26)}]}), Document(id=..., content: 'And a fifth sentence. And a sixth sentence. + # >> And a ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 5, 'split_idx_start': 132, + # >> '_split_overlap': [{'doc_id': '...', 'range': (33, 59)}, {'doc_id': '...', 'range': (0, 24)}]})]]}}}} ``` """