From a5189dd035ffcd3d5861f57539e2c723a53cfe6c Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Fri, 12 Jan 2024 17:50:55 +0100 Subject: [PATCH] fix!: `InMemoryBM25Retriever` no longer returns documents that have a score of 0.0 (#6717) * fix!: `InMemoryBM25Retriever` no longer returns documents that have a score of 0.0 Also update tests to accommodate the new behavior. * Remove superfluous code --- .../document_stores/in_memory/document_store.py | 5 ++++- ...riever-zero-score-docs-67406062a76aa7f4.yaml | 3 +++ .../retrievers/test_in_memory_bm25_retriever.py | 7 +++---- test/document_stores/test_in_memory.py | 17 ++++++++--------- 4 files changed, 18 insertions(+), 14 deletions(-) create mode 100644 releasenotes/notes/inmemorybm25retriever-zero-score-docs-67406062a76aa7f4.yaml diff --git a/haystack/document_stores/in_memory/document_store.py b/haystack/document_stores/in_memory/document_store.py index 027e9c4bd..44b2f6d55 100644 --- a/haystack/document_stores/in_memory/document_store.py +++ b/haystack/document_stores/in_memory/document_store.py @@ -212,8 +212,11 @@ class InMemoryDocumentStore: return_documents = [] for i in top_docs_positions: doc = all_documents[i] + score = docs_scores[i] + if score <= 0.0: + continue doc_fields = doc.to_dict() - doc_fields["score"] = docs_scores[i] + doc_fields["score"] = score return_document = Document.from_dict(doc_fields) return_documents.append(return_document) return return_documents diff --git a/releasenotes/notes/inmemorybm25retriever-zero-score-docs-67406062a76aa7f4.yaml b/releasenotes/notes/inmemorybm25retriever-zero-score-docs-67406062a76aa7f4.yaml new file mode 100644 index 000000000..3ae440162 --- /dev/null +++ b/releasenotes/notes/inmemorybm25retriever-zero-score-docs-67406062a76aa7f4.yaml @@ -0,0 +1,3 @@ +--- +fixes: + - Prevent InMemoryBM25Retriever from returning documents with a score of 0.0 or lower. 
diff --git a/test/components/retrievers/test_in_memory_bm25_retriever.py b/test/components/retrievers/test_in_memory_bm25_retriever.py index db5e82a3f..4c1df2f22 100644 --- a/test/components/retrievers/test_in_memory_bm25_retriever.py +++ b/test/components/retrievers/test_in_memory_bm25_retriever.py @@ -113,15 +113,14 @@ class TestMemoryBM25Retriever: InMemoryBM25Retriever.from_dict(data) def test_retriever_valid_run(self, mock_docs): - top_k = 5 ds = InMemoryDocumentStore() ds.write_documents(mock_docs) - retriever = InMemoryBM25Retriever(ds, top_k=top_k) + retriever = InMemoryBM25Retriever(ds, top_k=5) result = retriever.run(query="PHP") assert "documents" in result - assert len(result["documents"]) == top_k + assert len(result["documents"]) == 1 assert result["documents"][0].content == "PHP is a popular programming language" def test_invalid_run_wrong_store_type(self): @@ -174,5 +173,5 @@ class TestMemoryBM25Retriever: assert "retriever" in result results_docs = result["retriever"]["documents"] assert results_docs - assert len(results_docs) == top_k + assert len(results_docs) == 1 assert results_docs[0].content == query_result diff --git a/test/document_stores/test_in_memory.py b/test/document_stores/test_in_memory.py index 1d3a36131..9ebcae415 100644 --- a/test/document_stores/test_in_memory.py +++ b/test/document_stores/test_in_memory.py @@ -5,8 +5,8 @@ import pandas as pd import pytest from haystack import Document -from haystack.document_stores.in_memory import InMemoryDocumentStore from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError +from haystack.document_stores.in_memory import InMemoryDocumentStore from haystack.testing.document_store import DocumentStoreBaseTests @@ -17,7 +17,7 @@ class TestMemoryDocumentStore(DocumentStoreBaseTests): # pylint: disable=R0904 @pytest.fixture def document_store(self) -> InMemoryDocumentStore: - return InMemoryDocumentStore() + return InMemoryDocumentStore(bm25_algorithm="BM25L") def 
test_to_dict(self): store = InMemoryDocumentStore() @@ -73,7 +73,6 @@ class TestMemoryDocumentStore(DocumentStoreBaseTests): # pylint: disable=R0904 document_store.write_documents(docs) def test_bm25_retrieval(self, document_store: InMemoryDocumentStore): - document_store = InMemoryDocumentStore() # Tests if the bm25_retrieval method returns the correct document based on the input query. docs = [Document(content="Hello world"), Document(content="Haystack supports multiple languages")] document_store.write_documents(docs) @@ -106,7 +105,7 @@ class TestMemoryDocumentStore(DocumentStoreBaseTests): # pylint: disable=R0904 document_store.write_documents(docs) # top_k = 2 - results = document_store.bm25_retrieval(query="languages", top_k=2) + results = document_store.bm25_retrieval(query="language", top_k=2) assert len(results) == 2 # top_k = 3 @@ -141,7 +140,7 @@ class TestMemoryDocumentStore(DocumentStoreBaseTests): # pylint: disable=R0904 document_store.write_documents(docs) results = document_store.bm25_retrieval(query="Python", top_k=1) - assert len(results) == 1 + assert len(results) == 0 document_store.write_documents([Document(content="Python is a popular programming language")]) results = document_store.bm25_retrieval(query="Python", top_k=1) @@ -199,10 +198,10 @@ class TestMemoryDocumentStore(DocumentStoreBaseTests): # pylint: disable=R0904 docs = [Document(), Document(content="Gardening"), Document(content="Bird watching")] document_store.write_documents(docs) results = document_store.bm25_retrieval(query="doesn't matter, top_k is 10", top_k=10) - assert len(results) == 2 + assert len(results) == 0 def test_bm25_retrieval_with_filters(self, document_store: InMemoryDocumentStore): - selected_document = Document(content="Gardening", meta={"selected": True}) + selected_document = Document(content="Java is, well...", meta={"selected": True}) docs = [Document(), selected_document, Document(content="Bird watching")] document_store.write_documents(docs) results = 
document_store.bm25_retrieval(query="Java", top_k=10, filters={"selected": True}) @@ -224,10 +223,10 @@ class TestMemoryDocumentStore(DocumentStoreBaseTests): # pylint: disable=R0904 assert results[0].id == document.id def test_bm25_retrieval_with_documents_with_mixed_content(self, document_store: InMemoryDocumentStore): - double_document = Document(content="Gardening", embedding=[1.0, 2.0, 3.0]) + double_document = Document(content="Gardening is a hobby", embedding=[1.0, 2.0, 3.0]) docs = [Document(embedding=[1.0, 2.0, 3.0]), double_document, Document(content="Bird watching")] document_store.write_documents(docs) - results = document_store.bm25_retrieval(query="Java", top_k=10, filters={"embedding": {"$not": None}}) + results = document_store.bm25_retrieval(query="Gardening", top_k=10, filters={"embedding": {"$not": None}}) assert len(results) == 1 assert results[0].id == double_document.id