
* Rework Document serialisation
* Make Document backward compatible
* Fix InMemoryDocumentStore filters
* Fix InMemoryDocumentStore.bm25_retrieval
* Add release notes
* Fix pylint failures
* Enhance Document kwargs handling and docstrings
* Rename Document's text field to content
* Fix e2e tests
* Fix SimilarityRanker tests
* Fix typo in release notes
* Rename Document's metadata field to meta (#6183)
* fix bugs
* make linters happy
* fix
* more fix
* match regex

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
90 lines
3.1 KiB
Python
import random

import pytest

from haystack.preview import Document, ComponentError
from haystack.preview.components.samplers.top_p import TopPSampler


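# TopPSampler performs top-p (nucleus) sampling over document scores: it keeps
# the smallest set of top-scoring documents whose cumulative probability mass
# (after normalising the scores) stays within ``top_p``, and always returns at
# least one document. Scores come from ``Document.score`` or, when
# ``score_field`` is set, from ``Document.meta[score_field]``; a missing field
# raises ``ComponentError``. The cutoff rule described here is inferred from
# the behaviour asserted in the tests below and may differ in detail from the
# component's actual implementation.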
class TestTopPSampler:
    @pytest.mark.unit
    def test_run_scores_from_metadata(self):
        """
        Test if the component runs correctly with scores already in the metadata.
        """
        sampler = TopPSampler(top_p=0.95, score_field="similarity_score")
        docs = [
            Document(content="Berlin", meta={"similarity_score": -10.6}),
            Document(content="Belgrade", meta={"similarity_score": -8.9}),
            Document(content="Sarajevo", meta={"similarity_score": -4.6}),
        ]
        output = sampler.run(documents=docs)
        docs = output["documents"]
        assert len(docs) == 1
        assert docs[0].content == "Sarajevo"

    @pytest.mark.unit
    def test_run_scores(self):
        """
        Test if the component runs correctly with scores in the Document score field.
        """
        sampler = TopPSampler(top_p=0.99)
        docs = [
            Document(content="Berlin", score=-10.6),
            Document(content="Belgrade", score=-8.9),
            Document(content="Sarajevo", score=-4.6),
        ]

        random.shuffle(docs)
        sorted_scores = sorted([doc.score for doc in docs], reverse=True)

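        # Rough arithmetic behind the expected result, assuming the sampler
        # softmax-normalises the raw scores: softmax([-10.6, -8.9, -4.6]) is
        # roughly [0.002, 0.013, 0.984], so Sarajevo alone carries ~0.98 of the
        # probability mass and is the only document that fits under
        # top_p = 0.99 (an inference from the assertions below, not a
        # description of the component's internals).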
        # top_p = 0.99 will get the top 1 document
        output = sampler.run(documents=docs)
        docs_filtered = output["documents"]
        assert len(docs_filtered) == 1
        assert docs_filtered[0].content == "Sarajevo"

        assert [doc.score for doc in docs_filtered] == sorted_scores[:1]

    @pytest.mark.unit
    def test_run_scores_top_p_1(self):
        """
        Test if the component runs correctly with top_p=1.
        """
        sampler = TopPSampler(top_p=1.0)
        docs = [
            Document(content="Berlin", score=-10.6),
            Document(content="Belgrade", score=-8.9),
            Document(content="Sarajevo", score=-4.6),
        ]

        random.shuffle(docs)
        output = sampler.run(documents=docs)
        docs_filtered = output["documents"]
        assert len(docs_filtered) == len(docs)
        assert docs_filtered[0].content == "Sarajevo"

        assert [doc.score for doc in docs_filtered] == sorted([doc.score for doc in docs], reverse=True)

    # Returns an empty list if no documents are provided
    @pytest.mark.unit
    def test_returns_empty_list_if_no_documents_are_provided(self):
        sampler = TopPSampler()
        output = sampler.run(documents=[])
        assert output["documents"] == []

    @pytest.mark.unit
    def test_run_scores_no_metadata_present(self):
        """
        Test if the component runs correctly with scores missing from the metadata yet being specified in the
        score_field.
        """
        sampler = TopPSampler(top_p=0.95, score_field="similarity_score")
        docs = [
            Document(content="Berlin", score=-10.6),
            Document(content="Belgrade", score=-8.9),
            Document(content="Sarajevo", score=-4.6),
        ]
        with pytest.raises(ComponentError, match="Score field 'similarity_score' not found"):
            sampler.run(documents=docs)