from unittest.mock import MagicMock, patch

import pytest
import torch

from haystack import Document, ComponentError
from haystack.components.rankers.transformers_similarity import TransformersSimilarityRanker


class TestSimilarityRanker:
    def test_to_dict(self):
        """
        Test serialization of a component created with default init parameters.
        """
        component = TransformersSimilarityRanker()
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker",
            "init_parameters": {
                "device": "cpu",
                "top_k": 10,
                "token": None,
                "model_name_or_path": "cross-encoder/ms-marco-MiniLM-L-6-v2",
                "meta_fields_to_embed": [],
                "embedding_separator": "\n",
            },
        }

    def test_to_dict_with_custom_init_parameters(self):
        """
        Test serialization of a component created with custom init parameters.
        """
        component = TransformersSimilarityRanker(
            model_name_or_path="my_model", device="cuda", token="my_token", top_k=5
        )
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker",
            "init_parameters": {
                "device": "cuda",
                "model_name_or_path": "my_model",
                "token": None,  # we don't serialize valid tokens
                "top_k": 5,
                "meta_fields_to_embed": [],
                "embedding_separator": "\n",
            },
        }

    @patch("torch.sort")
    def test_embed_meta(self, mocked_sort):
        """
        Test that the selected meta fields are joined with the document content before scoring.
        """
        mocked_sort.return_value = (None, torch.tensor([0]))
        ranker = TransformersSimilarityRanker(
            model_name_or_path="model", meta_fields_to_embed=["meta_field"], embedding_separator="\n"
        )
        ranker.model = MagicMock()
        ranker.tokenizer = MagicMock()

        documents = [Document(content=f"document number {i}", meta={"meta_field": f"meta_value {i}"}) for i in range(5)]

        ranker.run(query="test", documents=documents)

        ranker.tokenizer.assert_called_once_with(
            [
                ["test", "meta_value 0\ndocument number 0"],
                ["test", "meta_value 1\ndocument number 1"],
                ["test", "meta_value 2\ndocument number 2"],
                ["test", "meta_value 3\ndocument number 3"],
                ["test", "meta_value 4\ndocument number 4"],
            ],
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

    @pytest.mark.integration
    @pytest.mark.parametrize(
        "query,docs_before_texts,expected_first_text",
        [
            ("City in Bosnia and Herzegovina", ["Berlin", "Belgrade", "Sarajevo"], "Sarajevo"),
            ("Machine learning", ["Python", "Bakery in Paris", "Tesla Giga Berlin"], "Python"),
            ("Cubist movement", ["Nirvana", "Pablo Picasso", "Coffee"], "Pablo Picasso"),
        ],
    )
    def test_run(self, query, docs_before_texts, expected_first_text):
        """
        Test if the component ranks documents correctly.
        """
        ranker = TransformersSimilarityRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2")
        ranker.warm_up()
        docs_before = [Document(content=text) for text in docs_before_texts]
        output = ranker.run(query=query, documents=docs_before)
        docs_after = output["documents"]

        assert len(docs_after) == 3
        assert docs_after[0].content == expected_first_text

        sorted_scores = sorted([doc.score for doc in docs_after], reverse=True)
        assert [doc.score for doc in docs_after] == sorted_scores

    # Returns an empty list if no documents are provided
    @pytest.mark.integration
    def test_returns_empty_list_if_no_documents_are_provided(self):
        ranker = TransformersSimilarityRanker()
        ranker.warm_up()
        output = ranker.run(query="City in Germany", documents=[])
        assert not output["documents"]

    # Raises ComponentError if model is not warmed up
    @pytest.mark.integration
    def test_raises_component_error_if_model_not_warmed_up(self):
        ranker = TransformersSimilarityRanker()

        with pytest.raises(ComponentError):
            ranker.run(query="query", documents=[Document(content="document")])

    @pytest.mark.integration
    @pytest.mark.parametrize(
        "query,docs_before_texts,expected_first_text",
        [
            ("City in Bosnia and Herzegovina", ["Berlin", "Belgrade", "Sarajevo"], "Sarajevo"),
            ("Machine learning", ["Python", "Bakery in Paris", "Tesla Giga Berlin"], "Python"),
            ("Cubist movement", ["Nirvana", "Pablo Picasso", "Coffee"], "Pablo Picasso"),
        ],
    )
    def test_run_top_k(self, query, docs_before_texts, expected_first_text):
        """
        Test if the component ranks documents correctly with a custom top_k.
        """
        ranker = TransformersSimilarityRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2", top_k=2)
        ranker.warm_up()
        docs_before = [Document(content=text) for text in docs_before_texts]
        output = ranker.run(query=query, documents=docs_before)
        docs_after = output["documents"]

        assert len(docs_after) == 2
        assert docs_after[0].content == expected_first_text

        sorted_scores = sorted([doc.score for doc in docs_after], reverse=True)
        assert [doc.score for doc in docs_after] == sorted_scores

    @pytest.mark.integration
    def test_run_single_document(self):
        """
        Test if the component runs with a single document.
        """
        ranker = TransformersSimilarityRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2")
        ranker.warm_up()
        docs_before = [Document(content="Berlin")]
        output = ranker.run(query="City in Germany", documents=docs_before)
        docs_after = output["documents"]

        assert len(docs_after) == 1
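
    # Additional integration test sketch (not part of the original suite). It assumes, based on
    # test_run above where the default top_k of 10 still returns only the three provided documents,
    # that run() never returns more documents than it was given. The query and document texts
    # reuse values already exercised by test_run.
    @pytest.mark.integration
    def test_run_returns_at_most_len_documents(self):
        ranker = TransformersSimilarityRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2", top_k=10)
        ranker.warm_up()
        docs_before = [Document(content=text) for text in ["Berlin", "Sarajevo"]]
        output = ranker.run(query="City in Bosnia and Herzegovina", documents=docs_before)
        docs_after = output["documents"]

        # top_k (10) exceeds the number of documents (2), so only the two documents come back
        assert len(docs_after) == 2
        # scores are expected in descending order, matching the behaviour asserted in test_run
        sorted_scores = sorted([doc.score for doc in docs_after], reverse=True)
        assert [doc.score for doc in docs_after] == sorted_scores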