test: mark more tests as slow (#9296)

* test: mark tests as slow

* alphabetical order; install xet

* revert pyproject

* Trigger Build

* simplify tests as suggested

* add comment to workflow
Stefano Fiorucci 2025-04-24 10:25:13 +02:00 committed by GitHub
parent df662daaef
commit e3d4e21237
15 changed files with 89 additions and 64 deletions
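
For context, pytest only honors a custom marker like `slow` once it is registered. Below is a minimal sketch of the usual setup; the exact haystack configuration may differ (this commit explicitly reverts its pyproject change):

```python
# conftest.py -- hypothetical marker registration; projects can equally declare
# markers in pyproject.toml under [tool.pytest.ini_options].
def pytest_configure(config):
    config.addinivalue_line("markers", "slow: test is slow to run")
    config.addinivalue_line("markers", "integration: test needs external resources")

# Typical invocations:
#   pytest -m "not slow"   # fast local feedback, skips the marked tests
#   pytest -m "slow"       # run only the slow suite, e.g. in a dedicated CI job
```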


@@ -42,6 +42,7 @@ jobs:
runs-on: ubuntu-latest
permissions:
pull-requests: read
# Specifying outputs is not needed to make the job work, but only to comply with actionlint
outputs:
changes: ${{ steps.changes.outputs.changes }}
steps:
@@ -54,9 +55,38 @@ jobs:
filters: |
changes:
- "haystack/components/audio/whisper_local.py"
- "haystack/components/classifiers/zero_shot_document_classifier.py"
- "haystack/components/converters/tika.py"
- "test/components/converters/test_tika_doc_converter.py"
- "haystack/components/embedders/hugging_face_api_document_embedder.py"
- "haystack/components/embedders/hugging_face_api_text_embedder.py"
- "haystack/components/embedders/sentence_transformers_text_embedder.py"
- "haystack/components/evaluators/sas_evaluator.py"
- "haystack/components/generators/chat/hugging_face_api.py"
- "haystack/components/generators/chat/hugging_face_local.py"
- "haystack/components/generators/hugging_face_api.py"
- "haystack/components/generators/hugging_face_local_generator.py"
- "haystack/components/rankers/sentence_transformers_diversity.py"
- "haystack/components/rankers/transformers_similarity.py"
- "haystack/components/readers/extractive.py"
- "haystack/components/routers/transformers_text_router.py"
- "haystack/components/routers/zero_shot_text_router.py"
- "test/components/audio/test_whisper_local.py"
- "test/components/classifiers/test_zero_shot_document_classifier.py"
- "test/components/converters/test_tika_doc_converter.py"
- "test/components/embedders/test_hugging_face_api_document_embedder.py"
- "test/components/embedders/test_hugging_face_api_text_embedder.py"
- "test/components/embedders/test_sentence_transformers_text_embedder.py"
- "test/components/evaluators/test_sas_evaluator.py"
- "test/components/generators/chat/test_hugging_face_api.py"
- "test/components/generators/chat/test_hugging_face_local.py"
- "test/components/generators/test_hugging_face_api.py"
- "test/components/generators/test_hugging_face_local_generator.py"
- "test/components/rankers/test_sentence_transformers_diversity.py"
- "test/components/rankers/test_transformers_similarity.py"
- "test/components/readers/test_extractive.py"
- "test/components/routers/test_transformers_text_router.py"
- "test/components/routers/test_zero_shot_text_router.py"
slow-integration-tests:
name: Slow Tests / ${{ matrix.os }}
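
The filter list above feeds a paths-filter step (presumably dorny/paths-filter, given the `filters:` syntax) whose `changes` output gates the slow job. A rough, hypothetical Python equivalent of the check it performs:

```python
# Illustrative stand-in only: the "changes" output becomes true when any file
# touched by the PR matches one of the watched paths.
from fnmatch import fnmatch

WATCHED_PATTERNS = [
    "haystack/components/audio/whisper_local.py",
    "test/components/audio/test_whisper_local.py",
    # ...the rest of the list from the workflow above
]

def changes_detected(changed_files: list[str]) -> bool:
    return any(
        fnmatch(path, pattern)
        for path in changed_files
        for pattern in WATCHED_PATTERNS
    )
```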

test/components/classifiers/test_zero_shot_document_classifier.py

@@ -137,6 +137,7 @@ class TestTransformersZeroShotDocumentClassifier:
assert result["documents"][1].to_dict()["classification"]["label"] == "negative"
@pytest.mark.integration
@pytest.mark.slow
def test_run(self, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
component = TransformersZeroShotDocumentClassifier(

test/components/embedders/test_hugging_face_api_document_embedder.py

@@ -371,6 +371,7 @@ class TestHuggingFaceAPIDocumentEmbedder:
@pytest.mark.flaky(reruns=5, reruns_delay=5)
@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.skipif(
not os.environ.get("HF_API_TOKEN", None),
reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
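
The live embedder tests stack several markers: retry flaky network calls, tag the test as integration and slow, and skip without a token. A condensed sketch of the pattern; the `requires_hf_token` helper is hypothetical (the tests above inline the same skipif), the markers themselves are real:

```python
import os

import pytest

# Hypothetical shared decorator equivalent to the inlined skipif above.
requires_hf_token = pytest.mark.skipif(
    not os.environ.get("HF_API_TOKEN", None),
    reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
)

@pytest.mark.flaky(reruns=5, reruns_delay=5)  # pytest-rerunfailures: retry transient API failures
@pytest.mark.integration
@pytest.mark.slow
@requires_hf_token
def test_live_embedding():
    ...  # calls the real Hugging Face Inference API
```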

test/components/embedders/test_hugging_face_api_text_embedder.py

@@ -215,6 +215,7 @@ class TestHuggingFaceAPITextEmbedder:
@pytest.mark.flaky(reruns=5, reruns_delay=5)
@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.skipif(
not os.environ.get("HF_API_TOKEN", None),
reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
@@ -238,6 +239,7 @@ class TestHuggingFaceAPITextEmbedderAsync:
@pytest.mark.integration
@pytest.mark.asyncio
@pytest.mark.slow
@pytest.mark.skipif(os.environ.get("HF_API_TOKEN", "") == "", reason="HF_API_TOKEN is not set")
async def test_run_async_with_real_api(self):
"""
@@ -289,6 +291,7 @@ class TestHuggingFaceAPITextEmbedderAsync:
@pytest.mark.integration
@pytest.mark.asyncio
@pytest.mark.slow
@pytest.mark.skipif(os.environ.get("HF_API_TOKEN", "") == "", reason="HF_API_TOKEN is not set")
async def test_run_async_concurrent_requests(self):
"""

test/components/embedders/test_sentence_transformers_text_embedder.py

@@ -267,6 +267,7 @@ class TestSentenceTransformersTextEmbedder:
embedder.run(text=list_integers_input)
@pytest.mark.integration
@pytest.mark.slow
def test_run_trunc(self, monkeypatch):
"""
sentence-transformers/paraphrase-albert-small-v2 maps sentences & paragraphs to a 768 dimensional dense vector space
@@ -289,6 +290,7 @@ class TestSentenceTransformersTextEmbedder:
assert len(embedding_trunc) == 128
@pytest.mark.integration
@pytest.mark.slow
def test_run_quantization(self):
"""
sentence-transformers/paraphrase-albert-small-v2 maps sentences & paragraphs to a 768 dimensional dense vector space
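
Both docstrings refer to this model's native 768-dimensional output, and the assertion above checks a truncated 128-dimensional result. A hedged sketch of the underlying mechanism in plain sentence-transformers, assuming its `truncate_dim` option (available in recent releases):

```python
from sentence_transformers import SentenceTransformer

# Assumption: a sentence-transformers version that supports truncate_dim.
model = SentenceTransformer(
    "sentence-transformers/paraphrase-albert-small-v2", truncate_dim=128
)
embedding = model.encode("The quick brown fox jumps over the lazy dog.")
assert len(embedding) == 128  # cropped from the native 768 dimensions
```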

test/components/evaluators/test_sas_evaluator.py

@@ -104,6 +104,7 @@ class TestSASEvaluator:
evaluator.run(ground_truth_answers=ground_truths, predicted_answers=predictions)
@pytest.mark.integration
@pytest.mark.slow
def test_run_with_matching_predictions(self, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
evaluator = SASEvaluator()
@@ -125,6 +126,7 @@ class TestSASEvaluator:
assert result["individual_scores"] == pytest.approx([1.0, 1.0, 1.0])
@pytest.mark.integration
@pytest.mark.slow
def test_run_with_single_prediction(self, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
evaluator = SASEvaluator()
@@ -139,6 +141,7 @@ class TestSASEvaluator:
assert result["individual_scores"] == pytest.approx([0.689089], abs=1e-5)
@pytest.mark.integration
@pytest.mark.slow
def test_run_with_mismatched_predictions(self, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
evaluator = SASEvaluator()
@@ -159,6 +162,7 @@ class TestSASEvaluator:
assert result["individual_scores"] == pytest.approx([0.689089, 0.870389, 0.908679], abs=1e-5)
@pytest.mark.integration
@pytest.mark.slow
def test_run_with_bi_encoder_model(self, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
evaluator = SASEvaluator(model="sentence-transformers/all-mpnet-base-v2")
@@ -179,6 +183,7 @@ class TestSASEvaluator:
assert result["individual_scores"] == pytest.approx([1.0, 1.0, 1.0])
@pytest.mark.integration
@pytest.mark.slow
def test_run_with_cross_encoder_model(self, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
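
Pieced together from the tests above, a minimal SASEvaluator usage sketch; the model name and output keys are taken from these tests, everything else is an assumption:

```python
from haystack.components.evaluators import SASEvaluator

evaluator = SASEvaluator(model="sentence-transformers/all-mpnet-base-v2")
evaluator.warm_up()
result = evaluator.run(
    ground_truth_answers=["Berlin is the capital of Germany."],
    predicted_answers=["Berlin is the capital of Germany."],
)
# Identical answers score ~1.0, per test_run_with_matching_predictions.
print(result["score"], result["individual_scores"])
```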

test/components/generators/chat/test_hugging_face_api.py

@@ -574,6 +574,7 @@ class TestHuggingFaceAPIChatGenerator:
}
@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.skipif(
not os.environ.get("HF_API_TOKEN", None),
reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
@@ -602,6 +603,7 @@ class TestHuggingFaceAPIChatGenerator:
assert "completion_tokens" in response["replies"][0].meta["usage"]
@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.skipif(
not os.environ.get("HF_API_TOKEN", None),
reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
@@ -635,6 +637,7 @@ class TestHuggingFaceAPIChatGenerator:
assert "completion_tokens" in response_meta["usage"]
@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.skipif(
not os.environ.get("HF_API_TOKEN", None),
reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
@@ -829,6 +832,7 @@ class TestHuggingFaceAPIChatGenerator:
}
@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.skipif(
not os.environ.get("HF_API_TOKEN", None),
reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
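
For orientation, a hedged sketch of the live chat call these tests make. The `usage` keys come from the assertions above; the constructor arguments follow haystack's HuggingFaceAPIChatGenerator as an assumption, and the model name is a placeholder:

```python
from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
from haystack.dataclasses import ChatMessage

generator = HuggingFaceAPIChatGenerator(
    api_type="serverless_inference_api",
    api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},  # placeholder model
)
response = generator.run(messages=[ChatMessage.from_user("Briefly, what is NLP?")])
reply = response["replies"][0]
print(reply.text)
print(reply.meta["usage"]["completion_tokens"])  # usage keys asserted in the tests
```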

test/components/generators/chat/test_hugging_face_local.py

@@ -345,6 +345,7 @@ class TestHuggingFaceLocalChatGenerator:
mock_convert.assert_any_call(messages[1])
@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.flaky(reruns=3, reruns_delay=10)
def test_live_run(self, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811

test/components/generators/test_hugging_face_api.py

@@ -291,6 +291,7 @@ class TestHuggingFaceAPIGenerator:
@pytest.mark.flaky(reruns=5, reruns_delay=5)
@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.skipif(
not os.environ.get("HF_API_TOKEN", None),
reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
@@ -322,6 +323,7 @@ class TestHuggingFaceAPIGenerator:
@pytest.mark.flaky(reruns=5, reruns_delay=5)
@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.skipif(
not os.environ.get("HF_API_TOKEN", None),
reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",

test/components/generators/test_hugging_face_local_generator.py

@@ -454,6 +454,7 @@ class TestHuggingFaceLocalGenerator:
assert criteria(generated_text_ids, scores=None) is True
@pytest.mark.integration
@pytest.mark.slow
def test_hf_pipeline_runs_with_our_criteria(self, monkeypatch):
"""Test that creating our own StopWordsCriteria and passing it to a Huggingface pipeline works."""
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
@@ -467,6 +468,7 @@ class TestHuggingFaceLocalGenerator:
@pytest.mark.integration
@pytest.mark.flaky(reruns=3, reruns_delay=10)
@pytest.mark.slow
def test_live_run(self, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
llm = HuggingFaceLocalGenerator(model="Qwen/Qwen2.5-0.5B-Instruct", generation_kwargs={"max_new_tokens": 50})
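
test_hf_pipeline_runs_with_our_criteria verifies that a custom stopping criterion plugs into a Hugging Face pipeline. StopWordsCriteria itself is haystack-internal; below is a hypothetical stand-in showing the general transformers pattern (model name reused from test_live_run above):

```python
from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList, pipeline

class StopOnWords(StoppingCriteria):
    """Stop generation once any stop word appears in the decoded output."""

    def __init__(self, tokenizer, stop_words):
        self.tokenizer = tokenizer
        self.stop_words = stop_words

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
        return any(word in text for word in self.stop_words)

model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
generator = pipeline("text-generation", model=model_name, tokenizer=tokenizer)
result = generator(
    "Count upwards: one, two,",
    max_new_tokens=30,
    stopping_criteria=StoppingCriteriaList([StopOnWords(tokenizer, ["five"])]),
)
print(result[0]["generated_text"])
```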

test/components/rankers/test_sentence_transformers_diversity.py

@@ -577,6 +577,7 @@ class TestSentenceTransformersDiversityRanker:
assert Pipeline.loads(pipe_serialized) == pipe
@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.parametrize("similarity", ["dot_product", "cosine"])
def test_run(self, similarity, monkeypatch):
"""
@@ -605,6 +606,7 @@ class TestSentenceTransformersDiversityRanker:
assert ranked_order == expected_order
@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.parametrize("similarity", ["dot_product", "cosine"])
def test_run_real_world_use_case(self, similarity, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
@@ -678,6 +680,7 @@ class TestSentenceTransformersDiversityRanker:
assert result_content == expected_content
@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.parametrize("similarity", ["dot_product", "cosine"])
def test_run_with_maximum_margin_relevance_strategy(self, similarity, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
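
The ranker tests are parametrized over two similarity functions; for reference, the arithmetic difference in a few lines:

```python
import numpy as np

def dot_product(a: np.ndarray, b: np.ndarray) -> float:
    return float(np.dot(a, b))

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    # Cosine is the dot product of L2-normalized vectors, so magnitude is ignored.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

a, b = np.array([1.0, 2.0]), np.array([2.0, 4.0])
print(dot_product(a, b))  # 10.0
print(cosine(a, b))       # 1.0 -- parallel vectors, regardless of length
```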

test/components/rankers/test_transformers_similarity.py

@@ -336,35 +336,19 @@ class TestSimilarityRanker:
assert ranker.device == ComponentDevice.from_multiple(DeviceMap.from_hf({"layer_1": 1, "classifier": "cpu"}))
@pytest.mark.integration
@pytest.mark.parametrize(
"query,docs_before_texts,expected_first_text,scores",
[
(
"City in Bosnia and Herzegovina",
["Berlin", "Belgrade", "Sarajevo"],
"Sarajevo",
[2.2864143829792738e-05, 0.00012495707778725773, 0.009869757108390331],
),
(
"Machine learning",
["Python", "Bakery in Paris", "Tesla Giga Berlin"],
"Python",
[1.9063229046878405e-05, 1.434577916370472e-05, 1.3049247172602918e-05],
),
(
"Cubist movement",
["Nirvana", "Pablo Picasso", "Coffee"],
"Pablo Picasso",
[1.3313065210240893e-05, 9.90335684036836e-05, 1.3518535524781328e-05],
),
],
)
def test_run(self, query, docs_before_texts, expected_first_text, scores):
@pytest.mark.slow
def test_run(self):
"""
Test if the component ranks documents correctly.
"""
ranker = TransformersSimilarityRanker(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
ranker.warm_up()
query = "City in Bosnia and Herzegovina"
docs_before_texts = ["Berlin", "Belgrade", "Sarajevo"]
expected_first_text = "Sarajevo"
expected_scores = [2.2864143829792738e-05, 0.00012495707778725773, 0.009869757108390331]
docs_before = [Document(content=text) for text in docs_before_texts]
output = ranker.run(query=query, documents=docs_before)
docs_after = output["documents"]
@@ -372,41 +356,25 @@ class TestSimilarityRanker:
assert len(docs_after) == 3
assert docs_after[0].content == expected_first_text
sorted_scores = sorted(scores, reverse=True)
sorted_scores = sorted(expected_scores, reverse=True)
assert docs_after[0].score == pytest.approx(sorted_scores[0], abs=1e-6)
assert docs_after[1].score == pytest.approx(sorted_scores[1], abs=1e-6)
assert docs_after[2].score == pytest.approx(sorted_scores[2], abs=1e-6)
@pytest.mark.integration
@pytest.mark.parametrize(
"query,docs_before_texts,expected_first_text,scores",
[
(
"City in Bosnia and Herzegovina",
["Berlin", "Belgrade", "Sarajevo"],
"Sarajevo",
[2.2864143829792738e-05, 0.00012495707778725773, 0.009869757108390331],
),
(
"Machine learning",
["Python", "Bakery in Paris", "Tesla Giga Berlin"],
"Python",
[1.9063229046878405e-05, 1.434577916370472e-05, 1.3049247172602918e-05],
),
(
"Cubist movement",
["Nirvana", "Pablo Picasso", "Coffee"],
"Pablo Picasso",
[1.3313065210240893e-05, 9.90335684036836e-05, 1.3518535524781328e-05],
),
],
)
def test_run_small_batch_size(self, query, docs_before_texts, expected_first_text, scores):
@pytest.mark.slow
def test_run_small_batch_size(self):
"""
Test if the component ranks documents correctly.
"""
ranker = TransformersSimilarityRanker(model="cross-encoder/ms-marco-MiniLM-L-6-v2", batch_size=2)
ranker.warm_up()
query = "City in Bosnia and Herzegovina"
docs_before_texts = ["Berlin", "Belgrade", "Sarajevo"]
expected_first_text = "Sarajevo"
expected_scores = [2.2864143829792738e-05, 0.00012495707778725773, 0.009869757108390331]
docs_before = [Document(content=text) for text in docs_before_texts]
output = ranker.run(query=query, documents=docs_before)
docs_after = output["documents"]
@@ -414,41 +382,37 @@ class TestSimilarityRanker:
assert len(docs_after) == 3
assert docs_after[0].content == expected_first_text
sorted_scores = sorted(scores, reverse=True)
sorted_scores = sorted(expected_scores, reverse=True)
assert docs_after[0].score == pytest.approx(sorted_scores[0], abs=1e-6)
assert docs_after[1].score == pytest.approx(sorted_scores[1], abs=1e-6)
assert docs_after[2].score == pytest.approx(sorted_scores[2], abs=1e-6)
# Returns an empty list if no documents are provided
@pytest.mark.integration
def test_returns_empty_list_if_no_documents_are_provided(self):
sampler = TransformersSimilarityRanker()
sampler.warm_up()
sampler.model = MagicMock()
output = sampler.run(query="City in Germany", documents=[])
assert not output["documents"]
# Raises ComponentError if model is not warmed up
@pytest.mark.integration
def test_raises_component_error_if_model_not_warmed_up(self):
sampler = TransformersSimilarityRanker()
with pytest.raises(RuntimeError):
sampler.run(query="query", documents=[Document(content="document")])
@pytest.mark.integration
@pytest.mark.parametrize(
"query,docs_before_texts,expected_first_text",
[
("City in Bosnia and Herzegovina", ["Berlin", "Belgrade", "Sarajevo"], "Sarajevo"),
("Machine learning", ["Python", "Bakery in Paris", "Tesla Giga Berlin"], "Python"),
("Cubist movement", ["Nirvana", "Pablo Picasso", "Coffee"], "Pablo Picasso"),
],
)
def test_run_top_k(self, query, docs_before_texts, expected_first_text):
@pytest.mark.slow
def test_run_top_k(self):
"""
Test if the component ranks documents correctly with a custom top_k.
"""
ranker = TransformersSimilarityRanker(model="cross-encoder/ms-marco-MiniLM-L-6-v2", top_k=2)
ranker.warm_up()
query = "City in Bosnia and Herzegovina"
docs_before_texts = ["Berlin", "Belgrade", "Sarajevo"]
expected_first_text = "Sarajevo"
docs_before = [Document(content=text) for text in docs_before_texts]
output = ranker.run(query=query, documents=docs_before)
docs_after = output["documents"]
@@ -460,6 +424,7 @@ class TestSimilarityRanker:
assert [doc.score for doc in docs_after] == sorted_scores
@pytest.mark.integration
@pytest.mark.slow
def test_run_single_document(self):
"""
Test if the component runs with a single document.

test/components/readers/test_extractive.py

@@ -776,6 +776,7 @@ class TestDeduplication:
@pytest.mark.integration
@pytest.mark.slow
def test_t5(monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
reader = ExtractiveReader("sjrhuschlee/flan-t5-base-squad2")
@@ -801,6 +802,7 @@ def test_t5(monkeypatch):
@pytest.mark.integration
@pytest.mark.slow
def test_roberta(monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
reader = ExtractiveReader("deepset/tinyroberta-squad2")
@@ -831,6 +833,7 @@ def test_roberta(monkeypatch):
@pytest.mark.integration
@pytest.mark.slow
def test_matches_hf_pipeline(monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
reader = ExtractiveReader(
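
For orientation, a hedged usage sketch of the reader under test; the model name comes from test_roberta above, while the run() signature and output shape are assumptions about haystack's ExtractiveReader:

```python
from haystack import Document
from haystack.components.readers import ExtractiveReader

reader = ExtractiveReader("deepset/tinyroberta-squad2")
reader.warm_up()
result = reader.run(
    query="Who lives in Berlin?",
    documents=[Document(content="My name is Carla and I live in Berlin.")],
    top_k=1,
)
print(result["answers"][0].data)  # the extracted span, e.g. "Carla"
```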

test/components/routers/test_transformers_text_router.py

@@ -172,6 +172,7 @@ class TestTransformersTextRouter:
assert out == {"en": "What is the color of the sky?"}
@pytest.mark.integration
@pytest.mark.slow
def test_run(self, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
router = TransformersTextRouter(model="papluca/xlm-roberta-base-language-detection")
@@ -203,6 +204,7 @@ class TestTransformersTextRouter:
assert out == {"en": "What is the color of the sky?"}
@pytest.mark.integration
@pytest.mark.slow
def test_wrong_labels(self, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
router = TransformersTextRouter(model="papluca/xlm-roberta-base-language-detection", labels=["en", "de"])
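
Condensed from the assertions above: the router emits the input text on the output edge named after the predicted label.

```python
from haystack.components.routers import TransformersTextRouter

router = TransformersTextRouter(model="papluca/xlm-roberta-base-language-detection")
router.warm_up()
out = router.run(text="What is the color of the sky?")
assert out == {"en": "What is the color of the sky?"}  # routed to the "en" edge
```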

test/components/routers/test_zero_shot_text_router.py

@@ -106,6 +106,7 @@ class TestTransformersZeroShotTextRouter:
assert out == {"query": "What is the color of the sky?"}
@pytest.mark.integration
@pytest.mark.slow
def test_run(self, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
router = TransformersZeroShotTextRouter(labels=["query", "passage"])
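
The zero-shot variant needs no fine-tuned label set; the candidate labels themselves define the output edges (condensed from the test above):

```python
from haystack.components.routers import TransformersZeroShotTextRouter

router = TransformersZeroShotTextRouter(labels=["query", "passage"])
router.warm_up()
out = router.run(text="What is the color of the sky?")
assert out == {"query": "What is the color of the sky?"}
```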