test: move several modeling tests in e2e/ (#4308)
* no dpr test seems worth mocking
* move distillation tests
* pylint
* mypy
* pylint
* move feature_extraction tests as well
* move feature_extraction tests as well
* merge feature extractor suites
* get_language_model tests and adaptive model tests
* duplicate test
* moving fixtures
* mypy
* mypy-again
* trigger
* un-mock integration test
* review feedback
* feedback
* pylint
This commit is contained in:
parent 5a17a40685
commit b60d9a2cbf
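The moved suites now live under e2e/modeling/, while the mocked unit tests keep their pytest markers under test/. As a rough sketch of how the two groups might be selected locally (the paths and marker names are taken from this diff, not from project documentation):

    # mocked unit tests that stay in the test/ tree
    pytest -m unit test/modeling

    # heavier, un-mocked suites moved by this commit
    pytest e2e/modeling

pytest's -m flag filters by marker, so the unit run stays fast while the e2e run loads real models.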
@@ -1,15 +1,12 @@
import os
import uuid
from contextlib import contextmanager
import random
from pathlib import Path

import torch
import numpy as np

import pytest

from haystack.schema import Document
from haystack.modeling.utils import set_all_seeds
from haystack.document_stores import (
    InMemoryDocumentStore,
    ElasticsearchDocumentStore,
@@ -21,10 +18,7 @@ from haystack.document_stores import (
)


# Fix all random seeds that come to mind
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)
set_all_seeds(0)


@pytest.fixture
e2e/modeling/__init__.py (new file, 0 lines)
@@ -1,9 +1,10 @@
import pytest

from haystack.modeling.infer import Inferencer


@pytest.mark.parametrize("multiprocessing_chunksize", [None, 2])
@pytest.mark.parametrize("num_processes", [2, 0, None], scope="module")
def test_qa_format_and_results(adaptive_model_qa, multiprocessing_chunksize):
def test_qa_format_and_results(multiprocessing_chunksize):
    qa_inputs_dicts = [
        {
            "questions": ["In what country is Normandy"],
@@ -22,9 +23,13 @@ def test_qa_format_and_results(adaptive_model_qa, multiprocessing_chunksize):
    ]
    ground_truths = ["France", "GameTrailers"]

    adaptive_model_qa = Inferencer.load(
        "deepset/bert-medium-squad2-distilled", task_type="question_answering", batch_size=16, gpu=False
    )
    results = adaptive_model_qa.inference_from_dicts(
        dicts=qa_inputs_dicts, multiprocessing_chunksize=multiprocessing_chunksize
    )

    # sample results
    # [
    #     {
@@ -69,7 +74,3 @@ def test_qa_format_and_results(adaptive_model_qa, multiprocessing_chunksize):
        "offset_context_end",
        "document_id",
    } == answer.keys()


if __name__ == "__main__":
    test_qa_format_and_results()
@@ -1,7 +1,7 @@
from pathlib import Path
import torch

from haystack.nodes import FARMReader
from haystack.modeling.data_handler.processor import UnlabeledTextProcessor
import torch


def create_checkpoint(model):
@@ -1,7 +1,4 @@
import os
from typing import Tuple

import logging
from typing import Dict, Any
from pathlib import Path

import numpy as np
@@ -9,7 +6,7 @@ import pytest
import torch
from torch.utils.data import SequentialSampler
from tqdm import tqdm
from transformers import DPRQuestionEncoder, AutoTokenizer
from transformers import AutoTokenizer

from haystack.modeling.data_handler.dataloader import NamedDataLoader
from haystack.modeling.data_handler.processor import TextSimilarityProcessor
@@ -17,19 +14,10 @@ from haystack.modeling.model.biadaptive_model import BiAdaptiveModel
from haystack.modeling.model.language_model import get_language_model, DPREncoder
from haystack.modeling.model.prediction_head import TextSimilarityHead

from haystack.nodes.retriever.dense import DensePassageRetriever

from haystack.modeling.utils import set_all_seeds, initialize_device_settings
from haystack.utils.early_stopping import EarlyStopping
from haystack.modeling.utils import initialize_device_settings


def test_dpr_modules(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    devices, n_gpu = initialize_device_settings(use_cuda=True)

def test_dpr_modules():
    # 1.Create question and passage tokenizers
    query_tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path="facebook/dpr-question_encoder-single-nq-base", do_lower_case=True, use_fast=True
@@ -66,6 +54,7 @@ def test_dpr_modules(caplog=None):

    prediction_head = TextSimilarityHead(similarity_function="dot_product")

    devices, _ = initialize_device_settings(use_cuda=True)
    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
@@ -110,7 +99,9 @@ def test_dpr_modules(caplog=None):
        ],
    }

    dataset, tensor_names, _ = processor.dataset_from_dicts(dicts=[d], return_baskets=False)
    dataset, tensor_names, _ = processor.dataset_from_dicts(  # pylint: disable=unbalanced-tuple-unpacking
        dicts=[d], return_baskets=False
    )
    features = {key: val.unsqueeze(0).to(devices[0]) for key, val in zip(tensor_names, dataset[0])}

    # test features
@@ -322,6 +313,13 @@ def test_dpr_processor(embed_title, passage_ids, passage_attns, use_fast, num_ha
                    "label": "hard_negative",
                    "external_id": "3643705",
                },
                # Empty title
                {
                    "title": "",
                    "text": "Director Radio Iași); Dragoș-Liviu Vîlceanu; Mihnea-Adrian Vîlceanu; Nathalie-Teona",
                    "label": "positive",
                    "external_id": "b21eaeff-e08b-4548-b5e0-a280f6f4efef",
                },
            ],
        },
        {
@@ -382,7 +380,7 @@ def test_dpr_processor(embed_title, passage_ids, passage_attns, use_fast, num_ha
    )

    for i, d in enumerate(dict):
        dataset, tensor_names, _, baskets = processor.dataset_from_dicts(dicts=[d], return_baskets=True)
        __, ___, _, baskets = processor.dataset_from_dicts(dicts=[d], return_baskets=True)
        feat = baskets[0].samples[0].features
        assert torch.all(torch.eq(torch.tensor(feat[0]["query_input_ids"][:10]), query_input_ids[i]))
        assert len(torch.tensor(feat[0]["query_segment_ids"]).nonzero()) == 0
@@ -404,42 +402,6 @@ def test_dpr_processor(embed_title, passage_ids, passage_attns, use_fast, num_ha
        assert len(torch.tensor(feat[0]["passage_segment_ids"]).nonzero()) == 0


@pytest.mark.parametrize("use_fast", [False])
@pytest.mark.parametrize("embed_title", [True, False])
def test_dpr_processor_empty_title(use_fast, embed_title):
    dict = {
        "query": "what is a cat?",
        "passages": [
            {
                "title": "",
                "text": "Director Radio Iași); Dragoș-Liviu Vîlceanu; Mihnea-Adrian Vîlceanu; Nathalie-Teona",
                "label": "positive",
                "external_id": "b21eaeff-e08b-4548-b5e0-a280f6f4efef",
            }
        ],
    }

    query_tok = "facebook/dpr-question_encoder-single-nq-base"
    query_tokenizer = AutoTokenizer.from_pretrained(query_tok, use_fast=use_fast)
    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
    passage_tokenizer = AutoTokenizer.from_pretrained(passage_tok, use_fast=use_fast)
    processor = TextSimilarityProcessor(
        query_tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=256,
        max_seq_len_passage=256,
        data_dir="data/retriever",
        train_filename="nq-train.json",
        test_filename="nq-dev.json",
        embed_title=embed_title,
        num_hard_negatives=1,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        shuffle_negatives=False,
    )
    _ = processor.dataset_from_dicts(dicts=[dict])


def test_dpr_problematic():
    erroneous_dicts = [
        {
@@ -523,9 +485,7 @@ def test_dpr_problematic():
        shuffle_negatives=False,
    )

    dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
        dicts=erroneous_dicts, return_baskets=True
    )
    _, __, problematic_ids, ___ = processor.dataset_from_dicts(dicts=erroneous_dicts, return_baskets=True)
    assert problematic_ids == {0, 1}


@@ -554,9 +514,7 @@ def test_dpr_query_only():
        shuffle_negatives=False,
    )

    dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
        dicts=erroneous_dicts, return_baskets=True
    )
    _, tensor_names, problematic_ids, __ = processor.dataset_from_dicts(dicts=erroneous_dicts, return_baskets=True)
    assert len(problematic_ids) == 0
    assert tensor_names == ["query_input_ids", "query_segment_ids", "query_attention_mask"]

@@ -616,9 +574,7 @@ def test_dpr_context_only():
        shuffle_negatives=False,
    )

    dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
        dicts=erroneous_dicts, return_baskets=True
    )
    _, tensor_names, problematic_ids, __ = processor.dataset_from_dicts(dicts=erroneous_dicts, return_baskets=True)
    assert len(problematic_ids) == 0
    assert tensor_names == ["passage_input_ids", "passage_segment_ids", "passage_attention_mask", "label_ids"]

@@ -668,9 +624,11 @@ def test_dpr_processor_save_load(tmp_path):
    )
    save_dir = f"{tmp_path}/testsave/dpr_processor"
    processor.save(save_dir=save_dir)
    dataset, tensor_names, _ = processor.dataset_from_dicts(dicts=[d], return_baskets=False)
    dataset, __, _ = processor.dataset_from_dicts(  # pylint: disable=unbalanced-tuple-unpacking
        dicts=[d], return_baskets=False
    )
    loadedprocessor = TextSimilarityProcessor.load_from_dir(load_dir=save_dir)
    dataset2, tensor_names, _ = loadedprocessor.dataset_from_dicts(dicts=[d], return_baskets=False)
    dataset2, __, _ = loadedprocessor.dataset_from_dicts(dicts=[d], return_baskets=False)
    assert np.array_equal(dataset.tensors[0], dataset2.tensors[0])


@@ -688,7 +646,7 @@ def test_dpr_processor_save_load(tmp_path):
        {"query": "facebook/dpr-question_encoder-single-nq-base", "passage": "facebook/dpr-ctx_encoder-single-nq-base"},
    ],
)
def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_passage_model: Tuple[str, str]):
def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_passage_model: Dict[str, str]):
    """
    This test compares 1) a model that was loaded from model hub with
    2) a model from model hub that was saved to disk and then loaded from disk and
@@ -809,22 +767,26 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa
    loaded_model.connect_heads_with_processor(loaded_processor.tasks, require_labels=False)

    # compare model loaded from model hub with model loaded from disk
    dataset, tensor_names, _ = processor.dataset_from_dicts(dicts=[d], return_baskets=False)
    dataset2, tensor_names2, _ = loaded_processor.dataset_from_dicts(dicts=[d], return_baskets=False)
    dataset, tensor_names, _ = processor.dataset_from_dicts(  # pylint: disable=unbalanced-tuple-unpacking
        dicts=[d], return_baskets=False
    )
    dataset2, tensor_names2, _ = loaded_processor.dataset_from_dicts(  # pylint: disable=unbalanced-tuple-unpacking
        dicts=[d], return_baskets=False
    )
    assert np.array_equal(dataset.tensors[0], dataset2.tensors[0])

    # generate embeddings with model loaded from model hub
    dataset, tensor_names, _, baskets = processor.dataset_from_dicts(
    dataset, tensor_names, _, __ = processor.dataset_from_dicts(
        dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
    )

    data_loader = NamedDataLoader(
        dataset=dataset, sampler=SequentialSampler(dataset), batch_size=16, tensor_names=tensor_names
    )
    all_embeddings = {"query": [], "passages": []}
    all_embeddings: Dict[str, Any] = {"query": [], "passages": []}
    model.eval()

    for batch in tqdm(data_loader, desc=f"Creating Embeddings", unit=" Batches", disable=True):
    for batch in tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True):
        batch = {key: batch[key].to(device) for key in batch}

        # get logits
@@ -848,17 +810,17 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa
    all_embeddings["query"] = np.concatenate(all_embeddings["query"])

    # generate embeddings with model loaded from disk
    dataset2, tensor_names2, _, baskets2 = loaded_processor.dataset_from_dicts(
    dataset2, tensor_names2, _, __ = loaded_processor.dataset_from_dicts(
        dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
    )

    data_loader = NamedDataLoader(
        dataset=dataset2, sampler=SequentialSampler(dataset2), batch_size=16, tensor_names=tensor_names2
    )
    all_embeddings2 = {"query": [], "passages": []}
    all_embeddings2: Dict[str, Any] = {"query": [], "passages": []}
    loaded_model.eval()

    for i, batch in enumerate(tqdm(data_loader, desc=f"Creating Embeddings", unit=" Batches", disable=True)):
    for i, batch in enumerate(tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True)):
        batch = {key: batch[key].to(device) for key in batch}

        # get logits
@@ -932,22 +894,26 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa

    # compare a model loaded from disk that originated from the model hub and was then saved disk with
    # a model loaded from disk that also originated from a FARM style model that was saved to disk
    dataset3, tensor_names3, _ = processor.dataset_from_dicts(dicts=[d], return_baskets=False)
    dataset2, tensor_names2, _ = loaded_processor.dataset_from_dicts(dicts=[d], return_baskets=False)
    dataset3, tensor_names3, _ = processor.dataset_from_dicts(  # pylint: disable=unbalanced-tuple-unpacking
        dicts=[d], return_baskets=False
    )
    dataset2, tensor_names2, _ = loaded_processor.dataset_from_dicts(  # pylint: disable=unbalanced-tuple-unpacking
        dicts=[d], return_baskets=False
    )
    assert np.array_equal(dataset3.tensors[0], dataset2.tensors[0])

    # generate embeddings with model loaded from disk that originated from a FARM style model that was saved to disk earlier
    dataset3, tensor_names3, _, baskets3 = loaded_processor.dataset_from_dicts(
    dataset3, tensor_names3, _, __ = loaded_processor.dataset_from_dicts(
        dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
    )

    data_loader = NamedDataLoader(
        dataset=dataset3, sampler=SequentialSampler(dataset3), batch_size=16, tensor_names=tensor_names3
    )
    all_embeddings3 = {"query": [], "passages": []}
    all_embeddings3: Dict[str, Any] = {"query": [], "passages": []}
    loaded_model.eval()

    for i, batch in enumerate(tqdm(data_loader, desc=f"Creating Embeddings", unit=" Batches", disable=True)):
    for i, batch in enumerate(tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True)):
        batch = {key: batch[key].to(device) for key in batch}

        # get logits
@@ -973,170 +939,3 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa
    # compare embeddings of model loaded from model hub and model loaded from disk that originated from a FARM style
    # model that was saved to disk earlier
    assert np.array_equal(all_embeddings["query"][0], all_embeddings3["query"][0])


@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
def test_dpr_training(document_store, tmp_path, samples_path):
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=8,
        max_seq_len_passage=8,
    )

    save_dir = f"{tmp_path}/test_dpr_training"
    retriever.train(
        data_dir=str(samples_path / "dpr"),
        train_filename="sample.json",
        dev_filename="sample.json",
        test_filename="sample.json",
        n_epochs=1,
        batch_size=1,
        grad_acc_steps=1,
        save_dir=save_dir,
        evaluate_every=10,
        embed_title=True,
        num_positives=1,
        num_hard_negatives=1,
    )


@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
def test_dpr_training_with_earlystopping(document_store, tmp_path, samples_path):
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=8,
        max_seq_len_passage=8,
    )

    save_dir = f"{tmp_path}/test_dpr_training"
    retriever.train(
        data_dir=str(samples_path / "dpr"),
        train_filename="sample.json",
        dev_filename="sample.json",
        test_filename="sample.json",
        n_epochs=1,
        batch_size=1,
        grad_acc_steps=1,
        save_dir=save_dir,
        evaluate_every=1,
        embed_title=True,
        num_positives=1,
        num_hard_negatives=1,
        early_stopping=EarlyStopping(save_dir=save_dir),
    )


# TODO fix CI errors (test pass locally or on AWS, next steps: isolate PyTorch versions once FARM dependency is removed)
# def test_dpr_training():
#     batch_size = 1
#     n_epochs = 1
#     distributed = False  # enable for multi GPU training via DDP
#     evaluate_every = 1
#     question_lang_model = "microsoft/MiniLM-L12-H384-uncased"
#     passage_lang_model = "microsoft/MiniLM-L12-H384-uncased"
#     do_lower_case = True
#     use_fast = True
#     similarity_function = "dot_product"
#
#     device, n_gpu = initialize_device_settings(use_cuda=False)
#
#     query_tokenizer = get_tokenizer(pretrained_model_name_or_path=question_lang_model,
#                                     do_lower_case=do_lower_case, use_fast=use_fast)
#     passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=passage_lang_model,
#                                       do_lower_case=do_lower_case, use_fast=use_fast)
#     label_list = ["hard_negative", "positive"]
#
#     processor = TextSimilarityProcessor(query_tokenizer=query_tokenizer,
#                                         passage_tokenizer=passage_tokenizer,
#                                         max_seq_len_query=10,
#                                         max_seq_len_passage=10,
#                                         label_list=label_list,
#                                         metric="text_similarity_metric",
#                                         data_dir="samples/dpr/",
#                                         train_filename="sample.json",
#                                         dev_filename="sample.json",
#                                         test_filename=None,
#                                         embed_title=True,
#                                         num_hard_negatives=1,
#                                         dev_split=0,
#                                         max_samples=2)
#
#     data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
#
#     question_language_model = get_language_model(pretrained_model_name_or_path=question_lang_model,
#                                                  language_model_class="DPRQuestionEncoder")
#     passage_language_model = get_language_model(pretrained_model_name_or_path=passage_lang_model,
#                                                 language_model_class="DPRContextEncoder")
#
#     prediction_head = TextSimilarityHead(similarity_function=similarity_function)
#
#     model = BiAdaptiveModel(
#         language_model1=question_language_model,
#         language_model2=passage_language_model,
#         prediction_heads=[prediction_head],
#         embeds_dropout_prob=0.1,
#         lm1_output_types=["per_sequence"],
#         lm2_output_types=["per_sequence"],
#         device=device,
#     )
#
#     model, optimizer, lr_schedule = initialize_optimizer(
#         model=model,
#         learning_rate=1e-5,
#         optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
#                         "eps": 1e-08},
#         schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
#         n_batches=len(data_silo.loaders["train"]),
#         n_epochs=n_epochs,
#         grad_acc_steps=1,
#         device=device,
#         distributed=distributed
#     )
#
#     trainer = Trainer(
#         model=model,
#         optimizer=optimizer,
#         data_silo=data_silo,
#         epochs=n_epochs,
#         n_gpu=n_gpu,
#         lr_schedule=lr_schedule,
#         evaluate_every=evaluate_every,
#         device=device,
#     )
#
#     trainer.train()
#
#     ######## save and load model again
#     save_dir = Path("testsave/dpr-model")
#     model.save(save_dir)
#     del model
#
#     model2 = BiAdaptiveModel.load(save_dir, device=device)
#     model2, optimizer2, lr_schedule = initialize_optimizer(
#         model=model2,
#         learning_rate=1e-5,
#         optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
#                         "eps": 1e-08},
#         schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
#         n_batches=len(data_silo.loaders["train"]),
#         n_epochs=n_epochs,
#         grad_acc_steps=1,
#         device=device,
#         distributed=distributed
#     )
#     trainer2 = Trainer(
#         model=model2,
#         optimizer=optimizer,
#         data_silo=data_silo,
#         epochs=n_epochs,
#         n_gpu=n_gpu,
#         lr_schedule=lr_schedule,
#         evaluate_every=evaluate_every,
#         device=device,
#     )
#
#     trainer2.train()
@@ -1,14 +1,9 @@
from typing import Tuple

import re

import pytest
import numpy as np
from unittest.mock import MagicMock

from tokenizers.pre_tokenizers import WhitespaceSplit

import haystack
from haystack.modeling.model.feature_extraction import FeatureExtractor


@@ -19,7 +14,6 @@ XLNET = "xlnet-base-cased"
TOKENIZERS_TO_TEST = [BERT, ROBERTA, XLNET]
TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER = [(BERT, "##"), (ROBERTA, "Ġ"), (XLNET, "▁")]


REGULAR_SENTENCE = "This is a sentence"
GERMAN_SENTENCE = "Der entscheidende Pass"
OTHER_ALPHABETS = "力加勝北区ᴵᴺᵀᵃছজটডণত"
@@ -38,23 +32,6 @@ SENTENCE_WITH_TABS = "This is a sentence with multiple tabs"
SENTENCE_WITH_CUSTOM_TOKEN = "Let's see all on this text and. !23# neverseenwordspossible"


class AutoTokenizer:
    mocker: MagicMock = MagicMock()

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        cls.mocker.from_pretrained(*args, **kwargs)
        return cls()


@pytest.fixture(autouse=True)
def mock_autotokenizer(request, monkeypatch):
    # Do not patch integration tests
    if "integration" in request.keywords:
        return
    monkeypatch.setattr(haystack.modeling.model.tokenization, "AutoTokenizer", AutoTokenizer)


def convert_offset_from_word_reference_to_text_reference(offsets, words, word_spans):
    """
    Token offsets are originally relative to the beginning of the word
@@ -69,80 +46,23 @@ def convert_offset_from_word_reference_to_text_reference(offsets, words, word_sp
    return token_offsets


#
# Unit tests
#


def test_get_tokenizer_str():
    tokenizer = FeatureExtractor(pretrained_model_name_or_path="test-model-name")
    tokenizer.mocker.from_pretrained.assert_called_with(
        pretrained_model_name_or_path="test-model-name", revision=None, use_fast=True, use_auth_token=None
    )


def test_get_tokenizer_path(tmp_path):
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=tmp_path / "test-path")
    tokenizer.mocker.from_pretrained.assert_called_with(
        pretrained_model_name_or_path=str(tmp_path / "test-path"), revision=None, use_fast=True, use_auth_token=None
    )


def test_get_tokenizer_keep_accents():
    tokenizer = FeatureExtractor(pretrained_model_name_or_path="test-model-name-albert")
    tokenizer.mocker.from_pretrained.assert_called_with(
        pretrained_model_name_or_path="test-model-name-albert",
        revision=None,
        use_fast=True,
        use_auth_token=None,
        keep_accents=True,
    )


def test_get_tokenizer_mlm_warning(caplog):
    tokenizer = FeatureExtractor(pretrained_model_name_or_path="test-model-name-mlm")
    tokenizer.mocker.from_pretrained.assert_called_with(
        pretrained_model_name_or_path="test-model-name-mlm", revision=None, use_fast=True, use_auth_token=None
    )
    assert "MLM part of codebert is currently not supported in Haystack".lower() in caplog.text.lower()


#
# Integration tests
#


@pytest.mark.integration
@pytest.mark.parametrize("model_name", TOKENIZERS_TO_TEST)
def test_save_load(tmp_path, model_name: str):
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=model_name, do_lower_case=False)
    text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    tokenizer.add_tokens(new_tokens=["neverseentokens"])
    original_encoding = tokenizer(text)
    tokenizer.feature_extractor.add_tokens(new_tokens=["neverseentokens"])
    original_encoding = tokenizer.feature_extractor(text)

    save_dir = tmp_path / "saved_tokenizer"
    tokenizer.save_pretrained(save_dir)
    tokenizer.feature_extractor.save_pretrained(save_dir)

    tokenizer_loaded = FeatureExtractor(pretrained_model_name_or_path=save_dir)
    new_encoding = tokenizer_loaded(text)
    new_encoding = tokenizer_loaded.feature_extractor(text)

    assert original_encoding == new_encoding


@pytest.mark.integration
def test_tokenize_custom_vocab_bert():
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=BERT, do_lower_case=False)
    tokenizer.add_tokens(new_tokens=["neverseentokens"])
    text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    tokenized = tokenizer.tokenize(text)
    assert (
        tokenized == f"Some Text with neverseentokens plus ! 215 ? # . and a combined - token _ with / ch ##ars".split()
    )


@pytest.mark.integration
@pytest.mark.parametrize(
    "edge_case",
    [
@@ -168,13 +88,14 @@ def test_tokenization_on_edge_cases_full_sequence_tokenization(model_name: str,
    words_and_spans = pre_tokenizer.pre_tokenize_str(edge_case)
    words = [x[0] for x in words_and_spans]

    encoded = tokenizer(words, is_split_into_words=True, add_special_tokens=False).encodings[0]
    expected_tokenization = tokenizer.tokenize(" ".join(edge_case.split()))  # remove multiple whitespaces
    encoded = tokenizer.feature_extractor(words, is_split_into_words=True, add_special_tokens=False).encodings[0]
    expected_tokenization = tokenizer.feature_extractor.tokenize(
        " ".join(edge_case.split())
    )  # remove multiple whitespaces

    assert encoded.tokens == expected_tokenization


@pytest.mark.integration
@pytest.mark.parametrize("edge_case", [SENTENCE_WITH_CUSTOM_TOKEN, GERMAN_SENTENCE])
@pytest.mark.parametrize("model_name", [t for t in TOKENIZERS_TO_TEST if t != ROBERTA])
def test_tokenization_on_edge_cases_full_sequence_tokenization_roberta_exceptions(model_name: str, edge_case: str):
@@ -188,13 +109,14 @@ def test_tokenization_on_edge_cases_full_sequence_tokenization_roberta_exception
    words_and_spans = pre_tokenizer.pre_tokenize_str(edge_case)
    words = [x[0] for x in words_and_spans]

    encoded = tokenizer(words, is_split_into_words=True, add_special_tokens=False).encodings[0]
    expected_tokenization = tokenizer.tokenize(" ".join(edge_case.split()))  # remove multiple whitespaces
    encoded = tokenizer.feature_extractor(words, is_split_into_words=True, add_special_tokens=False).encodings[0]
    expected_tokenization = tokenizer.feature_extractor.tokenize(
        " ".join(edge_case.split())
    )  # remove multiple whitespaces

    assert encoded.tokens == expected_tokenization


@pytest.mark.integration
@pytest.mark.parametrize(
    "edge_case",
    [
@@ -218,7 +140,7 @@ def test_tokenization_on_edge_cases_full_sequence_verify_spans(model_name: str,
    words = [x[0] for x in words_and_spans]
    word_spans = [x[1] for x in words_and_spans]

    encoded = tokenizer(words, is_split_into_words=True, add_special_tokens=False).encodings[0]
    encoded = tokenizer.feature_extractor(words, is_split_into_words=True, add_special_tokens=False).encodings[0]

    # subword-tokens have special chars depending on model type. To align with original text we get rid of them
    tokens = [token.replace(marker, "") for token in encoded.tokens]
@@ -228,7 +150,6 @@ def test_tokenization_on_edge_cases_full_sequence_verify_spans(model_name: str,
        assert token == edge_case[start:end]


@pytest.mark.integration
@pytest.mark.parametrize(
    "edge_case",
    [
@@ -259,7 +180,6 @@ def test_detokenization_for_bert(edge_case):
    assert encoded.tokens == detokenized_tokens


@pytest.mark.integration
def test_encode_plus_for_bert():
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=BERT, do_lower_case=False)
    text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"
@@ -307,16 +227,17 @@ def test_encode_plus_for_bert():
    ]


@pytest.mark.integration
def test_tokenize_custom_vocab_bert():
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=BERT, do_lower_case=False)

    tokenizer.add_tokens(new_tokens=["neverseentokens"])
    tokenizer.feature_extractor.add_tokens(new_tokens=["neverseentokens"])
    text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    tokenized = tokenizer.tokenize(text)

    encoded = tokenizer(text, add_special_tokens=False).encodings[0]
    tokenized = tokenizer.feature_extractor.tokenize(text)
    assert (
        tokenized == "Some Text with neverseentokens plus ! 215 ? # . and a combined - token _ with / ch ##ars".split()
    )
    encoded = tokenizer.feature_extractor(text, add_special_tokens=False).encodings[0]
    offsets = [x[0] for x in encoded.offsets]
    start_of_word_single = [True] + list(np.ediff1d(encoded.words) > 0)

@@ -1,8 +1,8 @@
import pytest
from unittest.mock import MagicMock, patch
from unittest.mock import MagicMock

import haystack
from haystack.modeling.model.feature_extraction import FeatureExtractor, FEATURE_EXTRACTORS
from haystack.modeling.model.feature_extraction import FeatureExtractor


class MockedAutoTokenizer:
@@ -24,11 +24,8 @@ class MockedAutoConfig:
        return cls()


@pytest.fixture(autouse=True)
def mock_autotokenizer(request, monkeypatch):
    # Do not patch integration tests
    if "integration" in request.keywords:
        return
@pytest.fixture()
def mock_autotokenizer(monkeypatch):
    monkeypatch.setattr(
        haystack.modeling.model.feature_extraction, "FEATURE_EXTRACTORS", {"mocked": MockedAutoTokenizer}
    )
@@ -36,30 +33,42 @@ def mock_autotokenizer(request, monkeypatch):
    monkeypatch.setattr(haystack.modeling.model.feature_extraction, "AutoTokenizer", MockedAutoTokenizer)


#
# Unit tests
#


def test_init_str():
@pytest.mark.unit
def test_get_tokenizer_str(mock_autotokenizer):
    tokenizer = FeatureExtractor(pretrained_model_name_or_path="test-model-name")

    tokenizer.feature_extractor.mocker.from_pretrained.assert_called_with(
    tokenizer.mocker.from_pretrained.assert_called_with(
        pretrained_model_name_or_path="test-model-name", revision=None, use_fast=True, use_auth_token=None
    )


def test_init_path(tmp_path):
@pytest.mark.unit
def test_get_tokenizer_path(mock_autotokenizer, tmp_path):
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=tmp_path / "test-path")

    tokenizer.feature_extractor.mocker.from_pretrained.assert_called_with(
    tokenizer.mocker.from_pretrained.assert_called_with(
        pretrained_model_name_or_path=str(tmp_path / "test-path"), revision=None, use_fast=True, use_auth_token=None
    )


#
# Integration tests
#
@pytest.mark.unit
def test_get_tokenizer_keep_accents(mock_autotokenizer):
    tokenizer = FeatureExtractor(pretrained_model_name_or_path="test-model-name-albert")
    tokenizer.mocker.from_pretrained.assert_called_with(
        pretrained_model_name_or_path="test-model-name-albert",
        revision=None,
        use_fast=True,
        use_auth_token=None,
        keep_accents=True,
    )


@pytest.mark.unit
def test_get_tokenizer_mlm_warning(mock_autotokenizer, caplog):
    tokenizer = FeatureExtractor(pretrained_model_name_or_path="test-model-name-mlm")
    tokenizer.mocker.from_pretrained.assert_called_with(
        pretrained_model_name_or_path="test-model-name-mlm", revision=None, use_fast=True, use_auth_token=None
    )
    assert "MLM part of codebert is currently not supported in Haystack".lower() in caplog.text.lower()


FEATURE_EXTRACTORS_TO_TEST = ["bert-base-cased"]

@@ -1,34 +0,0 @@
import pytest

from haystack.modeling.model.language_model import get_language_model


@pytest.mark.parametrize(
    "pretrained_model_name_or_path, lm_class",
    [
        ("google/bert_uncased_L-2_H-128_A-2", "HFLanguageModel"),
        ("google/electra-small-generator", "HFLanguageModelWithPooler"),
        ("distilbert-base-uncased", "HFLanguageModelNoSegmentIds"),
        ("deepset/bert-small-mm_retrieval-passage_encoder", "DPREncoder"),
    ],
)
def test_basic_loading(pretrained_model_name_or_path, lm_class):
    lm = get_language_model(pretrained_model_name_or_path)
    mod = __import__("haystack.modeling.model.language_model", fromlist=[lm_class])
    klass = getattr(mod, lm_class)
    assert isinstance(lm, klass)


def test_basic_loading_unknown_model():
    with pytest.raises(OSError):
        get_language_model("model_that_doesnt_exist")


def test_basic_loading_with_empty_string():
    with pytest.raises(ValueError):
        get_language_model("")


def test_basic_loading_invalid_params():
    with pytest.raises(ValueError):
        get_language_model(None)
test/modeling/test_model_loading.py (new file, 43 lines)
@@ -0,0 +1,43 @@
import pytest

from haystack.modeling.model.language_model import (
    get_language_model,
    HFLanguageModel,
    HFLanguageModelNoSegmentIds,
    HFLanguageModelWithPooler,
    DPREncoder,
)


@pytest.mark.unit
@pytest.mark.parametrize(
    "pretrained_model_name_or_path, lm_class",
    [
        ("google/bert_uncased_L-2_H-128_A-2", HFLanguageModel),
        ("google/electra-small-generator", HFLanguageModelWithPooler),
        ("distilbert-base-uncased", HFLanguageModelNoSegmentIds),
        ("deepset/bert-small-mm_retrieval-passage_encoder", DPREncoder),
    ],
)
def test_basic_loading(pretrained_model_name_or_path, lm_class, monkeypatch):
    monkeypatch.setattr(lm_class, "__init__", lambda self, *a, **k: None)
    lm = get_language_model(pretrained_model_name_or_path)
    assert isinstance(lm, lm_class)


@pytest.mark.unit
def test_basic_loading_unknown_model():
    with pytest.raises(OSError):
        get_language_model("model_that_doesnt_exist")


@pytest.mark.unit
def test_basic_loading_with_empty_string():
    with pytest.raises(ValueError):
        get_language_model("")


@pytest.mark.unit
def test_basic_loading_invalid_params():
    with pytest.raises(ValueError):
        get_language_model(None)
@@ -15,10 +15,20 @@ version and scores of 95% and 96 for the GameCube version. GameTrailers in their
it one of the greatest games ever created."""


@pytest.fixture
def bert_base_squad2(request):
    model = QAInferencer.load(
        "deepset/minilm-uncased-squad2",
        task_type="question_answering",
        batch_size=4,
        num_processes=0,
        multithreading_rust=False,
    )
    return model


@pytest.fixture()
def span_inference_result(bert_base_squad2, caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)
def span_inference_result(bert_base_squad2):
    obj_input = [
        QAInput(
            doc_text=DOC_TEXT, questions=Question("Who counted the game among the best ever made?", uid="best_id_ever")
@@ -35,11 +45,11 @@ def no_answer_inference_result(bert_base_squad2, caplog=None):
    obj_input = [
        QAInput(
            doc_text="""\
The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by
Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana,
Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names.
The Amazon represents over half of the planet\'s remaining rainforests, and comprises the largest
and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual
The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by
Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana,
Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names.
The Amazon represents over half of the planet\'s remaining rainforests, and comprises the largest
and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual
trees divided into 16,000 species.""",
            questions=Question(
                "The Amazon represents less than half of the planets remaining what?", uid="best_id_ever"
@@ -199,8 +209,8 @@ def test_duplicate_answer_filtering(bert_base_squad2):
    qa_input = [
        {
            "questions": ["“In what country lies the Normandy?”"],
            "text": """The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\")
raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia.
            "text": """The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\")
raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia.
The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries. Weird things happen in Normandy, France.""",
        }
    ]
@@ -224,8 +234,8 @@ def test_no_duplicate_answer_filtering(bert_base_squad2):
    qa_input = [
        {
            "questions": ["“In what country lies the Normandy?”"],
            "text": """The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\")
raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia.
            "text": """The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\")
raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia.
The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries. Weird things happen in Normandy, France.""",
        }
    ]
@@ -250,8 +260,8 @@ def test_range_duplicate_answer_filtering(bert_base_squad2):
    qa_input = [
        {
            "questions": ["“In what country lies the Normandy?”"],
            "text": """The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\")
raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia.
            "text": """The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\")
raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia.
The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries. Weird things happen in Normandy, France.""",
        }
    ]
@@ -296,12 +306,3 @@ def test_qa_confidence():
    result = inferencer.inference_from_dicts(dicts=QA_input, return_json=False)[0]
    assert np.isclose(result.prediction[0].confidence, 0.990427553653717)
    assert result.prediction[0].answer == "GameTrailers"


if __name__ == "__main__":
    test_inference_different_inputs()
    test_inference_objs()
    test_duplicate_answer_filtering()
    test_no_duplicate_answer_filtering()
    test_range_duplicate_answer_filtering()
    test_qa_confidence()