# haystack/test/modeling/test_dpr.py

from typing import Tuple

import os
import logging
from pathlib import Path

import numpy as np
import pytest
import torch
from torch.utils.data import SequentialSampler
from tqdm import tqdm
from transformers import DPRQuestionEncoder

from haystack.modeling.data_handler.dataloader import NamedDataLoader
from haystack.modeling.data_handler.processor import TextSimilarityProcessor
from haystack.modeling.model.biadaptive_model import BiAdaptiveModel
from haystack.modeling.model.language_model import get_language_model, DPREncoder
from haystack.modeling.model.prediction_head import TextSimilarityHead
from haystack.modeling.model.tokenization import get_tokenizer
from haystack.modeling.utils import set_all_seeds, initialize_device_settings


def test_dpr_modules(caplog=None):
    """Assemble a DPR-style BiAdaptiveModel and check one sample at every stage.

    The test builds both tokenizers, a TextSimilarityProcessor, two DPREncoder
    language models and a dot-product TextSimilarityHead, then compares the
    tokenized features, the query/passage encodings, the similarity scores and
    the loss for a single query against pinned reference values.

    All floating point comparisons are two-sided: a value that drifts above
    *or* below its reference by more than ``tolerance`` fails the test.

    :param caplog: Optional pytest ``caplog`` fixture. When provided, the log level is raised to CRITICAL.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)

    # The reference values below are rounded, so numeric checks allow this absolute deviation.
    tolerance = 0.0001

    set_all_seeds(seed=42)
    devices, _ = initialize_device_settings(use_cuda=True)

    # 1. Create question and passage tokenizers
    query_tokenizer = get_tokenizer(
        pretrained_model_name_or_path="facebook/dpr-question_encoder-single-nq-base", do_lower_case=True, use_fast=True
    )
    passage_tokenizer = get_tokenizer(
        pretrained_model_name_or_path="facebook/dpr-ctx_encoder-single-nq-base", do_lower_case=True, use_fast=True
    )

    processor = TextSimilarityProcessor(
        query_tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=256,
        max_seq_len_passage=256,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        data_dir="data/retriever",
        train_filename="nq-train.json",
        dev_filename="nq-dev.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1,
    )

    # Both encoders are created with dropout set to 0.
    question_language_model = DPREncoder(
        pretrained_model_name_or_path="bert-base-uncased",
        model_type="DPRQuestionEncoder",
        model_kwargs={"hidden_dropout_prob": 0, "attention_probs_dropout_prob": 0},
    )
    passage_language_model = DPREncoder(
        pretrained_model_name_or_path="bert-base-uncased",
        model_type="DPRContextEncoder",
        model_kwargs={"hidden_dropout_prob": 0, "attention_probs_dropout_prob": 0},
    )

    prediction_head = TextSimilarityHead(similarity_function="dot_product")

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.0,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=devices[0],
    )

    model.connect_heads_with_processor(processor.tasks)

    # Exact type checks (subclasses are not accepted).
    assert type(model) is BiAdaptiveModel
    assert type(processor) is TextSimilarityProcessor
    assert type(question_language_model) is DPREncoder
    assert type(passage_language_model) is DPREncoder

    # check embedding layer weights
    first_weight = next(iter(model.named_parameters()))[1][0, 0].item()
    assert abs(first_weight - -0.010200000368058681) < tolerance

    d = {
        "query": "big little lies season 2 how many episodes",
        "passages": [
            {
                "title": "Big Little Lies (TV series)",
                "text": "series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley",
                "label": "positive",
                "external_id": "18768923",
            },
            {
                "title": "Little People, Big World",
                "text": 'final minutes of the season two-A finale, "Farm Overload". A crowd had gathered around Jacob, who was lying on the ground near the trebuchet. The first two episodes of season two-B focus on the accident, and how the local media reacted to it. The first season of "Little People, Big World" generated solid ratings for TLC (especially in the important 18–49 demographic), leading to the show\'s renewal for a second season. Critical reviews of the series have been generally positive, citing the show\'s positive portrayal of little people. Conversely, other reviews have claimed that the show has a voyeuristic bend',
                "label": "hard_negative",
                "external_id": "7459116",
            },
            {
                "title": "Cormac McCarthy",
                "text": 'chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from "The New York Times", "McCarthy doesn\'t drink anymore – he quit 16 years ago in El Paso, with one of his young',
                "label": "negative",
                "passage_id": "2145653",
            },
        ],
    }

    dataset, tensor_names, _ = processor.dataset_from_dicts(dicts=[d], return_baskets=False)
    # Add a leading batch dimension of size 1 and move every tensor to the model's device.
    features = {key: val.unsqueeze(0).to(devices[0]) for key, val in zip(tensor_names, dataset[0])}

    # test features
    assert torch.all(
        torch.eq(
            features["query_input_ids"][0][:10].cpu(),
            torch.tensor([101, 2502, 2210, 3658, 2161, 1016, 2129, 2116, 4178, 102]),
        )
    )
    assert torch.all(
        torch.eq(
            features["passage_input_ids"][0][0][:10].cpu(),
            torch.tensor([101, 2502, 2210, 3658, 1006, 2694, 2186, 1007, 102, 2186]),
        )
    )
    assert len(features["query_segment_ids"][0].nonzero()) == 0
    assert len(features["passage_segment_ids"][0].nonzero()) == 0
    assert torch.all(torch.eq(features["query_attention_mask"].nonzero()[:, 1].cpu(), torch.tensor(list(range(10)))))
    assert torch.all(
        torch.eq(features["passage_attention_mask"][0][0].nonzero().cpu().squeeze(), torch.tensor(list(range(127))))
    )
    assert torch.all(
        torch.eq(features["passage_attention_mask"][0][1].nonzero().cpu().squeeze(), torch.tensor(list(range(143))))
    )

    features_query = {key.replace("query_", ""): value for key, value in features.items() if key.startswith("query_")}
    features_passage = {
        key.replace("passage_", ""): value for key, value in features.items() if key.startswith("passage_")
    }
    # Flatten the passage tensors to 2D (rows of length max_seq_len) before calling the passage encoder directly.
    max_seq_len = features_passage.get("input_ids").shape[-1]
    features_passage = {key: value.view(-1, max_seq_len) for key, value in features_passage.items()}

    # test model encodings
    query_vector = model.language_model1(**features_query)[0]
    passage_vector = model.language_model2(**features_passage)[0]

    # rtol=0.0 makes allclose a pure absolute check: |actual - expected| <= tolerance.
    assert torch.allclose(
        query_vector[0, :10].detach().cpu(),
        torch.tensor([-0.2135, -0.4748, 0.0501, -0.0430, -0.1747, -0.0441, 0.5638, 0.1405, 0.2285, 0.0893]),
        rtol=0.0,
        atol=tolerance,
    )
    assert torch.allclose(
        passage_vector[0, :10].detach().cpu(),
        torch.tensor([0.0557, -0.6836, -0.3645, -0.5566, 0.2034, -0.3656, 0.2969, -0.0555, 0.3405, -0.8691]),
        rtol=0.0,
        atol=tolerance,
    )
    assert torch.allclose(
        passage_vector[1, :10].detach().cpu(),
        torch.tensor([-0.2006, -1.5002, -0.1897, -0.3421, -0.0405, -0.0471, -0.0306, 0.1156, 0.3350, -0.3412]),
        rtol=0.0,
        atol=tolerance,
    )

    # test logits and loss
    embeddings = model(
        query_input_ids=features.get("query_input_ids", None),
        query_segment_ids=features.get("query_segment_ids", None),
        query_attention_mask=features.get("query_attention_mask", None),
        passage_input_ids=features.get("passage_input_ids", None),
        passage_segment_ids=features.get("passage_segment_ids", None),
        passage_attention_mask=features.get("passage_attention_mask", None),
    )
    query_emb, passage_emb = embeddings[0]
    # The full forward pass must reproduce exactly what the two encoders returned when called directly.
    assert torch.all(torch.eq(query_emb.cpu(), query_vector.cpu()))
    assert torch.all(torch.eq(passage_emb.cpu(), passage_vector.cpu()))

    loss = model.logits_to_loss_per_head(embeddings, **features)
    similarity_scores = model.prediction_heads[0]._embeddings_to_scores(query_emb, passage_emb).detach().cpu()
    assert torch.allclose(
        similarity_scores,
        torch.tensor([[-1.8311e-03, -6.3016e00]]),
        rtol=0.0,
        atol=tolerance,
    )
    assert abs(loss[0].item() - 0.0018) <= tolerance


# Expected values consumed by ``test_dpr_processor``; every list is indexed by sample position.

# First ten query token ids of each sample.
query_input_ids = [
    torch.tensor([101, 2073, 2003, 3317, 2006, 1996, 2940, 2241, 2006, 102]),
    torch.tensor([101, 2043, 2106, 1996, 2548, 2155, 11092, 1996, 2171, 10064]),
    torch.tensor([101, 2054, 2003, 1037, 4937, 102, 0, 0, 0, 0]),
]

# Expected ``nonzero()`` result of each query attention mask: a column vector 0..length-1.
query_attention_mask = [torch.arange(length).unsqueeze(-1) for length in (10, 11, 6)]

# First ten token ids of every passage, one (3, 10) tensor per sample,
# for the ``embed_title=True`` ("titled") and ``embed_title=False`` ("untitled") runs.
passage_ids = {
    "titled": [
        torch.tensor(
            [
                [101, 3317, 2006, 1996, 2940, 102, 3317, 2006, 1996, 2940],
                [101, 3317, 2940, 1010, 2047, 2148, 3575, 102, 8765, 2061],
                [101, 3317, 2940, 1010, 27492, 102, 3419, 18874, 3385, 1010],
            ]
        ),
        torch.tensor(
            [
                [101, 2160, 1997, 10064, 102, 2160, 1997, 10064, 1996, 2160],
                [101, 26902, 1010, 11017, 1997, 10387, 102, 2384, 1010, 1998],
                [101, 102, 102, 0, 0, 0, 0, 0, 0, 0],
            ]
        ),
        torch.tensor(
            [
                [101, 2516, 2007, 1000, 2569, 3494, 1000, 102, 2023, 2003],
                [101, 102, 102, 0, 0, 0, 0, 0, 0, 0],
                [101, 102, 102, 0, 0, 0, 0, 0, 0, 0],
            ]
        ),
    ],
    "untitled": [
        torch.tensor(
            [
                [101, 3317, 2006, 1996, 2940, 1000, 3317, 2006, 1996, 2940],
                [101, 8765, 2061, 2004, 2000, 5438, 1037, 8084, 10527, 5701],
                [101, 3419, 18874, 3385, 1010, 3818, 1000, 1000, 2152, 2006],
            ]
        ),
        torch.tensor(
            [
                [101, 2160, 1997, 10064, 1996, 2160, 1997, 10064, 2003, 1996],
                [101, 2384, 1010, 1998, 2001, 2000, 2202, 2173, 1999, 1037],
                [101, 102, 102, 0, 0, 0, 0, 0, 0, 0],
            ]
        ),
        torch.tensor(
            [
                [101, 2023, 2003, 1037, 1026, 7308, 1028, 6251, 1012, 8870],
                [101, 102, 102, 0, 0, 0, 0, 0, 0, 0],
                [101, 102, 102, 0, 0, 0, 0, 0, 0, 0],
            ]
        ),
    ],
}

# Expected ``nonzero()`` result of each passage attention mask, built from the
# number of attended positions per passage (three passages per sample).
passage_attention = {
    variant: [[torch.arange(length).unsqueeze(-1) for length in sample_lengths] for sample_lengths in lengths]
    for variant, lengths in (
        ("titled", [[140, 130, 127], [132, 121, 3], [22, 3, 3]]),
        ("untitled", [[135, 123, 122], [128, 115, 3], [15, 3, 3]]),
    )
}

# Expected label ids per sample for num_hard_negatives=1 and num_hard_negatives=2 respectively.
labels1 = [[1, 0] for _ in range(3)]
labels2 = [[1, 0, 0] for _ in range(3)]


@pytest.mark.parametrize(
    "embed_title, passage_ids, passage_attns",
    [
        (True, passage_ids["titled"], passage_attention["titled"]),
        (False, passage_ids["untitled"], passage_attention["untitled"]),
    ],
)
@pytest.mark.parametrize("use_fast", [True, False])
@pytest.mark.parametrize("num_hard_negatives, labels", [(1, labels1), (2, labels2)])
def test_dpr_processor(embed_title, passage_ids, passage_attns, use_fast, num_hard_negatives, labels):
    """Check the features TextSimilarityProcessor produces for three sample dicts.

    Each sample is processed on its own and the first feature set of its basket is
    compared with the module-level expectations, indexed by sample position:
    query token ids, query/passage attention masks, segment ids, label ids and the
    token ids of the positive passage.

    :param embed_title: Passed to the processor; selects the "titled" or "untitled" expectations.
    :param passage_ids: Expected first ten passage token ids, one tensor per sample.
    :param passage_attns: Expected non-zero attention mask positions, per sample and passage.
    :param use_fast: Passed to ``get_tokenizer`` for both tokenizers.
    :param num_hard_negatives: Passed to the processor; ``num_hard_negatives + 1`` passages are checked.
    :param labels: Expected label ids per sample.
    """
    # Named ``samples`` (not ``dict``) so the builtin is not shadowed inside the test.
    samples = [
        {
            "query": "where is castle on the hill based on",
            "answers": ["Framlingham Castle"],
            "passages": [
                {
                    "text": 'Castle on the Hill "Castle on the Hill" is a song by English singer-songwriter Ed Sheeran. It was released as a digital download on 6 January 2017 as one of the double lead singles from his third studio album "÷" (2017), along with "Shape of You". "Castle on the Hill" was written and produced by Ed Sheeran and Benny Blanco. The song refers to Framlingham Castle in Sheeran\'s home town. Released on the same day as "Shape of You", "Castle on the Hill" reached number two in a number of countries, including the UK, Australia and Germany, while "Shape of',
                    "title": "Castle on the Hill",
                    "label": "positive",
                    "external_id": "19930582",
                },
                {
                    "text": 'crops so as to feed a struggling infant colony. Governor King began Government Farm 3 there on 8 July 1801, referring to it as "Castle Hill" on 1 March 1802. The majority of the convicts who worked the prison farm were Irish Catholics, many having been transported for seditious activity in 1798. The most notorious incident being the Battle of Vinegar Hill where around 39 were slaughtered. They were branded "politicals" and exiled for life, never to return. The first free settler in Castle Hill, a Frenchman Baron Verincourt de Clambe, in unusual circumstances received a grant of 200 acres',
                    "title": "Castle Hill, New South Wales",
                    "label": "hard_negative",
                    "external_id": "1977568",
                },
                {
                    "text": 'Tom Gleeson, proposed ""high on the peak of Castle Hill, overlooking the harbour"" would be a suitable location for the monument. Having arrived in Townsville, the monument was then placed in storage for a number of years. It was not until October 1947 that the Council discussed where to place the monument. A number of locations were considered: Castle Hill, the Botanic Gardens, in front of the Queens Hotel, the Anzac Memorial Park and the Railway Oval, but Castle Hill was ultimately the council\'s choice. In February 1948, the Queensland Government gave its approval to the council to place the',
                    "title": "Castle Hill, Townsville",
                    "label": "hard_negative",
                    "external_id": "3643705",
                },
            ],
        },
        {
            "query": "when did the royal family adopt the name windsor",
            "answers": ["in 1917"],
            "passages": [
                {
                    "text": 'House of Windsor The House of Windsor is the reigning royal house of the United Kingdom and the other Commonwealth realms. The dynasty is of German paternal descent and was originally a branch of the House of Saxe-Coburg and Gotha, itself derived from the House of Wettin, which succeeded the House of Hanover to the British monarchy following the death of Queen Victoria, wife of Albert, Prince Consort. The name was changed from "Saxe-Coburg and Gotha" to the English "Windsor" (from "Windsor Castle") in 1917 because of anti-German sentiment in the British Empire during World War I. There have been',
                    "title": "House of Windsor",
                    "label": "positive",
                    "external_id": "1478954",
                },
                {
                    "text": "2005, and was to take place in a civil ceremony at Windsor Castle, with a subsequent religious service of blessing at St George's Chapel. However, to conduct a civil marriage at Windsor Castle would oblige the venue to obtain a licence for civil marriages, which it did not have. A condition of such a licence is that the licensed venue must be available for a period of one year to anyone wishing to be married there, and as the royal family did not wish to make Windsor Castle available to the public for civil marriages, even just for one year,",
                    "title": "Camilla, Duchess of Cornwall",
                    "label": "hard_negative",
                    "external_id": "1399730",
                },
            ],
        },
        {
            "query": "what is a cat?",
            "answers": ["animal", "feline"],
            "passages": [
                {
                    "text": "This is a <mask> sentence. Cats are good pets.",
                    "title": 'title with "special characters" ',
                    "label": "positive",
                    "external_id": "0",
                },
                {
                    "text": "2nd text => More text about cats is good",
                    "title": "2nd title \n",
                    "label": "positive",
                    "external_id": "1",
                },
            ],
        },
    ]

    query_tok = "facebook/dpr-question_encoder-single-nq-base"
    query_tokenizer = get_tokenizer(query_tok, use_fast=use_fast)
    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
    passage_tokenizer = get_tokenizer(passage_tok, use_fast=use_fast)
    processor = TextSimilarityProcessor(
        query_tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=256,
        max_seq_len_passage=256,
        data_dir="data/retriever",
        train_filename="nq-train.json",
        test_filename="nq-dev.json",
        embed_title=embed_title,
        num_hard_negatives=num_hard_negatives,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        shuffle_negatives=False,
    )

    for i, sample in enumerate(samples):
        # Only the baskets are inspected; the other three return values are not needed here.
        _, _, _, baskets = processor.dataset_from_dicts(dicts=[sample], return_baskets=True)
        features = baskets[0].samples[0].features[0]

        # Query side: token ids, all-zero segment ids, positions of attended tokens.
        assert torch.all(torch.eq(torch.tensor(features["query_input_ids"][:10]), query_input_ids[i]))
        assert len(torch.tensor(features["query_segment_ids"]).nonzero()) == 0
        assert torch.all(
            torch.eq(torch.tensor(features["query_attention_mask"]).nonzero(), query_attention_mask[i])
        )

        # ``.item()`` only succeeds when exactly one label id equals 1.
        positive_index = np.where(np.array(features["label_ids"]) == 1)[0].item()
        assert torch.all(
            torch.eq(
                torch.tensor(features["passage_input_ids"])[positive_index, :10], passage_ids[i][positive_index]
            )
        )
        for j in range(num_hard_negatives + 1):
            assert torch.all(
                torch.eq(torch.tensor(features["passage_attention_mask"][j]).nonzero(), passage_attns[i][j])
            )
        assert torch.all(
            torch.eq(torch.tensor(features["label_ids"]), torch.tensor(labels[i])[: num_hard_negatives + 1])
        )
        assert len(torch.tensor(features["passage_segment_ids"]).nonzero()) == 0


@pytest.mark.parametrize("use_fast", [False])
@pytest.mark.parametrize("embed_title", [True, False])
def test_dpr_processor_empty_title(use_fast, embed_title):
    """Process a sample whose only passage has an empty title.

    The result of ``dataset_from_dicts`` is discarded and nothing is asserted, so the
    test passes as long as the call does not raise.

    :param use_fast: Passed to ``get_tokenizer`` for both tokenizers.
    :param embed_title: Passed to the processor.
    """
    # Named ``sample`` (not ``dict``) so the builtin is not shadowed inside the test.
    sample = {
        "query": "what is a cat?",
        "passages": [
            {
                "title": "",
                "text": "Director Radio Iași); Dragoș-Liviu Vîlceanu; Mihnea-Adrian Vîlceanu; Nathalie-Teona",
                "label": "positive",
                "external_id": "b21eaeff-e08b-4548-b5e0-a280f6f4efef",
            }
        ],
    }

    query_tok = "facebook/dpr-question_encoder-single-nq-base"
    query_tokenizer = get_tokenizer(query_tok, use_fast=use_fast)
    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
    passage_tokenizer = get_tokenizer(passage_tok, use_fast=use_fast)
    processor = TextSimilarityProcessor(
        query_tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=256,
        max_seq_len_passage=256,
        data_dir="data/retriever",
        train_filename="nq-train.json",
        test_filename="nq-dev.json",
        embed_title=embed_title,
        num_hard_negatives=1,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        shuffle_negatives=False,
    )
    _ = processor.dataset_from_dicts(dicts=[sample])


def test_dpr_problematic():
    """Expect ``dataset_from_dicts`` to report dicts 0 and 1 as problematic.

    The first dict carries a list (``[1]``) as its query and the passages of the
    second dict use the key ``text2`` where the other samples use ``text``. The
    third dict has the same layout as the samples in the other processor tests.
    """
    processor = TextSimilarityProcessor(
        query_tokenizer=get_tokenizer("facebook/dpr-question_encoder-single-nq-base"),
        passage_tokenizer=get_tokenizer("facebook/dpr-ctx_encoder-single-nq-base"),
        max_seq_len_query=256,
        max_seq_len_passage=256,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        data_dir="data/retriever",
        train_filename="nq-train.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1,
        shuffle_negatives=False,
    )

    # Dict 0: query is a list instead of a string.
    bad_query_sample = {
        "query": [1],
        "answers": ["Framlingham Castle"],
        "passages": [
            {
                "text": 'Castle on the Hill "Castle on the Hill" is a song by English singer-songwriter Ed Sheeran. It was released as a digital download on 6 January 2017 as one of the double lead singles from his third studio album "÷" (2017), along with "Shape of You". "Castle on the Hill" was written and produced by Ed Sheeran and Benny Blanco. The song refers to Framlingham Castle in Sheeran\'s home town. Released on the same day as "Shape of You", "Castle on the Hill" reached number two in a number of countries, including the UK, Australia and Germany, while "Shape of',
                "title": "Castle on the Hill",
                "label": "positive",
                "external_id": "19930582",
            },
            {
                "text": 'crops so as to feed a struggling infant colony. Governor King began Government Farm 3 there on 8 July 1801, referring to it as "Castle Hill" on 1 March 1802. The majority of the convicts who worked the prison farm were Irish Catholics, many having been transported for seditious activity in 1798. The most notorious incident being the Battle of Vinegar Hill where around 39 were slaughtered. They were branded "politicals" and exiled for life, never to return. The first free settler in Castle Hill, a Frenchman Baron Verincourt de Clambe, in unusual circumstances received a grant of 200 acres',
                "title": "Castle Hill, New South Wales",
                "label": "hard_negative",
                "external_id": "1977568",
            },
            {
                "text": 'Tom Gleeson, proposed ""high on the peak of Castle Hill, overlooking the harbour"" would be a suitable location for the monument. Having arrived in Townsville, the monument was then placed in storage for a number of years. It was not until October 1947 that the Council discussed where to place the monument. A number of locations were considered: Castle Hill, the Botanic Gardens, in front of the Queens Hotel, the Anzac Memorial Park and the Railway Oval, but Castle Hill was ultimately the council\'s choice. In February 1948, the Queensland Government gave its approval to the council to place the',
                "title": "Castle Hill, Townsville",
                "label": "hard_negative",
                "external_id": "3643705",
            },
        ],
    }
    # Dict 1: passages use the key "text2" instead of "text".
    bad_passage_key_sample = {
        "query": "when did the royal family adopt the name windsor",
        "answers": ["in 1917"],
        "passages": [
            {
                "text2": 'House of Windsor The House of Windsor is the reigning royal house of the United Kingdom and the other Commonwealth realms. The dynasty is of German paternal descent and was originally a branch of the House of Saxe-Coburg and Gotha, itself derived from the House of Wettin, which succeeded the House of Hanover to the British monarchy following the death of Queen Victoria, wife of Albert, Prince Consort. The name was changed from "Saxe-Coburg and Gotha" to the English "Windsor" (from "Windsor Castle") in 1917 because of anti-German sentiment in the British Empire during World War I. There have been',
                "title": "House of Windsor",
                "label": "positive",
                "external_id": "1478954",
            },
            {
                "text2": "2005, and was to take place in a civil ceremony at Windsor Castle, with a subsequent religious service of blessing at St George's Chapel. However, to conduct a civil marriage at Windsor Castle would oblige the venue to obtain a licence for civil marriages, which it did not have. A condition of such a licence is that the licensed venue must be available for a period of one year to anyone wishing to be married there, and as the royal family did not wish to make Windsor Castle available to the public for civil marriages, even just for one year,",
                "title": "Camilla, Duchess of Cornwall",
                "label": "hard_negative",
                "external_id": "1399730",
            },
        ],
    }
    # Dict 2: not expected among the problematic ids.
    regular_sample = {
        "query": "what is a cat?",
        "answers": ["animal", "feline"],
        "passages": [
            {
                "text": "This is a <mask> sentence. Cats are good pets.",
                "title": 'title with "special characters" ',
                "label": "positive",
                "external_id": "0",
            },
            {
                "text": "2nd text => More text about cats is good",
                "title": "2nd title \n",
                "label": "positive",
                "external_id": "1",
            },
        ],
    }

    result = processor.dataset_from_dicts(
        dicts=[bad_query_sample, bad_passage_key_sample, regular_sample], return_baskets=True
    )
    # With return_baskets=True the call yields four values; the third holds the problematic ids.
    _, _, problematic_ids, _ = result
    assert problematic_ids == {0, 1}


def test_dpr_query_only():
    """Process dicts that contain a query but no passages.

    No dict may be reported as problematic, and only the three query tensors are
    expected in ``tensor_names``.
    """
    processor = TextSimilarityProcessor(
        query_tokenizer=get_tokenizer("facebook/dpr-question_encoder-single-nq-base"),
        passage_tokenizer=get_tokenizer("facebook/dpr-ctx_encoder-single-nq-base"),
        max_seq_len_query=256,
        max_seq_len_passage=256,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        data_dir="data/retriever",
        train_filename="nq-train.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1,
        shuffle_negatives=False,
    )

    query_only_dicts = [
        {"query": "where is castle on the hill based on", "answers": ["Framlingham Castle"]},
        {"query": "where is castle on the hill 2 based on", "answers": ["Framlingham Castle 2"]},
    ]
    _, tensor_names, problematic_ids, _ = processor.dataset_from_dicts(
        dicts=query_only_dicts, return_baskets=True
    )

    expected_tensor_names = ["query_input_ids", "query_segment_ids", "query_attention_mask"]
    assert len(problematic_ids) == 0
    assert tensor_names == expected_tensor_names


def test_dpr_context_only():
    """Process dicts that contain passages but no query.

    No dict may be reported as problematic, and ``tensor_names`` is expected to
    hold the three passage tensors followed by ``label_ids``.
    """
    processor = TextSimilarityProcessor(
        query_tokenizer=get_tokenizer("facebook/dpr-question_encoder-single-nq-base"),
        passage_tokenizer=get_tokenizer("facebook/dpr-ctx_encoder-single-nq-base"),
        max_seq_len_query=256,
        max_seq_len_passage=256,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        data_dir="data/retriever",
        train_filename="nq-train.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1,
        shuffle_negatives=False,
    )

    first_passages = [
        {
            "text": "House of Windsor 2 The House of Windsor is the reigning royal house of the United",
            "title": "House of Windsor",
            "label": "positive",
            "external_id": "1478954",
        },
        {
            "text": "2005, and was to take place in a civil ceremony at Windsor Castle, with a subsequent religious",
            "title": "Camilla, Duchess of Cornwall",
            "label": "hard_negative",
            "external_id": "1399730",
        },
    ]
    second_passages = [
        {
            "text": "House of Windsor The House of Windsor is the reigning royal house of the",
            "title": "House of Windsor",
            "label": "positive",
            "external_id": "1478954",
        },
        {
            "text": "2005, and was to take place in a civil ceremony at Windsor Castle, with a subsequent",
            "title": "Camilla, Duchess of Cornwall",
            "label": "hard_negative",
            "external_id": "1399730",
        },
    ]
    # Each dict has a "passages" entry only; there is no "query" key.
    context_only_dicts = [{"passages": first_passages}, {"passages": second_passages}]

    _, tensor_names, problematic_ids, _ = processor.dataset_from_dicts(
        dicts=context_only_dicts, return_baskets=True
    )

    expected_tensor_names = ["passage_input_ids", "passage_segment_ids", "passage_attention_mask", "label_ids"]
    assert len(problematic_ids) == 0
    assert tensor_names == expected_tensor_names


def test_dpr_processor_save_load(tmp_path):
    """
    Save a TextSimilarityProcessor to disk, restore it with ``load_from_dir`` and check that
    both processors turn the same sample into an identical first dataset tensor.
    """
    # Fixture passages. Note that the third one carries the label "negative" and a
    # "passage_id" key instead of "external_id", unlike the first two.
    positive_passage = {
        "title": "Big Little Lies (TV series)",
        "text": "series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley",
        "label": "positive",
        "external_id": "18768923",
    }
    hard_negative_passage = {
        "title": "Little People, Big World",
        "text": 'final minutes of the season two-A finale, "Farm Overload". A crowd had gathered around Jacob, who was lying on the ground near the trebuchet. The first two episodes of season two-B focus on the accident, and how the local media reacted to it. The first season of "Little People, Big World" generated solid ratings for TLC (especially in the important 18–49 demographic), leading to the show\'s renewal for a second season. Critical reviews of the series have been generally positive, citing the show\'s positive portrayal of little people. Conversely, other reviews have claimed that the show has a voyeuristic bend',
        "label": "hard_negative",
        "external_id": "7459116",
    }
    negative_passage = {
        "title": "Cormac McCarthy",
        "text": 'chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from "The New York Times", "McCarthy doesn\'t drink anymore – he quit 16 years ago in El Paso, with one of his young',
        "label": "negative",
        "passage_id": "2145653",
    }
    sample = {
        "query": "big little lies season 2 how many episodes ?",
        "passages": [positive_passage, hard_negative_passage, negative_passage],
    }

    # Tokenizers for the two DPR towers
    question_tokenizer = get_tokenizer("facebook/dpr-question_encoder-single-nq-base")
    context_tokenizer = get_tokenizer("facebook/dpr-ctx_encoder-single-nq-base")

    original_processor = TextSimilarityProcessor(
        query_tokenizer=question_tokenizer,
        passage_tokenizer=context_tokenizer,
        max_seq_len_query=256,
        max_seq_len_passage=256,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        data_dir="data/retriever",
        train_filename="nq-train.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1,
        shuffle_negatives=False,
    )

    # Persist the processor, then featurize the sample with the in-memory instance
    target_dir = f"{tmp_path}/testsave/dpr_processor"
    original_processor.save(save_dir=target_dir)
    original_dataset, _, _ = original_processor.dataset_from_dicts(dicts=[sample], return_baskets=False)

    # Restore the processor from disk and featurize the very same sample again
    restored_processor = TextSimilarityProcessor.load_from_dir(load_dir=target_dir)
    restored_dataset, _, _ = restored_processor.dataset_from_dicts(dicts=[sample], return_baskets=False)

    assert np.array_equal(original_dataset.tensors[0], restored_dataset.tensors[0])


def _build_dpr_test_model(query_model, passage_model, device, **tokenizer_kwargs):
    """
    Load tokenizers and encoders for a query/passage pair and wire them into a
    TextSimilarityProcessor and a BiAdaptiveModel.

    :param query_model: Model hub name or local path handed to get_tokenizer/get_language_model
                        for the query side.
    :param passage_model: Model hub name or local path for the passage side.
    :param device: torch.device passed to BiAdaptiveModel.
    :param tokenizer_kwargs: Extra keyword arguments forwarded unchanged to get_tokenizer
                             (e.g. ``use_fast=True``).
    :return: Tuple ``(query_tokenizer, passage_tokenizer, processor, model)``.
    """
    # tokenizer class is inferred automatically
    query_tokenizer = get_tokenizer(pretrained_model_name_or_path=query_model, **tokenizer_kwargs)
    query_encoder = get_language_model(pretrained_model_name_or_path=query_model)
    passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=passage_model, **tokenizer_kwargs)
    passage_encoder = get_language_model(pretrained_model_name_or_path=passage_model)

    processor = TextSimilarityProcessor(
        query_tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_passage=256,
        max_seq_len_query=256,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        embed_title=True,
        num_hard_negatives=0,
        num_positives=1,
    )
    prediction_head = TextSimilarityHead(similarity_function="dot_product")

    model = BiAdaptiveModel(
        language_model1=query_encoder,
        language_model2=passage_encoder,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )
    model.connect_heads_with_processor(processor.tasks, require_labels=False)
    return query_tokenizer, passage_tokenizer, processor, model


def _save_dpr_test_model(model, query_tokenizer, passage_tokenizer, save_dir, query_encoder_dir, passage_encoder_dir):
    """
    Save the model and both tokenizers below ``save_dir``.

    :param model: BiAdaptiveModel to save.
    :param query_tokenizer: Tokenizer saved into ``save_dir/query_encoder_dir``.
    :param passage_tokenizer: Tokenizer saved into ``save_dir/passage_encoder_dir``.
    :param save_dir: Target directory as a string.
    :param query_encoder_dir: Sub-directory name used for the query encoder.
    :param passage_encoder_dir: Sub-directory name used for the passage encoder.
    """
    model.save(Path(save_dir), lm1_name=query_encoder_dir, lm2_name=passage_encoder_dir)
    query_tokenizer.save_pretrained(f"{save_dir}/{query_encoder_dir}")
    passage_tokenizer.save_pretrained(f"{save_dir}/{passage_encoder_dir}")


def _embed_with_dpr_test_model(model, processor, dicts, device):
    """
    Featurize ``dicts`` with ``processor`` and run them through ``model`` without gradients.

    :param model: BiAdaptiveModel used for the forward pass; it is switched to eval mode here.
    :param processor: TextSimilarityProcessor that converts ``dicts`` into a dataset.
    :param dicts: List of query/passages dictionaries.
    :param device: torch.device the batch tensors are moved to.
    :return: Dict with the keys "query" and "passages". Each value is the concatenated numpy
             array of the collected embeddings, or an empty list if none were produced.
    """
    dataset, tensor_names, _, _ = processor.dataset_from_dicts(
        dicts=dicts, indices=list(range(len(dicts))), return_baskets=True
    )
    data_loader = NamedDataLoader(
        dataset=dataset, sampler=SequentialSampler(dataset), batch_size=16, tensor_names=tensor_names
    )
    embeddings = {"query": [], "passages": []}
    # eval mode so that dropout cannot make the outputs non-deterministic
    model.eval()

    for batch in tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True):
        batch = {key: batch[key].to(device) for key in batch}

        # get logits
        with torch.no_grad():
            query_embeddings, passage_embeddings = model.forward(
                query_input_ids=batch.get("query_input_ids", None),
                query_segment_ids=batch.get("query_segment_ids", None),
                query_attention_mask=batch.get("query_attention_mask", None),
                passage_input_ids=batch.get("passage_input_ids", None),
                passage_segment_ids=batch.get("passage_segment_ids", None),
                passage_attention_mask=batch.get("passage_attention_mask", None),
            )[0]
            if query_embeddings is not None:
                embeddings["query"].append(query_embeddings.cpu().numpy())
            if passage_embeddings is not None:
                embeddings["passages"].append(passage_embeddings.cpu().numpy())

    # np.concatenate raises on an empty list, so empty results stay an empty list
    for name in ("passages", "query"):
        if embeddings[name]:
            embeddings[name] = np.concatenate(embeddings[name])
    return embeddings


@pytest.mark.parametrize(
    "query_and_passage_model",
    [
        {
            "query": "etalab-ia/dpr-question_encoder-fr_qa-camembert",
            "passage": "etalab-ia/dpr-ctx_encoder-fr_qa-camembert",
        },
        {
            "query": "deepset/gbert-base-germandpr-question_encoder",
            "passage": "deepset/gbert-base-germandpr-ctx_encoder",
        },
        {"query": "facebook/dpr-question_encoder-single-nq-base", "passage": "facebook/dpr-ctx_encoder-single-nq-base"},
    ],
)
def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_passage_model: dict):
    """
    This test compares 1) a model that was loaded from model hub with
    2) a model from model hub that was saved to disk and then loaded from disk and
    3) a model in FARM style that was saved to disk and then loaded from disk

    :param tmp_path: pytest fixture, base directory for the saved models.
    :param query_and_passage_model: Dict with the keys "query" and "passage" holding the
                                    model names of the two encoders.
    """

    d = {
        "query": "Comment s'appelle le portail open data du gouvernement?",
        "passages": [
            {
                "title": "Etalab",
                "text": "Etalab est une administration publique française qui fait notamment office "
                "de Chief Data Officer de l'État et coordonne la conception et la mise en œuvre "
                "de sa stratégie dans le domaine de la donnée (ouverture et partage des données "
                "publiques ou open data, exploitation des données et intelligence artificielle...). "
                "Ainsi, Etalab développe et maintient le portail des données ouvertes du gouvernement "
                "français data.gouv.fr. Etalab promeut également une plus grande ouverture "
                "l'administration sur la société (gouvernement ouvert) : transparence de l'action "
                "publique, innovation ouverte, participation citoyenne... elle promeut l’innovation, "
                "l’expérimentation, les méthodes de travail ouvertes, agiles et itératives, ainsi que "
                "les synergies avec la société civile pour décloisonner l’administration et favoriser "
                "l’adoption des meilleures pratiques professionnelles dans le domaine du numérique. "
                "À ce titre elle étudie notamment l’opportunité de recourir à des technologies en voie "
                "de maturation issues du monde de la recherche. Cette entité chargée de l'innovation "
                "au sein de l'administration doit contribuer à l'amélioration du service public grâce "
                "au numérique. Elle est rattachée à la Direction interministérielle du numérique, dont "
                "les missions et l’organisation ont été fixées par le décret du 30 octobre 2019.  Dirigé "
                "par Laure Lucchesi depuis 2016, elle rassemble une équipe pluridisciplinaire d'une "
                "trentaine de personnes.",
                "label": "positive",
                "external_id": "1",
            }
        ],
    }
    dicts = [d]

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    save_dir = f"{tmp_path}/testsave/dpr_model"
    query_encoder_dir = "query_encoder"
    passage_encoder_dir = "passage_encoder"
    saved_query_path = Path(save_dir) / query_encoder_dir
    saved_passage_path = Path(save_dir) / passage_encoder_dir

    # 1) load model from model hub
    query_tokenizer, passage_tokenizer, processor, model = _build_dpr_test_model(
        query_and_passage_model["query"], query_and_passage_model["passage"], device
    )

    # save model that was loaded from model hub to disk
    _save_dpr_test_model(model, query_tokenizer, passage_tokenizer, save_dir, query_encoder_dir, passage_encoder_dir)

    # 2) load model from disk
    loaded_query_tokenizer, loaded_passage_tokenizer, loaded_processor, loaded_model = _build_dpr_test_model(
        saved_query_path, saved_passage_path, device, use_fast=True
    )

    # compare model loaded from model hub with model loaded from disk
    dataset, _, _ = processor.dataset_from_dicts(dicts=dicts, return_baskets=False)
    dataset2, _, _ = loaded_processor.dataset_from_dicts(dicts=dicts, return_baskets=False)
    assert np.array_equal(dataset.tensors[0], dataset2.tensors[0])

    # compare embeddings of model loaded from model hub and model loaded from disk
    all_embeddings = _embed_with_dpr_test_model(model, processor, dicts, device)
    all_embeddings2 = _embed_with_dpr_test_model(loaded_model, loaded_processor, dicts, device)
    assert np.array_equal(all_embeddings["query"][0], all_embeddings2["query"][0])

    # save the model that was loaded from disk to disk (overwrites the first checkpoint)
    _save_dpr_test_model(
        loaded_model, loaded_query_tokenizer, loaded_passage_tokenizer, save_dir, query_encoder_dir, passage_encoder_dir
    )

    # 3) load model from disk again
    _, _, resaved_processor, resaved_model = _build_dpr_test_model(saved_query_path, saved_passage_path, device)

    # compare a model loaded from disk that originated from the model hub and was then saved disk with
    # a model loaded from disk that also originated from a FARM style model that was saved to disk
    dataset3, _, _ = resaved_processor.dataset_from_dicts(dicts=dicts, return_baskets=False)
    dataset2, _, _ = loaded_processor.dataset_from_dicts(dicts=dicts, return_baskets=False)
    assert np.array_equal(dataset3.tensors[0], dataset2.tensors[0])

    # generate embeddings with model loaded from disk that originated from a FARM style model that was saved to
    # disk earlier. This has to run the re-loaded model and processor: running loaded_model again would
    # only repeat the comparison of stage 2.
    all_embeddings3 = _embed_with_dpr_test_model(resaved_model, resaved_processor, dicts, device)

    # compare embeddings of model loaded from model hub and model loaded from disk that originated from a FARM style
    # model that was saved to disk earlier
    assert np.array_equal(all_embeddings["query"][0], all_embeddings3["query"][0])


# TODO: fix CI errors (the test passes locally and on AWS; next step: isolate PyTorch versions once the FARM dependency is removed)
# def test_dpr_training():
#     batch_size = 1
#     n_epochs = 1
#     distributed = False  # enable for multi GPU training via DDP
#     evaluate_every = 1
#     question_lang_model = "microsoft/MiniLM-L12-H384-uncased"
#     passage_lang_model = "microsoft/MiniLM-L12-H384-uncased"
#     do_lower_case = True
#     use_fast = True
#     similarity_function = "dot_product"
#
#
#
#     device, n_gpu = initialize_device_settings(use_cuda=False)
#
#     query_tokenizer = get_tokenizer(pretrained_model_name_or_path=question_lang_model,
#                                      do_lower_case=do_lower_case, use_fast=use_fast)
#     passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=passage_lang_model,
#                                        do_lower_case=do_lower_case, use_fast=use_fast)
#     label_list = ["hard_negative", "positive"]
#
#     processor = TextSimilarityProcessor(query_tokenizer=query_tokenizer,
#                                         passage_tokenizer=passage_tokenizer,
#                                         max_seq_len_query=10,
#                                         max_seq_len_passage=10,
#                                         label_list=label_list,
#                                         metric="text_similarity_metric",
#                                         data_dir="samples/dpr/",
#                                         train_filename="sample.json",
#                                         dev_filename="sample.json",
#                                         test_filename=None,
#                                         embed_title=True,
#                                         num_hard_negatives=1,
#                                         dev_split=0,
#                                         max_samples=2)
#
#     data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
#
#     question_language_model = get_language_model(pretrained_model_name_or_path=question_lang_model,
#                                                  language_model_class="DPRQuestionEncoder")
#     passage_language_model = get_language_model(pretrained_model_name_or_path=passage_lang_model,
#                                                 language_model_class="DPRContextEncoder")
#
#     prediction_head = TextSimilarityHead(similarity_function=similarity_function)
#
#     model = BiAdaptiveModel(
#         language_model1=question_language_model,
#         language_model2=passage_language_model,
#         prediction_heads=[prediction_head],
#         embeds_dropout_prob=0.1,
#         lm1_output_types=["per_sequence"],
#         lm2_output_types=["per_sequence"],
#         device=device,
#     )
#
#     model, optimizer, lr_schedule = initialize_optimizer(
#         model=model,
#         learning_rate=1e-5,
#         optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
#                         "eps": 1e-08},
#         schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
#         n_batches=len(data_silo.loaders["train"]),
#         n_epochs=n_epochs,
#         grad_acc_steps=1,
#         device=device,
#         distributed=distributed
#     )
#
#     trainer = Trainer(
#         model=model,
#         optimizer=optimizer,
#         data_silo=data_silo,
#         epochs=n_epochs,
#         n_gpu=n_gpu,
#         lr_schedule=lr_schedule,
#         evaluate_every=evaluate_every,
#         device=device,
#     )
#
#     trainer.train()
#
#     ######## save and load model again
#     save_dir = Path("testsave/dpr-model")
#     model.save(save_dir)
#     del model
#
#     model2 = BiAdaptiveModel.load(save_dir, device=device)
#     model2, optimizer2, lr_schedule = initialize_optimizer(
#         model=model2,
#         learning_rate=1e-5,
#         optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
#                         "eps": 1e-08},
#         schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
#         n_batches=len(data_silo.loaders["train"]),
#         n_epochs=n_epochs,
#         grad_acc_steps=1,
#         device=device,
#         distributed=distributed
#     )
#     trainer2 = Trainer(
#         model=model2,
#         optimizer=optimizer,
#         data_silo=data_silo,
#         epochs=n_epochs,
#         n_gpu=n_gpu,
#         lr_schedule=lr_schedule,
#         evaluate_every=evaluate_every,
#         device=device,
#     )
#
#     trainer2.train()
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								from typing import Tuple
 								import os
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								import logging
 								from pathlib import Path
-												Fix tests and adjust folder structure

* Add type annotations in QuestionAnsweringHead

* Fix test by increasing max_seq_len

* Add SampleBasket type annotation

* Remove prediction head param from adaptive model init

* Add type ignore for AdaptiveModel init

* Fix and rename tests

* Adjust folder structure

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
											
										
										
											2021-09-13 18:38:14 +02:00
+								import numpy as np
 								import pytest
 								import torch
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								from torch.utils.data import SequentialSampler
 								from tqdm import tqdm
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								from transformers import DPRQuestionEncoder
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
-												Fix tests and adjust folder structure

* Add type annotations in QuestionAnsweringHead

* Fix test by increasing max_seq_len

* Add SampleBasket type annotation

* Remove prediction head param from adaptive model init

* Add type ignore for AdaptiveModel init

* Fix and rename tests

* Adjust folder structure

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
											
										
										
											2021-09-13 18:38:14 +02:00
+								from haystack.modeling.data_handler.dataloader import NamedDataLoader
 								from haystack.modeling.data_handler.processor import TextSimilarityProcessor
 								from haystack.modeling.model.biadaptive_model import BiAdaptiveModel
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								from haystack.modeling.model.language_model import get_language_model, DPREncoder
-												Fix tests and adjust folder structure

* Add type annotations in QuestionAnsweringHead

* Fix test by increasing max_seq_len

* Add SampleBasket type annotation

* Remove prediction head param from adaptive model init

* Add type ignore for AdaptiveModel init

* Fix and rename tests

* Adjust folder structure

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
											
										
										
											2021-09-13 18:38:14 +02:00
+								from haystack.modeling.model.prediction_head import TextSimilarityHead
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								from haystack.modeling.model.tokenization import get_tokenizer
-												Fix tests and adjust folder structure

* Add type annotations in QuestionAnsweringHead

* Fix test by increasing max_seq_len

* Add SampleBasket type annotation

* Remove prediction head param from adaptive model init

* Add type ignore for AdaptiveModel init

* Fix and rename tests

* Adjust folder structure

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
											
										
										
											2021-09-13 18:38:14 +02:00
+								from haystack.modeling.utils import set_all_seeds, initialize_device_settings
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								def test_dpr_modules(caplog=None):
 								    if caplog:
 								        caplog.set_level(logging.CRITICAL)
 								    set_all_seeds(seed=42)
-												Standardize initialisation of device settings (#1683)

* Use initialize_device_settings in all nodes

* Set StreamHandler level to INFO

* Add latest docstring and tutorial changes

* work in progress

* Standardize device initialization

* Add latest docstring and tutorial changes

* Adapt device initialization in Reader's train method

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2021-11-09 12:44:20 +01:00
+								    devices, n_gpu = initialize_device_settings(use_cuda=True)
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								    # 1.Create question and passage tokenizers
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    query_tokenizer = get_tokenizer(
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								        pretrained_model_name_or_path="facebook/dpr-question_encoder-single-nq-base", do_lower_case=True, use_fast=True
 								    )
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    passage_tokenizer = get_tokenizer(
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								        pretrained_model_name_or_path="facebook/dpr-ctx_encoder-single-nq-base", do_lower_case=True, use_fast=True
 								    )
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								    processor = TextSimilarityProcessor(
 								        query_tokenizer=query_tokenizer,
 								        passage_tokenizer=passage_tokenizer,
 								        max_seq_len_query=256,
 								        max_seq_len_passage=256,
 								        label_list=["hard_negative", "positive"],
 								        metric="text_similarity_metric",
 								        data_dir="data/retriever",
 								        train_filename="nq-train.json",
 								        dev_filename="nq-dev.json",
 								        test_filename="nq-dev.json",
 								        embed_title=True,
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								        num_hard_negatives=1,
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    )
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    question_language_model = DPREncoder(
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								        pretrained_model_name_or_path="bert-base-uncased",
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								        model_type="DPRQuestionEncoder",
 								        model_kwargs={"hidden_dropout_prob": 0, "attention_probs_dropout_prob": 0},
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    )
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    passage_language_model = DPREncoder(
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								        pretrained_model_name_or_path="bert-base-uncased",
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								        model_type="DPRContextEncoder",
 								        model_kwargs={"hidden_dropout_prob": 0, "attention_probs_dropout_prob": 0},
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    )
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								    prediction_head = TextSimilarityHead(similarity_function="dot_product")
 								    model = BiAdaptiveModel(
 								        language_model1=question_language_model,
 								        language_model2=passage_language_model,
 								        prediction_heads=[prediction_head],
 								        embeds_dropout_prob=0.0,
 								        lm1_output_types=["per_sequence"],
 								        lm2_output_types=["per_sequence"],
-												Standardize initialisation of device settings (#1683)

* Use initialize_device_settings in all nodes

* Set StreamHandler level to INFO

* Add latest docstring and tutorial changes

* work in progress

* Standardize device initialization

* Add latest docstring and tutorial changes

* Adapt device initialization in Reader's train method

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2021-11-09 12:44:20 +01:00
+								        device=devices[0],
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    )
 								    model.connect_heads_with_processor(processor.tasks)
 								    assert type(model) == BiAdaptiveModel
 								    assert type(processor) == TextSimilarityProcessor
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    assert type(question_language_model) == DPREncoder
 								    assert type(passage_language_model) == DPREncoder
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								    # check embedding layer weights
 								    assert list(model.named_parameters())[0][1][0, 0].item() - -0.010200000368058681 < 0.0001
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    d = {
 								        "query": "big little lies season 2 how many episodes",
 								        "passages": [
 								            {
 								                "title": "Big Little Lies (TV series)",
 								                "text": "series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley",
 								                "label": "positive",
 								                "external_id": "18768923",
 								            },
 								            {
 								                "title": "Little People, Big World",
 								                "text": 'final minutes of the season two-A finale, "Farm Overload". A crowd had gathered around Jacob, who was lying on the ground near the trebuchet. The first two episodes of season two-B focus on the accident, and how the local media reacted to it. The first season of "Little People, Big World" generated solid ratings for TLC (especially in the important 18–49 demographic), leading to the show\'s renewal for a second season. Critical reviews of the series have been generally positive, citing the show\'s positive portrayal of little people. Conversely, other reviews have claimed that the show has a voyeuristic bend',
 								                "label": "hard_negative",
 								                "external_id": "7459116",
 								            },
 								            {
 								                "title": "Cormac McCarthy",
 								                "text": 'chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from "The New York Times", "McCarthy doesn\'t drink anymore – he quit 16 years ago in El Paso, with one of his young',
 								                "label": "negative",
 								                "passage_id": "2145653",
 								            },
 								        ],
 								    }
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								    dataset, tensor_names, _ = processor.dataset_from_dicts(dicts=[d], return_baskets=False)
-												Standardize initialisation of device settings (#1683)

* Use initialize_device_settings in all nodes

* Set StreamHandler level to INFO

* Add latest docstring and tutorial changes

* work in progress

* Standardize device initialization

* Add latest docstring and tutorial changes

* Adapt device initialization in Reader's train method

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2021-11-09 12:44:20 +01:00
+								    features = {key: val.unsqueeze(0).to(devices[0]) for key, val in zip(tensor_names, dataset[0])}
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								    # test features
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    assert torch.all(
 								        torch.eq(
 								            features["query_input_ids"][0][:10].cpu(),
 								            torch.tensor([101, 2502, 2210, 3658, 2161, 1016, 2129, 2116, 4178, 102]),
 								        )
 								    )
 								    assert torch.all(
 								        torch.eq(
 								            features["passage_input_ids"][0][0][:10].cpu(),
 								            torch.tensor([101, 2502, 2210, 3658, 1006, 2694, 2186, 1007, 102, 2186]),
 								        )
 								    )
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    assert len(features["query_segment_ids"][0].nonzero()) == 0
 								    assert len(features["passage_segment_ids"][0].nonzero()) == 0
 								    assert torch.all(torch.eq(features["query_attention_mask"].nonzero()[:, 1].cpu(), torch.tensor(list(range(10)))))
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    assert torch.all(
 								        torch.eq(features["passage_attention_mask"][0][0].nonzero().cpu().squeeze(), torch.tensor(list(range(127))))
 								    )
 								    assert torch.all(
 								        torch.eq(features["passage_attention_mask"][0][1].nonzero().cpu().squeeze(), torch.tensor(list(range(143))))
 								    )
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    features_query = {key.replace("query_", ""): value for key, value in features.items() if key.startswith("query_")}
 								    features_passage = {
 								        key.replace("passage_", ""): value for key, value in features.items() if key.startswith("passage_")
 								    }
 								    max_seq_len = features_passage.get("input_ids").shape[-1]
 								    features_passage = {key: value.view(-1, max_seq_len) for key, value in features_passage.items()}
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    # test model encodings
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    query_vector = model.language_model1(**features_query)[0]
 								    passage_vector = model.language_model2(**features_passage)[0]
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    assert torch.all(
 								        torch.le(
 								            query_vector[0, :10].cpu()
 								            - torch.tensor([-0.2135, -0.4748, 0.0501, -0.0430, -0.1747, -0.0441, 0.5638, 0.1405, 0.2285, 0.0893]),
 								            torch.ones((1, 10)) * 0.0001,
 								        )
 								    )
 								    assert torch.all(
 								        torch.le(
 								            passage_vector[0, :10].cpu()
 								            - torch.tensor([0.0557, -0.6836, -0.3645, -0.5566, 0.2034, -0.3656, 0.2969, -0.0555, 0.3405, -0.8691]),
 								            torch.ones((1, 10)) * 0.0001,
 								        )
 								    )
 								    assert torch.all(
 								        torch.le(
 								            passage_vector[1, :10].cpu()
 								            - torch.tensor([-0.2006, -1.5002, -0.1897, -0.3421, -0.0405, -0.0471, -0.0306, 0.1156, 0.3350, -0.3412]),
 								            torch.ones((1, 10)) * 0.0001,
 								        )
 								    )
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								    # test logits and loss
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    embeddings = model(
 								        query_input_ids=features.get("query_input_ids", None),
 								        query_segment_ids=features.get("query_segment_ids", None),
 								        query_attention_mask=features.get("query_attention_mask", None),
 								        passage_input_ids=features.get("passage_input_ids", None),
 								        passage_segment_ids=features.get("passage_segment_ids", None),
 								        passage_attention_mask=features.get("passage_attention_mask", None),
 								    )
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    query_emb, passage_emb = embeddings[0]
 								    assert torch.all(torch.eq(query_emb.cpu(), query_vector.cpu()))
 								    assert torch.all(torch.eq(passage_emb.cpu(), passage_vector.cpu()))
 								    loss = model.logits_to_loss_per_head(embeddings, **features)
 								    similarity_scores = model.prediction_heads[0]._embeddings_to_scores(query_emb, passage_emb).cpu()
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    assert torch.all(
 								        torch.le(similarity_scores - torch.tensor([[-1.8311e-03, -6.3016e00]]), torch.ones((1, 2)) * 0.0001)
 								    )
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    assert (loss[0].item() - 0.0018) <= 0.0001
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								query_input_ids = [
 								    # Three 1-D integer tensors of 10 ids each; the last one ends in four zeros.
 								    # NOTE(review): presumably the leading token ids expected for three test queries - confirm in test_dpr_processor.
 								    torch.tensor([101, 2073, 2003, 3317, 2006, 1996, 2940, 2241, 2006, 102]),
 								    torch.tensor([101, 2043, 2106, 1996, 2548, 2155, 11092, 1996, 2171, 10064]),
 								    torch.tensor([101, 2054, 2003, 1037, 4937, 102, 0, 0, 0, 0]),
 								]
 								# Each entry is a column of consecutive indices 0..k-1 with shape (k, 1), for k = 10, 11 and 6.
 								# NOTE(review): presumably the expected nonzero() positions of each query's attention mask - confirm.
 								query_attention_mask = [
 								    torch.tensor(range(10)).unsqueeze(-1),
 								    torch.tensor(range(11)).unsqueeze(-1),
 								    torch.tensor(range(6)).unsqueeze(-1),
 								]
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								passage_ids = {
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    "titled": [
 								        # Variant paired with embed_title=True by the parametrize decorator on test_dpr_processor.
 								        # Three integer tensors of shape (3, 10): one tensor per sample, one row of 10 ids per passage.
 								        # NOTE(review): rows of the form [101, 102, 102, 0, ...] look like placeholder/padding passages - confirm.
 								        torch.tensor(
 								            [
 								                [101, 3317, 2006, 1996, 2940, 102, 3317, 2006, 1996, 2940],
 								                [101, 3317, 2940, 1010, 2047, 2148, 3575, 102, 8765, 2061],
 								                [101, 3317, 2940, 1010, 27492, 102, 3419, 18874, 3385, 1010],
 								            ]
 								        ),
 								        torch.tensor(
 								            [
 								                [101, 2160, 1997, 10064, 102, 2160, 1997, 10064, 1996, 2160],
 								                [101, 26902, 1010, 11017, 1997, 10387, 102, 2384, 1010, 1998],
 								                [101, 102, 102, 0, 0, 0, 0, 0, 0, 0],
 								            ]
 								        ),
 								        torch.tensor(
 								            [
 								                [101, 2516, 2007, 1000, 2569, 3494, 1000, 102, 2023, 2003],
 								                [101, 102, 102, 0, 0, 0, 0, 0, 0, 0],
 								                [101, 102, 102, 0, 0, 0, 0, 0, 0, 0],
 								            ]
 								        ),
 								    ],
 								    # Variant paired with embed_title=False by the same parametrize decorator; same (3, 10) layout.
 								    "untitled": [
 								        torch.tensor(
 								            [
 								                [101, 3317, 2006, 1996, 2940, 1000, 3317, 2006, 1996, 2940],
 								                [101, 8765, 2061, 2004, 2000, 5438, 1037, 8084, 10527, 5701],
 								                [101, 3419, 18874, 3385, 1010, 3818, 1000, 1000, 2152, 2006],
 								            ]
 								        ),
 								        torch.tensor(
 								            [
 								                [101, 2160, 1997, 10064, 1996, 2160, 1997, 10064, 2003, 1996],
 								                [101, 2384, 1010, 1998, 2001, 2000, 2202, 2173, 1999, 1037],
 								                [101, 102, 102, 0, 0, 0, 0, 0, 0, 0],
 								            ]
 								        ),
 								        torch.tensor(
 								            [
 								                [101, 2023, 2003, 1037, 1026, 7308, 1028, 6251, 1012, 8870],
 								                [101, 102, 102, 0, 0, 0, 0, 0, 0, 0],
 								                [101, 102, 102, 0, 0, 0, 0, 0, 0, 0],
 								            ]
 								        ),
 								    ],
 								}
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								passage_attention = {
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    "titled": [
 								        # Variant paired with embed_title=True by the parametrize decorator on test_dpr_processor.
 								        # Three groups of three tensors; each tensor is a column of indices 0..k-1 with shape (k, 1).
 								        # NOTE(review): presumably the expected nonzero() positions of each passage's attention mask - confirm.
 								        [
 								            torch.tensor(range(140)).unsqueeze(-1),
 								            torch.tensor(range(130)).unsqueeze(-1),
 								            torch.tensor(range(127)).unsqueeze(-1),
 								        ],
 								        [
 								            torch.tensor(range(132)).unsqueeze(-1),
 								            torch.tensor(range(121)).unsqueeze(-1),
 								            torch.tensor(range(3)).unsqueeze(-1),
 								        ],
 								        [
 								            torch.tensor(range(22)).unsqueeze(-1),
 								            torch.tensor(range(3)).unsqueeze(-1),
 								            torch.tensor(range(3)).unsqueeze(-1),
 								        ],
 								    ],
 								    # Variant paired with embed_title=False by the same parametrize decorator; same nesting.
 								    "untitled": [
 								        [
 								            torch.tensor(range(135)).unsqueeze(-1),
 								            torch.tensor(range(123)).unsqueeze(-1),
 								            torch.tensor(range(122)).unsqueeze(-1),
 								        ],
 								        [
 								            torch.tensor(range(128)).unsqueeze(-1),
 								            torch.tensor(range(115)).unsqueeze(-1),
 								            torch.tensor(range(3)).unsqueeze(-1),
 								        ],
 								        [
 								            torch.tensor(range(15)).unsqueeze(-1),
 								            torch.tensor(range(3)).unsqueeze(-1),
 								            torch.tensor(range(3)).unsqueeze(-1),
 								        ],
 								    ],
 								}
 								# Label rows for the three samples; labels1 is parametrized with num_hard_negatives=1.
 								labels1 = [[1, 0], [1, 0], [1, 0]]
 								# Same, parametrized with num_hard_negatives=2 (one extra 0 per row).
 								labels2 = [[1, 0, 0], [1, 0, 0], [1, 0, 0]]
 								@pytest.mark.parametrize(
 								    "embed_title, passage_ids, passage_attns",
 								    [
 								        (True, passage_ids["titled"], passage_attention["titled"]),
 								        (False, passage_ids["untitled"], passage_attention["untitled"]),
 								    ],
 								)
 								@pytest.mark.parametrize("use_fast", [True, False])
 								@pytest.mark.parametrize("num_hard_negatives, labels", [(1, labels1), (2, labels2)])
 								def test_dpr_processor(embed_title, passage_ids, passage_attns, use_fast, num_hard_negatives, labels):
 								    """Compare TextSimilarityProcessor features with the module-level expected tensors.
 								
 								    Three hand-written samples are converted one at a time, and for each the test checks
 								    the first ten query token ids, the query attention mask, the first ten token ids of
 								    the positive passage, the attention mask of every kept passage, the label ids and
 								    that all segment ids are zero.
 								
 								    :param embed_title: forwarded to the processor; selects the "titled" or "untitled" expectations.
 								    :param passage_ids: expected passage token ids, indexed by sample and then by passage.
 								    :param passage_attns: expected non-zero attention positions, indexed by sample and then by passage.
 								    :param use_fast: forwarded to ``get_tokenizer`` for both tokenizers.
 								    :param num_hard_negatives: forwarded to the processor; also bounds how many passages are checked.
 								    :param labels: expected label ids per sample.
 								    """
 								    # Named ``samples`` rather than ``dict`` so the builtin is not shadowed.
 								    samples = [
 								        {
 								            "query": "where is castle on the hill based on",
 								            "answers": ["Framlingham Castle"],
 								            "passages": [
 								                {
 								                    "text": 'Castle on the Hill "Castle on the Hill" is a song by English singer-songwriter Ed Sheeran. It was released as a digital download on 6 January 2017 as one of the double lead singles from his third studio album "÷" (2017), along with "Shape of You". "Castle on the Hill" was written and produced by Ed Sheeran and Benny Blanco. The song refers to Framlingham Castle in Sheeran\'s home town. Released on the same day as "Shape of You", "Castle on the Hill" reached number two in a number of countries, including the UK, Australia and Germany, while "Shape of',
 								                    "title": "Castle on the Hill",
 								                    "label": "positive",
 								                    "external_id": "19930582",
 								                },
 								                {
 								                    "text": 'crops so as to feed a struggling infant colony. Governor King began Government Farm 3 there on 8 July 1801, referring to it as "Castle Hill" on 1 March 1802. The majority of the convicts who worked the prison farm were Irish Catholics, many having been transported for seditious activity in 1798. The most notorious incident being the Battle of Vinegar Hill where around 39 were slaughtered. They were branded "politicals" and exiled for life, never to return. The first free settler in Castle Hill, a Frenchman Baron Verincourt de Clambe, in unusual circumstances received a grant of 200 acres',
 								                    "title": "Castle Hill, New South Wales",
 								                    "label": "hard_negative",
 								                    "external_id": "1977568",
 								                },
 								                {
 								                    "text": 'Tom Gleeson, proposed ""high on the peak of Castle Hill, overlooking the harbour"" would be a suitable location for the monument. Having arrived in Townsville, the monument was then placed in storage for a number of years. It was not until October 1947 that the Council discussed where to place the monument. A number of locations were considered: Castle Hill, the Botanic Gardens, in front of the Queens Hotel, the Anzac Memorial Park and the Railway Oval, but Castle Hill was ultimately the council\'s choice. In February 1948, the Queensland Government gave its approval to the council to place the',
 								                    "title": "Castle Hill, Townsville",
 								                    "label": "hard_negative",
 								                    "external_id": "3643705",
 								                },
 								            ],
 								        },
 								        {
 								            "query": "when did the royal family adopt the name windsor",
 								            "answers": ["in 1917"],
 								            "passages": [
 								                {
 								                    "text": 'House of Windsor The House of Windsor is the reigning royal house of the United Kingdom and the other Commonwealth realms. The dynasty is of German paternal descent and was originally a branch of the House of Saxe-Coburg and Gotha, itself derived from the House of Wettin, which succeeded the House of Hanover to the British monarchy following the death of Queen Victoria, wife of Albert, Prince Consort. The name was changed from "Saxe-Coburg and Gotha" to the English "Windsor" (from "Windsor Castle") in 1917 because of anti-German sentiment in the British Empire during World War I. There have been',
 								                    "title": "House of Windsor",
 								                    "label": "positive",
 								                    "external_id": "1478954",
 								                },
 								                {
 								                    "text": "2005, and was to take place in a civil ceremony at Windsor Castle, with a subsequent religious service of blessing at St George's Chapel. However, to conduct a civil marriage at Windsor Castle would oblige the venue to obtain a licence for civil marriages, which it did not have. A condition of such a licence is that the licensed venue must be available for a period of one year to anyone wishing to be married there, and as the royal family did not wish to make Windsor Castle available to the public for civil marriages, even just for one year,",
 								                    "title": "Camilla, Duchess of Cornwall",
 								                    "label": "hard_negative",
 								                    "external_id": "1399730",
 								                },
 								            ],
 								        },
 								        {
 								            "query": "what is a cat?",
 								            "answers": ["animal", "feline"],
 								            "passages": [
 								                {
 								                    "text": "This is a <mask> sentence. Cats are good pets.",
 								                    "title": 'title with "special characters" ',
 								                    "label": "positive",
 								                    "external_id": "0",
 								                },
 								                {
 								                    "text": "2nd text => More text about cats is good",
 								                    "title": "2nd title \n",
 								                    "label": "positive",
 								                    "external_id": "1",
 								                },
 								            ],
 								        },
 								    ]
 								    query_tok = "facebook/dpr-question_encoder-single-nq-base"
 								    query_tokenizer = get_tokenizer(query_tok, use_fast=use_fast)
 								    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
 								    passage_tokenizer = get_tokenizer(passage_tok, use_fast=use_fast)
 								    processor = TextSimilarityProcessor(
 								        query_tokenizer=query_tokenizer,
 								        passage_tokenizer=passage_tokenizer,
 								        max_seq_len_query=256,
 								        max_seq_len_passage=256,
 								        data_dir="data/retriever",
 								        train_filename="nq-train.json",
 								        test_filename="nq-dev.json",
 								        embed_title=embed_title,
 								        num_hard_negatives=num_hard_negatives,
 								        label_list=["hard_negative", "positive"],
 								        metric="text_similarity_metric",
 								        shuffle_negatives=False,
 								    )
 								    # Each sample is converted on its own so that index ``i`` lines up with the
 								    # per-sample expectations (query_input_ids, query_attention_mask, passage_ids, ...).
 								    for i, sample in enumerate(samples):
 								        # Only the baskets are inspected; the other return values are not needed here.
 								        _, _, _, baskets = processor.dataset_from_dicts(dicts=[sample], return_baskets=True)
 								        feat = baskets[0].samples[0].features
 								        assert torch.all(torch.eq(torch.tensor(feat[0]["query_input_ids"][:10]), query_input_ids[i]))
 								        assert len(torch.tensor(feat[0]["query_segment_ids"]).nonzero()) == 0
 								        assert torch.all(torch.eq(torch.tensor(feat[0]["query_attention_mask"]).nonzero(), query_attention_mask[i]))
 								        # ``.item()`` requires exactly one passage labelled 1 in the produced features.
 								        positive_index = np.where(np.array(feat[0]["label_ids"]) == 1)[0].item()
 								        assert torch.all(
 								            torch.eq(
 								                torch.tensor(feat[0]["passage_input_ids"])[positive_index, :10], passage_ids[i][positive_index]
 								            )
 								        )
 								        for j in range(num_hard_negatives + 1):
 								            assert torch.all(
 								                torch.eq(torch.tensor(feat[0]["passage_attention_mask"][j]).nonzero(), passage_attns[i][j])
 								            )
 								        assert torch.all(
 								            torch.eq(torch.tensor(feat[0]["label_ids"]), torch.tensor(labels[i])[: num_hard_negatives + 1])
 								        )
 								        assert len(torch.tensor(feat[0]["passage_segment_ids"]).nonzero()) == 0
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								@pytest.mark.parametrize("use_fast", [False])
 								@pytest.mark.parametrize("embed_title", [True, False])
 								def test_dpr_processor_empty_title(use_fast, embed_title):
 								    """Smoke test: a passage whose title is an empty string is processed without raising.
 								
 								    The result of ``dataset_from_dicts`` is discarded and nothing is asserted.
 								
 								    :param use_fast: forwarded to ``get_tokenizer`` for both tokenizers.
 								    :param embed_title: forwarded to the processor.
 								    """
 								    # Named ``sample`` rather than ``dict`` so the builtin is not shadowed.
 								    sample = {
 								        "query": "what is a cat?",
 								        "passages": [
 								            {
 								                "title": "",
 								                "text": "Director Radio Iași); Dragoș-Liviu Vîlceanu; Mihnea-Adrian Vîlceanu; Nathalie-Teona",
 								                "label": "positive",
 								                "external_id": "b21eaeff-e08b-4548-b5e0-a280f6f4efef",
 								            }
 								        ],
 								    }
 								    query_tok = "facebook/dpr-question_encoder-single-nq-base"
 								    query_tokenizer = get_tokenizer(query_tok, use_fast=use_fast)
 								    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
 								    passage_tokenizer = get_tokenizer(passage_tok, use_fast=use_fast)
 								    processor = TextSimilarityProcessor(
 								        query_tokenizer=query_tokenizer,
 								        passage_tokenizer=passage_tokenizer,
 								        max_seq_len_query=256,
 								        max_seq_len_passage=256,
 								        data_dir="data/retriever",
 								        train_filename="nq-train.json",
 								        test_filename="nq-dev.json",
 								        embed_title=embed_title,
 								        num_hard_negatives=1,
 								        label_list=["hard_negative", "positive"],
 								        metric="text_similarity_metric",
 								        shuffle_negatives=False,
 								    )
 								    _ = processor.dataset_from_dicts(dicts=[sample])
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								def test_dpr_problematic():
 								    erroneous_dicts = [
 								        {
 								            "query": [1],
 								            "answers": ["Framlingham Castle"],
 								            "passages": [
 								                {
 								                    "text": 'Castle on the Hill "Castle on the Hill" is a song by English singer-songwriter Ed Sheeran. It was released as a digital download on 6 January 2017 as one of the double lead singles from his third studio album "÷" (2017), along with "Shape of You". "Castle on the Hill" was written and produced by Ed Sheeran and Benny Blanco. The song refers to Framlingham Castle in Sheeran\'s home town. Released on the same day as "Shape of You", "Castle on the Hill" reached number two in a number of countries, including the UK, Australia and Germany, while "Shape of',
 								                    "title": "Castle on the Hill",
 								                    "label": "positive",
 								                    "external_id": "19930582",
 								                },
 								                {
 								                    "text": 'crops so as to feed a struggling infant colony. Governor King began Government Farm 3 there on 8 July 1801, referring to it as "Castle Hill" on 1 March 1802. The majority of the convicts who worked the prison farm were Irish Catholics, many having been transported for seditious activity in 1798. The most notorious incident being the Battle of Vinegar Hill where around 39 were slaughtered. They were branded "politicals" and exiled for life, never to return. The first free settler in Castle Hill, a Frenchman Baron Verincourt de Clambe, in unusual circumstances received a grant of 200 acres',
 								                    "title": "Castle Hill, New South Wales",
 								                    "label": "hard_negative",
 								                    "external_id": "1977568",
 								                },
 								                {
 								                    "text": 'Tom Gleeson, proposed ""high on the peak of Castle Hill, overlooking the harbour"" would be a suitable location for the monument. Having arrived in Townsville, the monument was then placed in storage for a number of years. It was not until October 1947 that the Council discussed where to place the monument. A number of locations were considered: Castle Hill, the Botanic Gardens, in front of the Queens Hotel, the Anzac Memorial Park and the Railway Oval, but Castle Hill was ultimately the council\'s choice. In February 1948, the Queensland Government gave its approval to the council to place the',
 								                    "title": "Castle Hill, Townsville",
 								                    "label": "hard_negative",
 								                    "external_id": "3643705",
 								                },
 								            ],
 								        },
 								        {
 								            "query": "when did the royal family adopt the name windsor",
 								            "answers": ["in 1917"],
 								            "passages": [
 								                {
 								                    "text2": 'House of Windsor The House of Windsor is the reigning royal house of the United Kingdom and the other Commonwealth realms. The dynasty is of German paternal descent and was originally a branch of the House of Saxe-Coburg and Gotha, itself derived from the House of Wettin, which succeeded the House of Hanover to the British monarchy following the death of Queen Victoria, wife of Albert, Prince Consort. The name was changed from "Saxe-Coburg and Gotha" to the English "Windsor" (from "Windsor Castle") in 1917 because of anti-German sentiment in the British Empire during World War I. There have been',
 								                    "title": "House of Windsor",
 								                    "label": "positive",
 								                    "external_id": "1478954",
 								                },
 								                {
 								                    "text2": "2005, and was to take place in a civil ceremony at Windsor Castle, with a subsequent religious service of blessing at St George's Chapel. However, to conduct a civil marriage at Windsor Castle would oblige the venue to obtain a licence for civil marriages, which it did not have. A condition of such a licence is that the licensed venue must be available for a period of one year to anyone wishing to be married there, and as the royal family did not wish to make Windsor Castle available to the public for civil marriages, even just for one year,",
 								                    "title": "Camilla, Duchess of Cornwall",
 								                    "label": "hard_negative",
 								                    "external_id": "1399730",
 								                },
 								            ],
 								        },
 								        {
 								            "query": "what is a cat?",
 								            "answers": ["animal", "feline"],
 								            "passages": [
 								                {
 								                    "text": "This is a <mask> sentence. Cats are good pets.",
 								                    "title": 'title with "special characters" ',
 								                    "label": "positive",
 								                    "external_id": "0",
 								                },
 								                {
 								                    "text": "2nd text => More text about cats is good",
 								                    "title": "2nd title \n",
 								                    "label": "positive",
 								                    "external_id": "1",
 								                },
 								            ],
 								        },
 								    ]
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								    query_tok = "facebook/dpr-question_encoder-single-nq-base"
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    query_tokenizer = get_tokenizer(query_tok)
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    passage_tokenizer = get_tokenizer(passage_tok)
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    processor = TextSimilarityProcessor(
 								        query_tokenizer=query_tokenizer,
 								        passage_tokenizer=passage_tokenizer,
 								        max_seq_len_query=256,
 								        max_seq_len_passage=256,
 								        data_dir="data/retriever",
 								        train_filename="nq-train.json",
 								        test_filename="nq-dev.json",
 								        embed_title=True,
 								        num_hard_negatives=1,
 								        label_list=["hard_negative", "positive"],
 								        metric="text_similarity_metric",
 								        shuffle_negatives=False,
 								    )
 								    dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
 								        dicts=erroneous_dicts, return_baskets=True
 								    )
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    assert problematic_ids == {0, 1}
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								def test_dpr_query_only():
 								    erroneous_dicts = [
-												fix pip backtracking issue (#2281)

* fix pip backtracking issue

* restrict azure-core version

* Remove the trailing comma

* Add skip_magic_trailing_comma in pyproject.toml for pydoc compatibility

* Pin pydoc-markdown _again_

Co-authored-by: Sara Zan <sarazanzo94@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-07 19:25:33 +01:00
+								        {"query": "where is castle on the hill based on", "answers": ["Framlingham Castle"]},
 								        {"query": "where is castle on the hill 2 based on", "answers": ["Framlingham Castle 2"]},
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    ]
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								    query_tok = "facebook/dpr-question_encoder-single-nq-base"
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    query_tokenizer = get_tokenizer(query_tok)
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    passage_tokenizer = get_tokenizer(passage_tok)
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    processor = TextSimilarityProcessor(
 								        query_tokenizer=query_tokenizer,
 								        passage_tokenizer=passage_tokenizer,
 								        max_seq_len_query=256,
 								        max_seq_len_passage=256,
 								        data_dir="data/retriever",
 								        train_filename="nq-train.json",
 								        test_filename="nq-dev.json",
 								        embed_title=True,
 								        num_hard_negatives=1,
 								        label_list=["hard_negative", "positive"],
 								        metric="text_similarity_metric",
 								        shuffle_negatives=False,
 								    )
 								    dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
 								        dicts=erroneous_dicts, return_baskets=True
 								    )
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    assert len(problematic_ids) == 0
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    assert tensor_names == ["query_input_ids", "query_segment_ids", "query_attention_mask"]
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								def test_dpr_context_only():
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    erroneous_dicts = [
 								        {
 								            "passages": [
 								                {
 								                    "text": "House of Windsor 2 The House of Windsor is the reigning royal house of the United",
 								                    "title": "House of Windsor",
 								                    "label": "positive",
 								                    "external_id": "1478954",
 								                },
 								                {
 								                    "text": "2005, and was to take place in a civil ceremony at Windsor Castle, with a subsequent religious",
 								                    "title": "Camilla, Duchess of Cornwall",
 								                    "label": "hard_negative",
 								                    "external_id": "1399730",
 								                },
 								            ]
 								        },
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								        {
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								            "passages": [
 								                {
 								                    "text": "House of Windsor The House of Windsor is the reigning royal house of the",
 								                    "title": "House of Windsor",
 								                    "label": "positive",
 								                    "external_id": "1478954",
 								                },
 								                {
 								                    "text": "2005, and was to take place in a civil ceremony at Windsor Castle, with a subsequent",
 								                    "title": "Camilla, Duchess of Cornwall",
 								                    "label": "hard_negative",
 								                    "external_id": "1399730",
 								                },
 								            ]
 								        },
 								    ]
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								    query_tok = "facebook/dpr-question_encoder-single-nq-base"
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    query_tokenizer = get_tokenizer(query_tok)
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    passage_tokenizer = get_tokenizer(passage_tok)
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    processor = TextSimilarityProcessor(
 								        query_tokenizer=query_tokenizer,
 								        passage_tokenizer=passage_tokenizer,
 								        max_seq_len_query=256,
 								        max_seq_len_passage=256,
 								        data_dir="data/retriever",
 								        train_filename="nq-train.json",
 								        test_filename="nq-dev.json",
 								        embed_title=True,
 								        num_hard_negatives=1,
 								        label_list=["hard_negative", "positive"],
 								        metric="text_similarity_metric",
 								        shuffle_negatives=False,
 								    )
 								    dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
 								        dicts=erroneous_dicts, return_baskets=True
 								    )
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    assert len(problematic_ids) == 0
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    assert tensor_names == ["passage_input_ids", "passage_segment_ids", "passage_attention_mask", "label_ids"]
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
-												Pipeline's YAML: syntax validation (#2226)

* Add BasePipeline.validate_config, BasePipeline.validate_yaml, and some new custom exception classes

* Make error composition work properly

* Clarify typing

* Help mypy a bit more

* Update Documentation & Code Style

* Enable autogenerated docs for Milvus1 and 2 separately

* Revert "Enable autogenerated docs for Milvus1 and 2 separately"

This reverts commit 282be4a78a6e95862a9b4c924fc3dea5ca71e28d.

* Update Documentation & Code Style

* Re-enable 'additionalProperties: False'

* Add pipeline.type to JSON Schema, was somehow forgotten

* Disable additionalProperties on the pipeline properties too

* Fix json-schemas for 1.1.0 and 1.2.0 (should not do it again in the future)

* Call super in PipelineValidationError

* Improve _read_pipeline_config_from_yaml's error handling

* Fix generate_json_schema.py to include document stores

* Fix json schemas (retro-fix 1.1.0 again)

* Improve custom errors printing, add link to docs

* Add function in BaseComponent to list its subclasses in a module

* Make some document stores base classes abstract

* Add marker 'integration' in pytest flags

* Slightly improve validation of pipelines at load

* Adding tests for YAML loading and validation

* Make custom_query Optional for validation issues

* Fix bug in _read_pipeline_config_from_yaml

* Improve error handling in BasePipeline and Pipeline and add DAG check

* Move json schema generation into haystack/nodes/_json_schema.py (useful for tests)

* Simplify errors slightly

* Add some YAML validation tests

* Remove load_from_config from BasePipeline, it was never used anyway

* Improve tests

* Include json-schemas in package

* Fix conftest imports

* Make BasePipeline abstract

* Improve mocking by making the test independent from the YAML version

* Add exportable_to_yaml decorator to forget about set_config on mock nodes

* Fix mypy errors

* Comment out one monkeypatch

* Fix typing again

* Improve error message for validation

* Add required properties to pipelines

* Fix YAML version for REST API YAMLs to 1.2.0

* Fix load_from_yaml call in load_from_deepset_cloud

* fix HaystackError.__getattr__

* Add super().__init__()in most nodes and docstore, comment set_config

* Remove type from REST API pipelines

* Remove useless init from doc2answers

* Call super in Seq2SeqGenerator

* Typo in deepsetcloud.py

* Fix rest api indexing error mismatch and mock version of JSON schema in all tests

* Working on pipeline tests

* Improve errors printing slightly

* Add back test_pipeline.yaml

* _json_schema.py supports different versions with identical schemas

* Add type to 0.7 schema for backwards compatibility

* Fix small bug in _json_schema.py

* Try alternative to generate json schemas on the CI

* Update Documentation & Code Style

* Make linux CI match autoformat CI

* Fix super-init-not-called

* Accidentally committed file

* Update Documentation & Code Style

* fix test_summarizer_translation.py's import

* Mock YAML in a few suites, split and simplify test_pipeline_debug_and_validation.py::test_invalid_run_args

* Fix json schema for ray tests too

* Update Documentation & Code Style

* Reintroduce validation

* Use unstable version in tests and rest api

* Make unstable support the latest versions

* Update Documentation & Code Style

* Remove needless fixture

* Make type in pipeline optional in the strings validation

* Fix schemas

* Fix string validation for pipeline type

* Improve validate_config_strings

* Remove type from test pipelines

* Update Documentation & Code Style

* Fix test_pipeline

* Removing more type from pipelines

* Temporary CI patch

* Fix issue with exportable_to_yaml never invoking the wrapped init

* rm stray file

* pipeline tests are green again

* Linux CI now needs .[all] to generate the schema

* Bugfixes, pipeline tests seem to be green

* Typo in version after merge

* Implement missing methods in Weaviate

* Trying to avoid FAISS tests from running in the Milvus1 test suite

* Fix some stray test paths and faiss index dumping

* Fix pytest markers list

* Temporarily disable cache to be able to see tests failures

* Fix pyproject.toml syntax

* Use only tmp_path

* Fix preprocessor signature after merge

* Fix faiss bug

* Fix Ray test

* Fix documentation issue by removing quotes from faiss type

* Update Documentation & Code Style

* use document properly in preprocessor tests

* Update Documentation & Code Style

* make preprocessor capable of handling documents

* import document

* Revert support for documents in preprocessor, do later

* Fix bug in _json_schema.py that was breaking validation

* re-enable cache

* Update Documentation & Code Style

* Simplify calling _json_schema.py from the CI

* Remove redundant ABC inheritance

* Ensure exportable_to_yaml works only on implementations

* Rename subclass to class_ in Meta

* Make run() and get_config() abstract in BasePipeline

* Revert unintended change in preprocessor

* Move outgoing_edges_input_node check inside try block

* Rename VALID_CODE_GEN_INPUT_REGEX into VALID_INPUT_REGEX

* Add check for a RecursionError on validate_config_strings

* Address usages of _pipeline_config in data silo and elasticsearch

* Rename _pipeline_config into _init_parameters

* Fix pytest marker and remove unused imports

* Remove most redundant ABCs

* Rename _init_parameters into _component_configuration

* Remove set_config and type from _component_configuration's dict

* Remove last instances of set_config and replace with super().__init__()

* Implement __init_subclass__ approach

* Simplify checks on the existence of _component_configuration

* Fix faiss issue

* Dynamic generation of node schemas & weed out old schemas

* Add debatable test

* Add docstring to debatable test

* Positive diff between schemas implemented

* Improve diff printing

* Rename REST API YAML files to trigger IDE validation

* Fix typing issues

* Fix more typing

* Typo in YAML filename

* Remove needless type:ignore

* Add tests

* Fix tests & validation feedback for accessory classes in custom nodes

* Refactor RAGeneratorType out

* Fix broken import in conftest

* Improve source error handling

* Remove unused import in test_eval.py breaking tests

* Fix changed error message in tests matches too

* Normalize generate_openapi_specs.py and generate_json_schema.py in the actions

* Fix path to generate_openapi_specs.py in autoformat.yml

* Update Documentation & Code Style

* Add test for FAISSDocumentStore-like situations (superclass with init params)

* Update Documentation & Code Style

* Fix indentation

* Remove commented set_config

* Store model_name_or_path in FARMReader to use in DistillationDataSilo

* Rename _component_configuration into _component_config

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-15 11:17:26 +01:00
+								def test_dpr_processor_save_load(tmp_path):
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    d = {
 								        "query": "big little lies season 2 how many episodes ?",
 								        "passages": [
 								            {
 								                "title": "Big Little Lies (TV series)",
 								                "text": "series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley",
 								                "label": "positive",
 								                "external_id": "18768923",
 								            },
 								            {
 								                "title": "Little People, Big World",
 								                "text": 'final minutes of the season two-A finale, "Farm Overload". A crowd had gathered around Jacob, who was lying on the ground near the trebuchet. The first two episodes of season two-B focus on the accident, and how the local media reacted to it. The first season of "Little People, Big World" generated solid ratings for TLC (especially in the important 18–49 demographic), leading to the show\'s renewal for a second season. Critical reviews of the series have been generally positive, citing the show\'s positive portrayal of little people. Conversely, other reviews have claimed that the show has a voyeuristic bend',
 								                "label": "hard_negative",
 								                "external_id": "7459116",
 								            },
 								            {
 								                "title": "Cormac McCarthy",
 								                "text": 'chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from "The New York Times", "McCarthy doesn\'t drink anymore – he quit 16 years ago in El Paso, with one of his young',
 								                "label": "negative",
 								                "passage_id": "2145653",
 								            },
 								        ],
 								    }
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								    query_tok = "facebook/dpr-question_encoder-single-nq-base"
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    query_tokenizer = get_tokenizer(query_tok)
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    passage_tokenizer = get_tokenizer(passage_tok)
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    processor = TextSimilarityProcessor(
 								        query_tokenizer=query_tokenizer,
 								        passage_tokenizer=passage_tokenizer,
 								        max_seq_len_query=256,
 								        max_seq_len_passage=256,
 								        data_dir="data/retriever",
 								        train_filename="nq-train.json",
 								        test_filename="nq-dev.json",
 								        embed_title=True,
 								        num_hard_negatives=1,
 								        label_list=["hard_negative", "positive"],
 								        metric="text_similarity_metric",
 								        shuffle_negatives=False,
 								    )
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    save_dir = f"{tmp_path}/testsave/dpr_processor"
 								    processor.save(save_dir=save_dir)
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    dataset, tensor_names, _ = processor.dataset_from_dicts(dicts=[d], return_baskets=False)
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    loadedprocessor = TextSimilarityProcessor.load_from_dir(load_dir=save_dir)
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    dataset2, tensor_names, _ = loadedprocessor.dataset_from_dicts(dicts=[d], return_baskets=False)
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    assert np.array_equal(dataset.tensors[0], dataset2.tensors[0])
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
 								@pytest.mark.parametrize(
 								    "query_and_passage_model",
 								    [
 								        {
 								            "query": "etalab-ia/dpr-question_encoder-fr_qa-camembert",
 								            "passage": "etalab-ia/dpr-ctx_encoder-fr_qa-camembert",
 								        },
 								        {
 								            "query": "deepset/gbert-base-germandpr-question_encoder",
 								            "passage": "deepset/gbert-base-germandpr-ctx_encoder",
 								        },
 								        {"query": "facebook/dpr-question_encoder-single-nq-base", "passage": "facebook/dpr-ctx_encoder-single-nq-base"},
 								    ],
 								)
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_passage_model: Tuple[str, str]):
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    """
 								    This test compares 1) a model that was loaded from model hub with
 								    2) a model from model hub that was saved to disk and then loaded from disk and
 								    3) a model in FARM style that was saved to disk and then loaded from disk
 								    """
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    d = {
 								        "query": "Comment s'appelle le portail open data du gouvernement?",
 								        "passages": [
 								            {
 								                "title": "Etalab",
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								                "text": "Etalab est une administration publique française qui fait notamment office "
 								                "de Chief Data Officer de l'État et coordonne la conception et la mise en œuvre "
 								                "de sa stratégie dans le domaine de la donnée (ouverture et partage des données "
 								                "publiques ou open data, exploitation des données et intelligence artificielle...). "
 								                "Ainsi, Etalab développe et maintient le portail des données ouvertes du gouvernement "
 								                "français data.gouv.fr. Etalab promeut également une plus grande ouverture "
 								                "l'administration sur la société (gouvernement ouvert) : transparence de l'action "
 								                "publique, innovation ouverte, participation citoyenne... elle promeut l’innovation, "
 								                "l’expérimentation, les méthodes de travail ouvertes, agiles et itératives, ainsi que "
 								                "les synergies avec la société civile pour décloisonner l’administration et favoriser "
 								                "l’adoption des meilleures pratiques professionnelles dans le domaine du numérique. "
 								                "À ce titre elle étudie notamment l’opportunité de recourir à des technologies en voie "
 								                "de maturation issues du monde de la recherche. Cette entité chargée de l'innovation "
 								                "au sein de l'administration doit contribuer à l'amélioration du service public grâce "
 								                "au numérique. Elle est rattachée à la Direction interministérielle du numérique, dont "
 								                "les missions et l’organisation ont été fixées par le décret du 30 octobre 2019.  Dirigé "
 								                "par Laure Lucchesi depuis 2016, elle rassemble une équipe pluridisciplinaire d'une "
 								                "trentaine de personnes.",
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								                "label": "positive",
 								                "external_id": "1",
-												fix pip backtracking issue (#2281)

* fix pip backtracking issue

* restrict azure-core version

* Remove the trailing comma

* Add skip_magic_trailing_comma in pyproject.toml for pydoc compatibility

* Pin pydoc-markdown _again_

Co-authored-by: Sara Zan <sarazanzo94@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-07 19:25:33 +01:00
+								            }
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								        ],
 								    }
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
 								    # load model from model hub
 								    query_embedding_model = query_and_passage_model["query"]
 								    passage_embedding_model = query_and_passage_model["passage"]
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    query_tokenizer = get_tokenizer(
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								        pretrained_model_name_or_path=query_embedding_model
 								    )  # tokenizer class is inferred automatically
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    query_encoder = get_language_model(pretrained_model_name_or_path=query_embedding_model)
 								    passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=passage_embedding_model)
 								    passage_encoder = get_language_model(pretrained_model_name_or_path=passage_embedding_model)
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
 								    processor = TextSimilarityProcessor(
 								        query_tokenizer=query_tokenizer,
 								        passage_tokenizer=passage_tokenizer,
 								        max_seq_len_passage=256,
 								        max_seq_len_query=256,
 								        label_list=["hard_negative", "positive"],
 								        metric="text_similarity_metric",
 								        embed_title=True,
 								        num_hard_negatives=0,
 								        num_positives=1,
 								    )
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    prediction_head = TextSimilarityHead(similarity_function="dot_product")
 								    if torch.cuda.is_available():
 								        device = torch.device("cuda")
 								    else:
 								        device = torch.device("cpu")
 								    model = BiAdaptiveModel(
 								        language_model1=query_encoder,
 								        language_model2=passage_encoder,
 								        prediction_heads=[prediction_head],
 								        embeds_dropout_prob=0.1,
 								        lm1_output_types=["per_sequence"],
 								        lm2_output_types=["per_sequence"],
 								        device=device,
 								    )
 								    model.connect_heads_with_processor(processor.tasks, require_labels=False)
 								    # save model that was loaded from model hub to disk
-												Pipeline's YAML: syntax validation (#2226)

* Add BasePipeline.validate_config, BasePipeline.validate_yaml, and some new custom exception classes

* Make error composition work properly

* Clarify typing

* Help mypy a bit more

* Update Documentation & Code Style

* Enable autogenerated docs for Milvus1 and 2 separately

* Revert "Enable autogenerated docs for Milvus1 and 2 separately"

This reverts commit 282be4a78a6e95862a9b4c924fc3dea5ca71e28d.

* Update Documentation & Code Style

* Re-enable 'additionalProperties: False'

* Add pipeline.type to JSON Schema, was somehow forgotten

* Disable additionalProperties on the pipeline properties too

* Fix json-schemas for 1.1.0 and 1.2.0 (should not do it again in the future)

* Call super in PipelineValidationError

* Improve _read_pipeline_config_from_yaml's error handling

* Fix generate_json_schema.py to include document stores

* Fix json schemas (retro-fix 1.1.0 again)

* Improve custom errors printing, add link to docs

* Add function in BaseComponent to list its subclasses in a module

* Make some document stores base classes abstract

* Add marker 'integration' in pytest flags

* Slightly improve validation of pipelines at load

* Adding tests for YAML loading and validation

* Make custom_query Optional for validation issues

* Fix bug in _read_pipeline_config_from_yaml

* Improve error handling in BasePipeline and Pipeline and add DAG check

* Move json schema generation into haystack/nodes/_json_schema.py (useful for tests)

* Simplify errors slightly

* Add some YAML validation tests

* Remove load_from_config from BasePipeline, it was never used anyway

* Improve tests

* Include json-schemas in package

* Fix conftest imports

* Make BasePipeline abstract

* Improve mocking by making the test independent from the YAML version

* Add exportable_to_yaml decorator to forget about set_config on mock nodes

* Fix mypy errors

* Comment out one monkeypatch

* Fix typing again

* Improve error message for validation

* Add required properties to pipelines

* Fix YAML version for REST API YAMLs to 1.2.0

* Fix load_from_yaml call in load_from_deepset_cloud

* fix HaystackError.__getattr__

* Add super().__init__()in most nodes and docstore, comment set_config

* Remove type from REST API pipelines

* Remove useless init from doc2answers

* Call super in Seq2SeqGenerator

* Typo in deepsetcloud.py

* Fix rest api indexing error mismatch and mock version of JSON schema in all tests

* Working on pipeline tests

* Improve errors printing slightly

* Add back test_pipeline.yaml

* _json_schema.py supports different versions with identical schemas

* Add type to 0.7 schema for backwards compatibility

* Fix small bug in _json_schema.py

* Try alternative to generate json schemas on the CI

* Update Documentation & Code Style

* Make linux CI match autoformat CI

* Fix super-init-not-called

* Accidentally committed file

* Update Documentation & Code Style

* fix test_summarizer_translation.py's import

* Mock YAML in a few suites, split and simplify test_pipeline_debug_and_validation.py::test_invalid_run_args

* Fix json schema for ray tests too

* Update Documentation & Code Style

* Reintroduce validation

* Use unstable version in tests and rest api

* Make unstable support the latest versions

* Update Documentation & Code Style

* Remove needless fixture

* Make type in pipeline optional in the strings validation

* Fix schemas

* Fix string validation for pipeline type

* Improve validate_config_strings

* Remove type from test pipelines

* Update Documentation & Code Style

* Fix test_pipeline

* Removing more type from pipelines

* Temporary CI patch

* Fix issue with exportable_to_yaml never invoking the wrapped init

* rm stray file

* pipeline tests are green again

* Linux CI now needs .[all] to generate the schema

* Bugfixes, pipeline tests seem to be green

* Typo in version after merge

* Implement missing methods in Weaviate

* Trying to avoid FAISS tests from running in the Milvus1 test suite

* Fix some stray test paths and faiss index dumping

* Fix pytest markers list

* Temporarily disable cache to be able to see tests failures

* Fix pyproject.toml syntax

* Use only tmp_path

* Fix preprocessor signature after merge

* Fix faiss bug

* Fix Ray test

* Fix documentation issue by removing quotes from faiss type

* Update Documentation & Code Style

* use document properly in preprocessor tests

* Update Documentation & Code Style

* make preprocessor capable of handling documents

* import document

* Revert support for documents in preprocessor, do later

* Fix bug in _json_schema.py that was breaking validation

* re-enable cache

* Update Documentation & Code Style

* Simplify calling _json_schema.py from the CI

* Remove redundant ABC inheritance

* Ensure exportable_to_yaml works only on implementations

* Rename subclass to class_ in Meta

* Make run() and get_config() abstract in BasePipeline

* Revert unintended change in preprocessor

* Move outgoing_edges_input_node check inside try block

* Rename VALID_CODE_GEN_INPUT_REGEX into VALID_INPUT_REGEX

* Add check for a RecursionError on validate_config_strings

* Address usages of _pipeline_config in data silo and elasticsearch

* Rename _pipeline_config into _init_parameters

* Fix pytest marker and remove unused imports

* Remove most redundant ABCs

* Rename _init_parameters into _component_configuration

* Remove set_config and type from _component_configuration's dict

* Remove last instances of set_config and replace with super().__init__()

* Implement __init_subclass__ approach

* Simplify checks on the existence of _component_configuration

* Fix faiss issue

* Dynamic generation of node schemas & weed out old schemas

* Add debatable test

* Add docstring to debatable test

* Positive diff between schemas implemented

* Improve diff printing

* Rename REST API YAML files to trigger IDE validation

* Fix typing issues

* Fix more typing

* Typo in YAML filename

* Remove needless type:ignore

* Add tests

* Fix tests & validation feedback for accessory classes in custom nodes

* Refactor RAGeneratorType out

* Fix broken import in conftest

* Improve source error handling

* Remove unused import in test_eval.py breaking tests

* Fix changed error message in tests matches too

* Normalize generate_openapi_specs.py and generate_json_schema.py in the actions

* Fix path to generate_openapi_specs.py in autoformat.yml

* Update Documentation & Code Style

* Add test for FAISSDocumentStore-like situations (superclass with init params)

* Update Documentation & Code Style

* Fix indentation

* Remove commented set_config

* Store model_name_or_path in FARMReader to use in DistillationDataSilo

* Rename _component_configuration into _component_config

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-15 11:17:26 +01:00
+								    save_dir = f"{tmp_path}/testsave/dpr_model"
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    query_encoder_dir = "query_encoder"
 								    passage_encoder_dir = "passage_encoder"
 								    model.save(Path(save_dir), lm1_name=query_encoder_dir, lm2_name=passage_encoder_dir)
 								    query_tokenizer.save_pretrained(save_dir + f"/{query_encoder_dir}")
 								    passage_tokenizer.save_pretrained(save_dir + f"/{passage_encoder_dir}")
 								    # load model from disk
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    loaded_query_tokenizer = get_tokenizer(
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								        pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, use_fast=True
 								    )  # tokenizer class is inferred automatically
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    loaded_query_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir)
 								    loaded_passage_tokenizer = get_tokenizer(
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								        pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, use_fast=True
 								    )
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    loaded_passage_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir)
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
 								    loaded_processor = TextSimilarityProcessor(
 								        query_tokenizer=loaded_query_tokenizer,
 								        passage_tokenizer=loaded_passage_tokenizer,
 								        max_seq_len_passage=256,
 								        max_seq_len_query=256,
 								        label_list=["hard_negative", "positive"],
 								        metric="text_similarity_metric",
 								        embed_title=True,
 								        num_hard_negatives=0,
 								        num_positives=1,
 								    )
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    loaded_prediction_head = TextSimilarityHead(similarity_function="dot_product")
 								    if torch.cuda.is_available():
 								        device = torch.device("cuda")
 								    else:
 								        device = torch.device("cpu")
 								    loaded_model = BiAdaptiveModel(
 								        language_model1=loaded_query_encoder,
 								        language_model2=loaded_passage_encoder,
 								        prediction_heads=[loaded_prediction_head],
 								        embeds_dropout_prob=0.1,
 								        lm1_output_types=["per_sequence"],
 								        lm2_output_types=["per_sequence"],
 								        device=device,
 								    )
 								    loaded_model.connect_heads_with_processor(loaded_processor.tasks, require_labels=False)
 								    # compare model loaded from model hub with model loaded from disk
 								    dataset, tensor_names, _ = processor.dataset_from_dicts(dicts=[d], return_baskets=False)
 								    dataset2, tensor_names2, _ = loaded_processor.dataset_from_dicts(dicts=[d], return_baskets=False)
 								    assert np.array_equal(dataset.tensors[0], dataset2.tensors[0])
 								    # generate embeddings with model loaded from model hub
 								    dataset, tensor_names, _, baskets = processor.dataset_from_dicts(
 								        dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
 								    )
 								    data_loader = NamedDataLoader(
 								        dataset=dataset, sampler=SequentialSampler(dataset), batch_size=16, tensor_names=tensor_names
 								    )
 								    all_embeddings = {"query": [], "passages": []}
 								    model.eval()
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    for batch in tqdm(data_loader, desc=f"Creating Embeddings", unit=" Batches", disable=True):
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								        batch = {key: batch[key].to(device) for key in batch}
 								        # get logits
 								        with torch.no_grad():
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								            query_embeddings, passage_embeddings = model.forward(
 								                query_input_ids=batch.get("query_input_ids", None),
 								                query_segment_ids=batch.get("query_segment_ids", None),
 								                query_attention_mask=batch.get("query_attention_mask", None),
 								                passage_input_ids=batch.get("passage_input_ids", None),
 								                passage_segment_ids=batch.get("passage_segment_ids", None),
 								                passage_attention_mask=batch.get("passage_attention_mask", None),
 								            )[0]
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								            if query_embeddings is not None:
 								                all_embeddings["query"].append(query_embeddings.cpu().numpy())
 								            if passage_embeddings is not None:
 								                all_embeddings["passages"].append(passage_embeddings.cpu().numpy())
 								    if all_embeddings["passages"]:
 								        all_embeddings["passages"] = np.concatenate(all_embeddings["passages"])
 								    if all_embeddings["query"]:
 								        all_embeddings["query"] = np.concatenate(all_embeddings["query"])
 								    # generate embeddings with model loaded from disk
 								    dataset2, tensor_names2, _, baskets2 = loaded_processor.dataset_from_dicts(
 								        dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
 								    )
 								    data_loader = NamedDataLoader(
 								        dataset=dataset2, sampler=SequentialSampler(dataset2), batch_size=16, tensor_names=tensor_names2
 								    )
 								    all_embeddings2 = {"query": [], "passages": []}
 								    loaded_model.eval()
 								    for i, batch in enumerate(tqdm(data_loader, desc=f"Creating Embeddings", unit=" Batches", disable=True)):
 								        batch = {key: batch[key].to(device) for key in batch}
 								        # get logits
 								        with torch.no_grad():
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								            query_embeddings, passage_embeddings = loaded_model.forward(
 								                query_input_ids=batch.get("query_input_ids", None),
 								                query_segment_ids=batch.get("query_segment_ids", None),
 								                query_attention_mask=batch.get("query_attention_mask", None),
 								                passage_input_ids=batch.get("passage_input_ids", None),
 								                passage_segment_ids=batch.get("passage_segment_ids", None),
 								                passage_attention_mask=batch.get("passage_attention_mask", None),
 								            )[0]
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								            if query_embeddings is not None:
 								                all_embeddings2["query"].append(query_embeddings.cpu().numpy())
 								            if passage_embeddings is not None:
 								                all_embeddings2["passages"].append(passage_embeddings.cpu().numpy())
 								    if all_embeddings2["passages"]:
 								        all_embeddings2["passages"] = np.concatenate(all_embeddings2["passages"])
 								    if all_embeddings2["query"]:
 								        all_embeddings2["query"] = np.concatenate(all_embeddings2["query"])
 								    # compare embeddings of model loaded from model hub and model loaded from disk
 								    assert np.array_equal(all_embeddings["query"][0], all_embeddings2["query"][0])
 								    # save the model that was loaded from disk to disk
-												Pipeline's YAML: syntax validation (#2226)

* Add BasePipeline.validate_config, BasePipeline.validate_yaml, and some new custom exception classes

* Make error composition work properly

* Clarify typing

* Help mypy a bit more

* Update Documentation & Code Style

* Enable autogenerated docs for Milvus1 and 2 separately

* Revert "Enable autogenerated docs for Milvus1 and 2 separately"

This reverts commit 282be4a78a6e95862a9b4c924fc3dea5ca71e28d.

* Update Documentation & Code Style

* Re-enable 'additionalProperties: False'

* Add pipeline.type to JSON Schema, was somehow forgotten

* Disable additionalProperties on the pipeline properties too

* Fix json-schemas for 1.1.0 and 1.2.0 (should not do it again in the future)

* Call super in PipelineValidationError

* Improve _read_pipeline_config_from_yaml's error handling

* Fix generate_json_schema.py to include document stores

* Fix json schemas (retro-fix 1.1.0 again)

* Improve custom errors printing, add link to docs

* Add function in BaseComponent to list its subclasses in a module

* Make some document stores base classes abstract

* Add marker 'integration' in pytest flags

* Slightly improve validation of pipelines at load

* Adding tests for YAML loading and validation

* Make custom_query Optional for validation issues

* Fix bug in _read_pipeline_config_from_yaml

* Improve error handling in BasePipeline and Pipeline and add DAG check

* Move json schema generation into haystack/nodes/_json_schema.py (useful for tests)

* Simplify errors slightly

* Add some YAML validation tests

* Remove load_from_config from BasePipeline, it was never used anyway

* Improve tests

* Include json-schemas in package

* Fix conftest imports

* Make BasePipeline abstract

* Improve mocking by making the test independent from the YAML version

* Add exportable_to_yaml decorator to forget about set_config on mock nodes

* Fix mypy errors

* Comment out one monkeypatch

* Fix typing again

* Improve error message for validation

* Add required properties to pipelines

* Fix YAML version for REST API YAMLs to 1.2.0

* Fix load_from_yaml call in load_from_deepset_cloud

* fix HaystackError.__getattr__

* Add super().__init__()in most nodes and docstore, comment set_config

* Remove type from REST API pipelines

* Remove useless init from doc2answers

* Call super in Seq2SeqGenerator

* Typo in deepsetcloud.py

* Fix rest api indexing error mismatch and mock version of JSON schema in all tests

* Working on pipeline tests

* Improve errors printing slightly

* Add back test_pipeline.yaml

* _json_schema.py supports different versions with identical schemas

* Add type to 0.7 schema for backwards compatibility

* Fix small bug in _json_schema.py

* Try alternative to generate json schemas on the CI

* Update Documentation & Code Style

* Make linux CI match autoformat CI

* Fix super-init-not-called

* Accidentally committed file

* Update Documentation & Code Style

* fix test_summarizer_translation.py's import

* Mock YAML in a few suites, split and simplify test_pipeline_debug_and_validation.py::test_invalid_run_args

* Fix json schema for ray tests too

* Update Documentation & Code Style

* Reintroduce validation

* Use unstable version in tests and rest api

* Make unstable support the latest versions

* Update Documentation & Code Style

* Remove needless fixture

* Make type in pipeline optional in the strings validation

* Fix schemas

* Fix string validation for pipeline type

* Improve validate_config_strings

* Remove type from test pipelines

* Update Documentation & Code Style

* Fix test_pipeline

* Removing more type from pipelines

* Temporary CI patch

* Fix issue with exportable_to_yaml never invoking the wrapped init

* rm stray file

* pipeline tests are green again

* Linux CI now needs .[all] to generate the schema

* Bugfixes, pipeline tests seem to be green

* Typo in version after merge

* Implement missing methods in Weaviate

* Trying to avoid FAISS tests from running in the Milvus1 test suite

* Fix some stray test paths and faiss index dumping

* Fix pytest markers list

* Temporarily disable cache to be able to see tests failures

* Fix pyproject.toml syntax

* Use only tmp_path

* Fix preprocessor signature after merge

* Fix faiss bug

* Fix Ray test

* Fix documentation issue by removing quotes from faiss type

* Update Documentation & Code Style

* use document properly in preprocessor tests

* Update Documentation & Code Style

* make preprocessor capable of handling documents

* import document

* Revert support for documents in preprocessor, do later

* Fix bug in _json_schema.py that was breaking validation

* re-enable cache

* Update Documentation & Code Style

* Simplify calling _json_schema.py from the CI

* Remove redundant ABC inheritance

* Ensure exportable_to_yaml works only on implementations

* Rename subclass to class_ in Meta

* Make run() and get_config() abstract in BasePipeline

* Revert unintended change in preprocessor

* Move outgoing_edges_input_node check inside try block

* Rename VALID_CODE_GEN_INPUT_REGEX into VALID_INPUT_REGEX

* Add check for a RecursionError on validate_config_strings

* Address usages of _pipeline_config in data silo and elasticsearch

* Rename _pipeline_config into _init_parameters

* Fix pytest marker and remove unused imports

* Remove most redundant ABCs

* Rename _init_parameters into _component_configuration

* Remove set_config and type from _component_configuration's dict

* Remove last instances of set_config and replace with super().__init__()

* Implement __init_subclass__ approach

* Simplify checks on the existence of _component_configuration

* Fix faiss issue

* Dynamic generation of node schemas & weed out old schemas

* Add debatable test

* Add docstring to debatable test

* Positive diff between schemas implemented

* Improve diff printing

* Rename REST API YAML files to trigger IDE validation

* Fix typing issues

* Fix more typing

* Typo in YAML filename

* Remove needless type:ignore

* Add tests

* Fix tests & validation feedback for accessory classes in custom nodes

* Refactor RAGeneratorType out

* Fix broken import in conftest

* Improve source error handling

* Remove unused import in test_eval.py breaking tests

* Fix changed error message in tests matches too

* Normalize generate_openapi_specs.py and generate_json_schema.py in the actions

* Fix path to generate_openapi_specs.py in autoformat.yml

* Update Documentation & Code Style

* Add test for FAISSDocumentStore-like situations (superclass with init params)

* Update Documentation & Code Style

* Fix indentation

* Remove commented set_config

* Store model_name_or_path in FARMReader to use in DistillationDataSilo

* Rename _component_configuration into _component_config

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-15 11:17:26 +01:00
+								    save_dir = f"{tmp_path}/testsave/dpr_model"
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    query_encoder_dir = "query_encoder"
 								    passage_encoder_dir = "passage_encoder"
 								    loaded_model.save(Path(save_dir), lm1_name=query_encoder_dir, lm2_name=passage_encoder_dir)
 								    loaded_query_tokenizer.save_pretrained(save_dir + f"/{query_encoder_dir}")
 								    loaded_passage_tokenizer.save_pretrained(save_dir + f"/{passage_encoder_dir}")
 								    # load model from disk
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    query_tokenizer = get_tokenizer(
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								        pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir
 								    )  # tokenizer class is inferred automatically
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								    query_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir)
 								    passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir)
 								    passage_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir)
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
 								    processor = TextSimilarityProcessor(
 								        query_tokenizer=query_tokenizer,
 								        passage_tokenizer=passage_tokenizer,
 								        max_seq_len_passage=256,
 								        max_seq_len_query=256,
 								        label_list=["hard_negative", "positive"],
 								        metric="text_similarity_metric",
 								        embed_title=True,
 								        num_hard_negatives=0,
 								        num_positives=1,
 								    )
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								    prediction_head = TextSimilarityHead(similarity_function="dot_product")
 								    if torch.cuda.is_available():
 								        device = torch.device("cuda")
 								    else:
 								        device = torch.device("cpu")
 								    model = BiAdaptiveModel(
 								        language_model1=query_encoder,
 								        language_model2=passage_encoder,
 								        prediction_heads=[prediction_head],
 								        embeds_dropout_prob=0.1,
 								        lm1_output_types=["per_sequence"],
 								        lm2_output_types=["per_sequence"],
 								        device=device,
 								    )
 								    model.connect_heads_with_processor(processor.tasks, require_labels=False)
 								    # compare a model loaded from disk that originated from the model hub and was then saved to disk with
 								    # a model loaded from disk that also originated from a FARM style model that was saved to disk
 								    dataset3, tensor_names3, _ = processor.dataset_from_dicts(dicts=[d], return_baskets=False)
 								    dataset2, tensor_names2, _ = loaded_processor.dataset_from_dicts(dicts=[d], return_baskets=False)
 								    assert np.array_equal(dataset3.tensors[0], dataset2.tensors[0])
 								    # generate embeddings with model loaded from disk that originated from a FARM style model that was saved to disk earlier
 								    dataset3, tensor_names3, _, baskets3 = loaded_processor.dataset_from_dicts(
 								        dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
 								    )
 								    data_loader = NamedDataLoader(
 								        dataset=dataset3, sampler=SequentialSampler(dataset3), batch_size=16, tensor_names=tensor_names3
 								    )
 								    all_embeddings3 = {"query": [], "passages": []}
 								    loaded_model.eval()
 								    for i, batch in enumerate(tqdm(data_loader, desc=f"Creating Embeddings", unit=" Batches", disable=True)):
 								        batch = {key: batch[key].to(device) for key in batch}
 								        # get logits
 								        with torch.no_grad():
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								            query_embeddings, passage_embeddings = loaded_model.forward(
 								                query_input_ids=batch.get("query_input_ids", None),
 								                query_segment_ids=batch.get("query_segment_ids", None),
 								                query_attention_mask=batch.get("query_attention_mask", None),
 								                passage_input_ids=batch.get("passage_input_ids", None),
 								                passage_segment_ids=batch.get("passage_segment_ids", None),
 								                passage_attention_mask=batch.get("passage_attention_mask", None),
 								            )[0]
-												Add testdata, add tests for qa processor, add dpr tests (some failing)


											
										
										
											2021-09-08 12:02:08 +02:00
+								            if query_embeddings is not None:
 								                all_embeddings3["query"].append(query_embeddings.cpu().numpy())
 								            if passage_embeddings is not None:
 								                all_embeddings3["passages"].append(passage_embeddings.cpu().numpy())
 								    if all_embeddings3["passages"]:
 								        all_embeddings3["passages"] = np.concatenate(all_embeddings3["passages"])
 								    if all_embeddings3["query"]:
 								        all_embeddings3["query"] = np.concatenate(all_embeddings3["query"])
 								    # compare embeddings of model loaded from model hub and model loaded from disk that originated from a FARM style
 								    # model that was saved to disk earlier
 								    assert np.array_equal(all_embeddings["query"][0], all_embeddings3["query"][0])
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								# TODO fix CI errors (tests pass locally or on AWS, next steps: isolate PyTorch versions once FARM dependency is removed)
-												Fix tests and adjust folder structure

* Add type annotations in QuestionAnsweringHead

* Fix test by increasing max_seq_len

* Add SampleBasket type annotation

* Remove prediction head param from adaptive model init

* Add type ignore for AdaptiveModel init

* Fix and rename tests

* Adjust folder structure

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
											
										
										
											2021-09-13 18:38:14 +02:00
+								# def test_dpr_training():
 								#     batch_size = 1
 								#     n_epochs = 1
 								#     distributed = False  # enable for multi GPU training via DDP
 								#     evaluate_every = 1
 								#     question_lang_model = "microsoft/MiniLM-L12-H384-uncased"
 								#     passage_lang_model = "microsoft/MiniLM-L12-H384-uncased"
 								#     do_lower_case = True
 								#     use_fast = True
 								#     similarity_function = "dot_product"
 								#
 								#
 								#
 								#     device, n_gpu = initialize_device_settings(use_cuda=False)
 								#
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								#     query_tokenizer = get_tokenizer(pretrained_model_name_or_path=question_lang_model,
-												Fix tests and adjust folder structure

* Add type annotations in QuestionAnsweringHead

* Fix test by increasing max_seq_len

* Add SampleBasket type annotation

* Remove prediction head param from adaptive model init

* Add type ignore for AdaptiveModel init

* Fix and rename tests

* Adjust folder structure

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
											
										
										
											2021-09-13 18:38:14 +02:00
+								#                                      do_lower_case=do_lower_case, use_fast=use_fast)
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								#     passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=passage_lang_model,
-												Fix tests and adjust folder structure

* Add type annotations in QuestionAnsweringHead

* Fix test by increasing max_seq_len

* Add SampleBasket type annotation

* Remove prediction head param from adaptive model init

* Add type ignore for AdaptiveModel init

* Fix and rename tests

* Adjust folder structure

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
											
										
										
											2021-09-13 18:38:14 +02:00
+								#                                        do_lower_case=do_lower_case, use_fast=use_fast)
 								#     label_list = ["hard_negative", "positive"]
 								#
 								#     processor = TextSimilarityProcessor(query_tokenizer=query_tokenizer,
 								#                                         passage_tokenizer=passage_tokenizer,
 								#                                         max_seq_len_query=10,
 								#                                         max_seq_len_passage=10,
 								#                                         label_list=label_list,
 								#                                         metric="text_similarity_metric",
 								#                                         data_dir="samples/dpr/",
 								#                                         train_filename="sample.json",
 								#                                         dev_filename="sample.json",
 								#                                         test_filename=None,
 								#                                         embed_title=True,
 								#                                         num_hard_negatives=1,
 								#                                         dev_split=0,
 								#                                         max_samples=2)
 								#
 								#     data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
 								#
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								#     question_language_model = get_language_model(pretrained_model_name_or_path=question_lang_model,
-												Fix tests and adjust folder structure

* Add type annotations in QuestionAnsweringHead

* Fix test by increasing max_seq_len

* Add SampleBasket type annotation

* Remove prediction head param from adaptive model init

* Add type ignore for AdaptiveModel init

* Fix and rename tests

* Adjust folder structure

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
											
										
										
											2021-09-13 18:38:14 +02:00
+								#                                                  language_model_class="DPRQuestionEncoder")
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								#     passage_language_model = get_language_model(pretrained_model_name_or_path=passage_lang_model,
-												Fix tests and adjust folder structure

* Add type annotations in QuestionAnsweringHead

* Fix test by increasing max_seq_len

* Add SampleBasket type annotation

* Remove prediction head param from adaptive model init

* Add type ignore for AdaptiveModel init

* Fix and rename tests

* Adjust folder structure

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
											
										
										
											2021-09-13 18:38:14 +02:00
+								#                                                 language_model_class="DPRContextEncoder")
 								#
 								#     prediction_head = TextSimilarityHead(similarity_function=similarity_function)
 								#
 								#     model = BiAdaptiveModel(
 								#         language_model1=question_language_model,
 								#         language_model2=passage_language_model,
 								#         prediction_heads=[prediction_head],
 								#         embeds_dropout_prob=0.1,
 								#         lm1_output_types=["per_sequence"],
 								#         lm2_output_types=["per_sequence"],
 								#         device=device,
 								#     )
 								#
 								#     model, optimizer, lr_schedule = initialize_optimizer(
 								#         model=model,
 								#         learning_rate=1e-5,
 								#         optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
 								#                         "eps": 1e-08},
 								#         schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
 								#         n_batches=len(data_silo.loaders["train"]),
 								#         n_epochs=n_epochs,
 								#         grad_acc_steps=1,
 								#         device=device,
 								#         distributed=distributed
 								#     )
 								#
 								#     trainer = Trainer(
 								#         model=model,
 								#         optimizer=optimizer,
 								#         data_silo=data_silo,
 								#         epochs=n_epochs,
 								#         n_gpu=n_gpu,
 								#         lr_schedule=lr_schedule,
 								#         evaluate_every=evaluate_every,
 								#         device=device,
 								#     )
 								#
 								#     trainer.train()
 								#
 								#     ######## save and load model again
 								#     save_dir = Path("testsave/dpr-model")
 								#     model.save(save_dir)
 								#     del model
 								#
 								#     model2 = BiAdaptiveModel.load(save_dir, device=device)
 								#     model2, optimizer2, lr_schedule = initialize_optimizer(
 								#         model=model2,
 								#         learning_rate=1e-5,
 								#         optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
 								#                         "eps": 1e-08},
 								#         schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
 								#         n_batches=len(data_silo.loaders["train"]),
 								#         n_epochs=n_epochs,
 								#         grad_acc_steps=1,
 								#         device=device,
 								#         distributed=distributed
 								#     )
 								#     trainer2 = Trainer(
 								#         model=model2,
 								#         optimizer=optimizer,
 								#         data_silo=data_silo,
 								#         epochs=n_epochs,
 								#         n_gpu=n_gpu,
 								#         lr_schedule=lr_schedule,
 								#         evaluate_every=evaluate_every,
 								#         device=device,
 								#     )
 								#
 								#     trainer2.train()