# haystack/farm_haystack/reader/farm.py

from farm.infer import Inferencer
import numpy as np
from scipy.special import expit


class FARMReader:
    """
    Question Answering reader built on top of the FARM Inferencer.

    The class loads a saved FARM adaptive model from a given directory and runs
    inference through the Inferencer's `inference_from_dicts()` method.
    """

    # Raw answer scores are divided by this value before being passed through
    # the logistic function (`expit`) to derive each answer's `probability`.
    _SCORE_DIVISOR = 8

    def __init__(
        self,
        model_dir,
        context_size=30,
        no_answer_shift=-100,
        batch_size=16,
        use_gpu=True,
    ):
        """
        Load a saved FARM model in Inference mode.

        :param model_dir: directory path of the saved model
        :param context_size: value assigned to the `context_size` attribute of the
                             model's first prediction head (presumably the size of the
                             context window returned around an answer -- confirm
                             against the FARM documentation)
        :param no_answer_shift: value assigned to the `no_answer_shift` attribute of
                                the model's first prediction head (presumably an offset
                                applied to the "no answer" score -- confirm against the
                                FARM documentation)
        :param batch_size: batch size forwarded to `Inferencer.load()`
        :param use_gpu: forwarded to `Inferencer.load()` as its `gpu` argument
        """
        self.model = Inferencer.load(model_dir, batch_size=batch_size, gpu=use_gpu)
        # Only the first prediction head is configured here.
        qa_head = self.model.model.prediction_heads[0]
        qa_head.context_size = context_size
        qa_head.no_answer_shift = no_answer_shift

    def predict(self, input_dicts, top_k=None):
        """
        Run inference on the loaded model for the given input dicts.

        Every answer dict gets three additional keys: `offset_start` and
        `offset_end` (answer offsets relative to the start of its context) and
        `probability` (logistic function of the scaled `score`). Answers with an
        empty `answer` value are dropped.

        :param input_dicts: list of input dicts
        :param top_k: the maximum number of answers to return; `None` returns all
        :return: dict of the form `{"results": [resp]}`. `resp` is the first
                 prediction dict returned by the Inferencer (modified in place) with
                 its `answers` replaced by the merged answers of all predictions,
                 sorted by descending `probability` and cut to `top_k`. When the
                 Inferencer returns no predictions, `resp` is an empty list.
        """
        results = self.model.inference_from_dicts(
            dicts=input_dicts, rest_api_schema=True, use_multiprocessing=False
        )

        # The FARM Inferencer does not support multi-document QA as of now.
        # The QA inference is done for each text independently and the
        # answers are merged and sorted afterwards.
        all_predictions = [pred for res in results for pred in res["predictions"]]

        all_answers = []
        for pred in all_predictions:
            for ans in pred["answers"]:
                # Two sets of offset fields are returned by FARM -- context level and
                # document level. For the API, only context level offsets are relevant,
                # so both answer boundaries are expressed relative to the context start.
                context_start = ans["offset_context_start"]
                ans["offset_start"] = ans["offset_answer_start"] - context_start
                ans["offset_end"] = ans["offset_answer_end"] - context_start

                # Skip null answers (where an answer is not found in the text).
                if ans["answer"]:
                    all_answers.append(ans)

        scores = np.asarray([ans["score"] for ans in all_answers])
        probabilities = expit(scores / self._SCORE_DIVISOR)
        for ans, prob in zip(all_answers, probabilities):
            ans["probability"] = prob

        # Sort answers by their `probability`, best first.
        sorted_answers = sorted(
            all_answers, key=lambda k: k["probability"], reverse=True
        )

        # All predictions here are for the same question, so the metadata from
        # the first prediction in the list is taken.
        if all_predictions:
            resp = all_predictions[0]
            resp["answers"] = sorted_answers[:top_k]
        else:
            # Kept as an empty list for backward compatibility with existing callers.
            resp = []

        return {"results": [resp]}