FlagEmbedding/Tutorials/7_Fine-tuning/7.1.3_Eval_FT_Model.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Evaluate the Fine-tuned Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the previous sections, we prepared the dataset and fine-tuned the model. In this tutorial, we will go through how to evaluate the model with the test dataset we constructed."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 0. Installation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"% pip install -U datasets pytrec_eval FlagEmbedding"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Load Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We first load data from the files we processed."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"queries = load_dataset(\"json\", data_files=\"ft_data/test_queries.jsonl\")[\"train\"]\n",
"corpus = load_dataset(\"json\", data_files=\"ft_data/corpus.jsonl\")[\"train\"]\n",
"qrels = load_dataset(\"json\", data_files=\"ft_data/test_qrels.jsonl\")[\"train\"]\n",
"\n",
"queries_text = queries[\"text\"]\n",
"corpus_text = [text for sub in corpus[\"text\"] for text in sub]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"qrels_dict = {}\n",
"for line in qrels:\n",
" if line['qid'] not in qrels_dict:\n",
" qrels_dict[line['qid']] = {}\n",
" qrels_dict[line['qid']][line['docid']] = line['relevance']"
]
},
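{
"cell_type": "markdown",
"metadata": {},
"source": [
"The loading code assumes the format produced in the data-preparation tutorial: each line of `test_queries.jsonl` is an object with `id` and `text` fields, each line of `corpus.jsonl` has an `id` and a `text` field holding a list of passages (hence the flattening above), and each line of `test_qrels.jsonl` is a `qid`/`docid`/`relevance` triple. `qrels_dict` is thus a nested mapping from query id to document id to relevance label, the ground-truth format the evaluation utilities below expect."
]
},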
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Search"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Then we prepare a function to encode the text into embeddings and search the results:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import faiss\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"\n",
"\n",
"def search(model, queries_text, corpus_text):\n",
" \n",
" queries_embeddings = model.encode_queries(queries_text)\n",
" corpus_embeddings = model.encode_corpus(corpus_text)\n",
" \n",
" # create and store the embeddings in a Faiss index\n",
" dim = corpus_embeddings.shape[-1]\n",
" index = faiss.index_factory(dim, 'Flat', faiss.METRIC_INNER_PRODUCT)\n",
" corpus_embeddings = corpus_embeddings.astype(np.float32)\n",
" index.train(corpus_embeddings)\n",
" index.add(corpus_embeddings)\n",
" \n",
" query_size = len(queries_embeddings)\n",
"\n",
" all_scores = []\n",
" all_indices = []\n",
"\n",
" # search top 100 answers for all the queries\n",
" for i in tqdm(range(0, query_size, 32), desc=\"Searching\"):\n",
" j = min(i + 32, query_size)\n",
" query_embedding = queries_embeddings[i: j]\n",
" score, indice = index.search(query_embedding.astype(np.float32), k=100)\n",
" all_scores.append(score)\n",
" all_indices.append(indice)\n",
"\n",
" all_scores = np.concatenate(all_scores, axis=0)\n",
" all_indices = np.concatenate(all_indices, axis=0)\n",
" \n",
" # store the results into the format for evaluation\n",
" results = {}\n",
" for idx, (scores, indices) in enumerate(zip(all_scores, all_indices)):\n",
" results[queries[\"id\"][idx]] = {}\n",
" for score, index in zip(scores, indices):\n",
" if index != -1:\n",
" results[queries[\"id\"][idx]][corpus[\"id\"][index]] = float(score)\n",
" \n",
" return results"
]
},
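{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two details worth noting: we use `faiss.METRIC_INNER_PRODUCT` because `FlagModel` normalizes its embeddings by default, so the inner product of two embeddings equals their cosine similarity; and `index.train()` is a no-op for a `Flat` index, but keeping it means the code still works if you swap in a trainable index type such as IVF for a larger corpus."
]
},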
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from FlagEmbedding.abc.evaluation.utils import evaluate_metrics, evaluate_mrr\n",
"from FlagEmbedding import FlagModel\n",
"\n",
"k_values = [10,100]\n",
"\n",
"raw_name = \"BAAI/bge-large-en-v1.5\"\n",
"finetuned_path = \"test_encoder_only_base_bge-large-en-v1.5\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The result for the original model:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"pre tokenize: 100%|██████████| 3/3 [00:00<00:00, 129.75it/s]\n",
"You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
"Inference Embeddings: 100%|██████████| 3/3 [00:00<00:00, 11.08it/s]\n",
"pre tokenize: 100%|██████████| 28/28 [00:00<00:00, 164.29it/s]\n",
"Inference Embeddings: 100%|██████████| 28/28 [00:04<00:00, 6.09it/s]\n",
"Searching: 100%|██████████| 22/22 [00:08<00:00, 2.56it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"defaultdict(<class 'list'>, {'NDCG@10': 0.70405, 'NDCG@100': 0.73528})\n",
"defaultdict(<class 'list'>, {'MAP@10': 0.666, 'MAP@100': 0.67213})\n",
"defaultdict(<class 'list'>, {'Recall@10': 0.82286, 'Recall@100': 0.97286})\n",
"defaultdict(<class 'list'>, {'P@10': 0.08229, 'P@100': 0.00973})\n",
"defaultdict(<class 'list'>, {'MRR@10': 0.666, 'MRR@100': 0.67213})\n"
]
}
],
"source": [
"raw_model = FlagModel(\n",
" raw_name, \n",
" query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n",
" devices=[0],\n",
" use_fp16=False\n",
")\n",
"\n",
"results = search(raw_model, queries_text, corpus_text)\n",
"\n",
"eval_res = evaluate_metrics(qrels_dict, results, k_values)\n",
"mrr = evaluate_mrr(qrels_dict, results, k_values)\n",
"\n",
"for res in eval_res:\n",
" print(res)\n",
"print(mrr)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Then the result for the model after fine-tuning:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"pre tokenize: 100%|██████████| 3/3 [00:00<00:00, 164.72it/s]\n",
"You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
"Inference Embeddings: 100%|██████████| 3/3 [00:00<00:00, 9.45it/s]\n",
"pre tokenize: 100%|██████████| 28/28 [00:00<00:00, 160.19it/s]\n",
"Inference Embeddings: 100%|██████████| 28/28 [00:04<00:00, 6.06it/s]\n",
"Searching: 100%|██████████| 22/22 [00:07<00:00, 2.80it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"defaultdict(<class 'list'>, {'NDCG@10': 0.84392, 'NDCG@100': 0.85792})\n",
"defaultdict(<class 'list'>, {'MAP@10': 0.81562, 'MAP@100': 0.81875})\n",
"defaultdict(<class 'list'>, {'Recall@10': 0.93143, 'Recall@100': 0.99429})\n",
"defaultdict(<class 'list'>, {'P@10': 0.09314, 'P@100': 0.00994})\n",
"defaultdict(<class 'list'>, {'MRR@10': 0.81562, 'MRR@100': 0.81875})\n"
]
}
],
"source": [
"ft_model = FlagModel(\n",
" finetuned_path, \n",
" query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n",
" devices=[0],\n",
" use_fp16=False\n",
")\n",
"\n",
"results = search(ft_model, queries_text, corpus_text)\n",
"\n",
"eval_res = evaluate_metrics(qrels_dict, results, k_values)\n",
"mrr = evaluate_mrr(qrels_dict, results, k_values)\n",
"\n",
"for res in eval_res:\n",
" print(res)\n",
"print(mrr)"
]
},
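{
"cell_type": "markdown",
"metadata": {},
"source": [
"Putting the two runs side by side:\n",
"\n",
"| Metric | Original | Fine-tuned |\n",
"| --- | --- | --- |\n",
"| NDCG@10 | 0.70405 | 0.84392 |\n",
"| NDCG@100 | 0.73528 | 0.85792 |\n",
"| MAP@10 | 0.66600 | 0.81562 |\n",
"| Recall@10 | 0.82286 | 0.93143 |\n",
"| MRR@10 | 0.66600 | 0.81562 |"
]
},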
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can see an obvious improvement in all the metrics."
]
}
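,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a final sanity check, the same kind of metrics can be computed directly with `pytrec_eval`, the package installed in step 0. The following is a minimal sketch, assuming `qrels_dict` and `results` are the nested dicts built above; since `pytrec_eval` requires string ids, the sketch casts the keys defensively:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pytrec_eval\n",
"\n",
"# pytrec_eval expects string ids on both sides, so cast keys defensively\n",
"str_qrels = {str(q): {str(d): int(r) for d, r in docs.items()}\n",
"             for q, docs in qrels_dict.items()}\n",
"str_results = {str(q): {str(d): float(s) for d, s in docs.items()}\n",
"               for q, docs in results.items()}\n",
"\n",
"# 'ndcg_cut.10,100' uses trec_eval's measure syntax for multiple cutoffs\n",
"evaluator = pytrec_eval.RelevanceEvaluator(\n",
"    str_qrels, {\"ndcg_cut.10,100\", \"recall.10,100\", \"recip_rank\"}\n",
")\n",
"per_query = evaluator.evaluate(str_results)\n",
"\n",
"# average each measure over all queries\n",
"measures = sorted(next(iter(per_query.values())).keys())\n",
"for m in measures:\n",
"    print(m, round(np.mean([q[m] for q in per_query.values()]), 5))"
]
}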
],
"metadata": {
"kernelspec": {
"display_name": "ft",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}