{
"cells": [
{
"cell_type": "markdown",
"id": "a9bc1c1a-53bc-4b86-9140-4f1af0128037",
"metadata": {},
"source": [
"<table style=\"width:100%\">\n",
"<tr>\n",
"<td style=\"vertical-align:middle; text-align:left;\">\n",
"<font size=\"2\">\n",
"Supplementary code for the <a href=\"http://mng.bz/orYv\">Build a Large Language Model From Scratch</a> book by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
"<br>Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
"</font>\n",
"</td>\n",
"<td style=\"vertical-align:middle; text-align:left;\">\n",
"<a href=\"http://mng.bz/orYv\"><img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp\" width=\"100px\"></a>\n",
"</td>\n",
"</tr>\n",
"</table>"
]
},
{
"cell_type": "markdown",
"id": "5250207d-f811-46df-9d16-4ac1e9ce1c66",
"metadata": {},
"source": [
"# Score Correlation Analysis"
]
},
{
"cell_type": "markdown",
"id": "badc7ffb-d51c-4de0-97c5-b54cf3e28315",
"metadata": {},
"source": [
"- This notebook analyses the correlation between the different evaluation method scores"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "fa39424b-e058-4351-94ec-249b812ae8fd",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"with open(\"gpt4-model-1-response.json\", \"r\") as file:\n",
" gpt4_model_1 = json.load(file)\n",
"\n",
"with open(\"llama3-8b-model-1-response.json\", \"r\") as file:\n",
2024-07-02 07:55:32 -05:00
" llama3_8b_model_1 = json.load(file)"
2024-06-06 09:15:08 -05:00
]
},
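{
"cell_type": "markdown",
"id": "f3a1c2d4-5b6e-4f7a-8c9d-0e1f2a3b4c5d",
"metadata": {},
"source": [
"- As a quick sanity check (a minimal sketch, assuming both JSON files contain equal-length lists of numeric scores in the same response order), we can verify that the two score lists line up before computing any statistics"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-6e7f-4a8b-9c0d-1e2f3a4b5c6d",
"metadata": {},
"outputs": [],
"source": [
"# The correlation analysis below assumes both files contain one\n",
"# numeric score per response, in the same order\n",
"assert len(gpt4_model_1) == len(llama3_8b_model_1)\n",
"\n",
"print(\"Number of scored responses:\", len(gpt4_model_1))\n",
"print(\"First five GPT-4 scores:   \", gpt4_model_1[:5])\n",
"print(\"First five Llama 3 scores: \", llama3_8b_model_1[:5])"
]
},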
{
"cell_type": "markdown",
"id": "4ef67d30-7602-4695-a190-16209a152621",
"metadata": {},
"source": [
"## GPT-4 vs Llama 3 8B"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2a0d4288-507f-414c-afde-9742935cd8bc",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAGwCAYAAABPSaTdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAA98ElEQVR4nO3de3gU9cH+/3tz5pBsGpAcJJAVUIigcjYG9VGhUDGtytWWFhSiP1ojqIhHVIxpC8Gnv8daq2BBiVZAqq1YYy3qA1QFA4SjYBTxMRxUklRjsuGQBLLz/SPNliUJbGD2NPt+XVeuNrOfTD6MMLmzM597bIZhGAIAALCoiEBPAAAAwJcIOwAAwNIIOwAAwNIIOwAAwNIIOwAAwNIIOwAAwNIIOwAAwNKiAj2BYOByufT1118rPj5eNpst0NMBAABeMAxDdXV1SktLU0RE++/fEHYkff3110pPTw/0NAAAwBk4cOCAevbs2e7rhB1J8fHxkpoPVkJCQoBnAwAAvOF0OpWenu7+Od4ewo7kvnSVkJBA2AEAIMSc7hYUblAGAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRtgBAACWRoMyAADwiaONTZr3Vpn2fntEGd0666FrM9UpJtLv8wjoOzvvv/++cnJylJaWJpvNptdff93jdcMw9Oijjyo1NVWdOnXS6NGjtWfPHo8x1dXVmjRpkhISEpSYmKhbb71Vhw4d8uOfAgAAnGzan0o14NFVemnDfn2w5xu9tGG/Bjy6StP+VOr3uQQ07Bw+fFgXX3yxnnnmmTZf/+///m899dRTevbZZ7Vx40Z16dJFY8eOVX19vXvMpEmT9PHHH+vdd9/Vm2++qffff1+/+MUv/PVHAAAAJ5n2p1K9W1bV5mvvllX5PfDYDMMw/Pod22Gz2bRy5Updf/31kprf1UlLS9M999yje++9V5JUW1ur5ORkvfDCC5o4caI++eQTZWZmqrS0VMOGDZMkrVq1Stdee62+/PJLpaWlefW9nU6n7Ha7amtreRAoAABn4WhjkwY8uuq04z751bizvqTl7c/voL1Buby8XBUVFRo9erR7m91u18iRI1VSUiJJKikpUWJiojvoSNLo0aMVERGhjRs3trvvhoYGOZ1Ojw8AAHD25r1VZuo4MwRt2KmoqJAkJScne2xPTk52v1ZRUaEePXp4vB4VFaWkpCT3mLYUFhbKbre7P9LT002ePQAA4Wnvt0dMHWeGoA07vjR79mzV1ta6Pw4cOBDoKQEAYAkZ3TqbOs4MQbv0PCUlRZJUWVmp1NRU9/bKykpdcskl7jFVVZ43QB0/flzV1dXur29LbGysYmNjzZ804IUml6FN5dWqqqtXj/g4jXAkKTLCFuhpAUCHNB536aWSvdpXfUS9kzrrpqwMxURF6KFrM/XShv2n/fqHrs30wyybBW3YcTgcSklJ0erVq93hxul0auPGjcrLy5MkZWVlqaamRlu2bNHQoUMlSWvWrJHL5dLIkSMDNXWgXat2HVRBcZkO1v5nRWGqPU75OZkaNzD1FF8JAMGj8K0yLf6gXK4TljjNfesTTbvcodnXZmpMZo92V2NJ0pjMHn7t2wlo2Dl06JA+//xz9+fl5eXavn27kpKS1KtXL82cOVO/+c1v1K9fPzkcDs2ZM0dpaWnuFVsDBgzQuHHjNG3aND377LM6duyYZsyYoYkTJ3q9Egvwl1W7Dipv6VadvPyxorZeeUu3auHkIQQeAEGv8K0y/fH98lbbXYbc208VdLx53WwBvWdn8+bNGjx4sAYPHixJmjVrlgYPHqxHH31UknT//ffrjjvu0C9+8QsNHz5chw4d0qpVqxQXF+fex7Jly9S/f39dc801uvbaazVq1CgtWrQoIH8eoD1NLkMFxWWtgo4k97aC4jI1uYKiCQIA2tR43KXFH7QOOidqKwi1ZcNn35oxJa8ETc9OINGzA18r+b9v9bPFG0477uVplyqrTzc/zAgAOu75D77Qr//+iWn72zt//Fl9fcj37ABWUlVXf/pBHRgHAIGwr9p/y8XNRNgB/KBHfNzpB3VgHAAEQu8k/y0XNxNhB/CDEY4kpdrj1N4Cc5uaV2WNcCT5c1oA0CE3ZWXIrKaMnw9vvyLGbIQdwA8iI2zKz2nulDj5PNHyeX5OJn07AIJaTFSEpl3uOOWYzNR4r/YVFeW/vjvCDuAn4wamauHkIUqxe16qSrHHsewcQMiYfW2mfnmFo9U7PBE26ZdXODRhSE+v9uPPS2KsxhKrseBfNCgDsIL2GpT/5WzQ8Hn/e9qvL31otM5JOLt3d7z9+U3YEWEHAACzTFiwXlv215x23NBeifrr7dln9b1Yeg4AAPzu61rvKjS8HWcGwg4AADCNPdq7C0bejjMDYQcAAJjm8+oGU8eZgbADAABMc9xl7jgzEHYAAIBpor1MFt6OMwNhBwAAmObNGVeYOs4MhB0AAGCa6qONpo4zA2EHAACYpqrOuyXl3o4zA2EHAACYpkd83OkHdWCcGaL89p0AwM94NAfgfyMcSUq1x6mitl5tNenY1PxMwBGOJL/NibADwJJW7TqoguIyHTyhpTXVHqf8nEweugr4UGSETfk5mbpt6dY2Xzck5edk+vUXDy5jAbCcVbsOKm/pVo+gI0kVtfXKW7pVq3YdDNDMgPCw4J+fn9XrZiPsALCUJpehguKyNt8+b9lWUFymJlfYPwMZ8IlD9cf10ZfOU4756EunDtUf99OMCDsALGZTeXWrd3ROZEg6WFuvTeXV/psUEEbu/vM2U8eZgbADwFKCcdkrEE72f3fU1HFmIOwAsJRgXPYKhJNe3+tk6jgzEHYAWErLstf21nnY1Lwqy5/LXoFwYBiGPvy/b9Tk5QM+f/fTwb6d0AlYeg7AUlqWveYt3Sqb5HGjcksA8veyV8DK6o816Y3tX2vJ+nJ9WlHn1ddc1DNBXeP8F0EIOwAsZ9zAVC2cPKRVz04KPTuAaSqd9XqpZJ+Wb9qv6sPNz7nqFB2pG4ecq9e3faXDjU3tfu3eb474a5qSCDsALGrcwFSNyUyhQRkw2fYDNSpaX66/f3RQx/9d4XBuYifdnNVbE4f3UuNxl5Zt3H/KfTjrj+tfzgadkxDrjykTdgBYV2SETVl9ugV6GkDIO9bk0qpdFSpaX66t+2vc24dnfE+52Q59PzNZUZHNtwGPmr/aq33esGCd1j14jS+m2wphBwAAtOm7w416uXS/XirZ574kHB1pU85FacrNdmhQT3urr6k+fMyrfXs7zgyEHQAA4OGzyjoVrS/Xym1fqf5Y8/Kq7l1jNGlkb026tNcpqxuSukTrSE379+ucOM5fCDsAAEAul6G1u6tUtH6v1n3+jXv7hWkJys12KOfiVMVGRZ52PytvH6Xh8/7Xq3H+QtgBACCMHWo4rr9sPqAXS/ap/JvDkqQIm/T9zBTlZmdohCNJNpv3N/afkxCrhLgoOU/x7KuEuCi/3ZwsEXYAAAhL+789ohdL9uqV0gOqa2gOJ
vFxUZo4PF03Z2UoPanzGe/7o8fG6qLH3m4z8CTERemjx8ae8b7PBGHHR5pcBkteAQBBxTAMbfiiWkvWl+t/P6mU8e/WzfO6d9HU7AxNGNJTXWLNiQYfPTZW/3I26IYF61R9+JiSukRr5e2j/PqOTgvCjg+s2nWwVZlZKmVmAIAAaa/l+Irzz1Fudoau7HeOInzwC/k5CbF+W15+KoQdk63adVB5S7d6VNRLUkVtvfKWbtXCyUMIPAAAv6h01mvphn1avnG/vj2p5Tg3O0N9e8QHeIb+QdgxUZPLUEFxWaugIzU/n8cmqaC4TGMyU7ikBQDwmR3/bjl+84SW4zR7nG6+LEMTh6crsXNMgGfoX4QdE20qr/a4dHUyQ9LB2nptKq+m1RUAYKpjTS69/XGFlqw7fctxuCHsmKiqrv2gcybjAAA4nTNpOQ43hB0TnapR8kzGAQDQnuaW471aue1Lj5bjn4/srcmnaTkON4QdE41wJCnVHqeK2vo279uxSUqxNy9DhzlY4g8gnLhchv75WZW
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"list1, list2 = gpt4_model_1, llama3_8b_model_1\n",
"\n",
"plt.scatter(list1, list2)\n",
"plt.plot(\n",
" np.unique(list1),\n",
" np.poly1d(np.polyfit(list1, list2, 1))(np.unique(list1))\n",
")\n",
"plt.xlabel(\"GPT-4\")\n",
"plt.ylabel(\"Llama3 8B\")\n",
"plt.show()"
]
},
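{
"cell_type": "markdown",
"id": "d4e5f6a7-8b9c-4d0e-a1f2-3b4c5d6e7f8a",
"metadata": {},
"source": [
"- Each point in the scatter plot above is one response scored by both judges; the straight line is a degree-1 least-squares fit (`np.polyfit`) that visualizes the overall linear trend between the two scoring methods"
]
},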
{
"cell_type": "markdown",
"id": "3e28b2d2-7f31-4c5f-853b-1e71dc715a25",
"metadata": {},
"source": [
"### Correlation Coefficients"
]
},
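{
"cell_type": "markdown",
"id": "b5c6d7e8-9f0a-4b1c-8d2e-3f4a5b6c7d8e",
"metadata": {},
"source": [
"- Pearson measures the strength of the linear relationship between the two score lists\n",
"- Spearman is the Pearson correlation computed on the ranks of the scores, so it captures any monotonic relationship\n",
"- Kendall's Tau compares the number of concordant and discordant score pairs and is typically more robust to outliers"
]
},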
{
"cell_type": "code",
"execution_count": 13,
"id": "44ef7e9a-1f07-4e94-bdc5-d5271616ef6d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Pearson</th>\n",
" <th>Spearman</th>\n",
" <th>Kendall Tau</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Results</th>\n",
" <td>0.80489</td>\n",
" <td>0.698406</td>\n",
" <td>0.57292</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Pearson Spearman Kendall Tau\n",
"Results 0.80489 0.698406 0.57292"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"from scipy.stats import spearmanr, kendalltau\n",
"\n",
"pearson_correlation = np.corrcoef(list1, list2)[0, 1]\n",
"spearman_correlation, _ = spearmanr(list1, list2)\n",
"kendall_tau_correlation, _ = kendalltau(list1, list2)\n",
"\n",
"correlation_table = pd.DataFrame({\n",
" \"Pearson\": [pearson_correlation],\n",
" \"Spearman\": [spearman_correlation],\n",
" \"Kendall Tau\": [kendall_tau_correlation]\n",
"}, index=['Results'])\n",
"\n",
"correlation_table"
]
},
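{
"cell_type": "markdown",
"id": "c6d7e8f9-0a1b-4c2d-9e3f-4a5b6c7d8e9f",
"metadata": {},
"source": [
"- For completeness, `spearmanr` and `kendalltau` also return p-values for the null hypothesis of zero correlation, which the cell above discards via `_`; a minimal sketch of how to inspect them is shown below"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e8f9a0b1-2c3d-4e4f-8a5b-6c7d8e9f0a1b",
"metadata": {},
"outputs": [],
"source": [
"# SciPy returns (statistic, p-value) pairs; the p-values test the\n",
"# null hypothesis that the two score lists are uncorrelated\n",
"_, spearman_p = spearmanr(list1, list2)\n",
"_, kendall_p = kendalltau(list1, list2)\n",
"\n",
"print(f\"Spearman p-value:    {spearman_p:.4g}\")\n",
"print(f\"Kendall Tau p-value: {kendall_p:.4g}\")"
]
},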
{
"cell_type": "markdown",
"id": "3a1bd708-ba5d-4290-abe3-ee736059c2cd",
"metadata": {},
"source": [
"- For comparison, below are the correlation coefficients from the Prometheus 2 paper by Kim et al. 2024 ([https://arxiv.org/abs/2405.01535](https://arxiv.org/abs/2405.01535)), which are all in the same ballpark as the ones reported for Llama 3 above\n",
"- Note that Prometheus 2 is a model specifically finetuned for LLM rating and evaluation "
]
},
{
"cell_type": "markdown",
"id": "fbc033f4-8a11-42be-a683-6cef7eb23468",
"metadata": {},
"source": [
"#### Pearson\n",
"\n",
"| Evaluator LM | VICUNA Bench | VICUNA Bench | MT Bench | MT Bench | FLASK | FLASK | FLASK | Feedback Bench |\n",
"|-----------------------|--------------|---------------|------------|---------------|------------|---------------|-----------|----------------|\n",
"| | GPT-4-1106 | Claude-3-Opus | GPT-4-1106 | Claude-3-Opus | GPT-4-1106 | Claude-3-Opus | Humans | GPT-4-0613 |\n",
"| LLAMA2-CHAT 7B | 0.205 | 0.243 | 0.036 | 0.055 | 0.317 | 0.256 | 0.299 | 0.523 |\n",
"| LLAMA2-CHAT 13B | 0.185 | 0.141 | -0.042 | -0.002 | 0.239 | 0.247 | 0.263 | 0.545 |\n",
"| LLAMA2-CHAT 70B | 0.350 | 0.463 | 0.178 | 0.228 | 0.388 | 0.402 | 0.317 | 0.592 |\n",
"| MISTRAL-INSTRUCT-7B | 0.486 | 0.561 | 0.284 | 0.396 | 0.448 | 0.437 | 0.377 | 0.586 |\n",
"| MIXTRAL-INSTRUCT-8X7B | 0.566 | 0.579 | 0.551 | 0.539 | 0.483 | 0.495 | 0.420 | 0.673 |\n",
"| **PROMETHEUS-7B** | **0.484** | **0.528** | **0.378** | **0.382** | **0.352** | **0.331** | **0.348** | **0.847** |\n",
"| **PROMETHEUS-13B** | **0.492** | **0.534** | **0.404** | **0.477** | **0.462** | **0.470** | **0.449** | **0.860** |\n",
"| AUTO-J (13B) | 0.351 | 0.262 | 0.432 | 0.375 | 0.430 | 0.370 | 0.473 | 0.637 |\n",
"| **PROMETHEUS-2-7B** | **0.642** | **0.610** | **0.543** | **0.554** | **0.645** | **0.578** | **0.544** | **0.878** |\n",
"| **PROMETHEUS-2-8X7B** | **0.685** | **0.635** | **0.665** | **0.614** | **0.659** | **0.626** | **0.555** | **0.898** |\n",
"| GPT-3.5-TURBO-0613 | 0.335 | 0.349 | 0.183 | 0.194 | 0.437 | 0.396 | 0.450 | 0.594 |\n",
"| GPT-4-1106 | / | 0.694 | / | 0.717 | / | 0.736 | 0.679 | 0.753 |\n",
"| CLAUDE-3-OPUS | 0.694 | / | 0.717 | / | 0.736 | / | 0.573 | 0.788 |\n",
"\n",
"#### Spearman\n",
"\n",
"| Evaluator LM | VICUNA Bench | VICUNA Bench | MT Bench | MT Bench | MT Bench | FLASK | FLASK | Feedback Bench |\n",
"|-----------------------|--------------|---------------|------------|---------------|------------|---------------|-----------|----------------|\n",
"| | GPT-4-1106 | Claude-3-Opus | GPT-4-1106 | Claude-3-Opus | GPT-4-1106 | Claude-3-Opus | Humans | GPT-4-0613 |\n",
"| LLAMA2-CHAT 7B | 0.236 | 0.255 | 0.084 | 0.089 | 0.301 | 0.244 | 0.279 | 0.511 |\n",
"| LLAMA2-CHAT 13B | 0.178 | 0.179 | -0.025 | 0.044 | 0.206 | 0.222 | 0.224 | 0.543 |\n",
"| LLAMA2-CHAT 70B | 0.348 | 0.466 | 0.197 | 0.252 | 0.391 | 0.389 | 0.298 | 0.585 |\n",
"| MISTRAL-INSTRUCT-7B | 0.389 | 0.480 | 0.266 | 0.358 | 0.499 | 0.478 | 0.374 | 0.563 |\n",
"| MIXTRAL-INSTRUCT-8X7B | 0.476 | 0.556 | 0.545 | 0.517 | 0.505 | 0.500 | 0.386 | 0.659 |\n",
"| **PROMETHEUS-7B** | **0.508** | **0.528** | **0.385** | **0.349** | **0.367** | **0.326** | **0.317** | **0.876** |\n",
"| **PROMETHEUS-13B** | **0.492** | **0.534** | **0.401** | **0.470** | **0.474** | **0.454** | **0.398** | **0.893** |\n",
"| AUTO-J (13B) | 0.337 | 0.297 | 0.408 | 0.365 | 0.402 | 0.358 | 0.408 | 0.623 |\n",
"| **PROMETHEUS-2-7B** | **0.643** | **0.584** | **0.550** | **0.524** | **0.626** | **0.569** | **0.490** | **0.909** |\n",
"| **PROMETHEUS-2-8X7B** | **0.660** | **0.615** | **0.669** | **0.605** | **0.642** | **0.618** | **0.496** | **0.912** |\n",
"| GPT-3.5-TURBO-0613 | 0.319 | 0.354 | 0.192 | 0.198 | 0.446 | 0.390 | 0.374 | 0.565 |\n",
"| GPT-4-1106 | / | 0.659 | / | 0.721 | / | 0.729 | 0.650 | 0.753 |\n",
"| CLAUDE-3-OPUS | 0.659 | / | 0.721 | / | 0.729 | / | 0.567 | 0.784 |\n",
"\n",
"#### Kendall-Tau\n",
"\n",
"| Evaluator LM | VICUNA Bench | VICUNA Bench | MT Bench | MT Bench | FLASK | FLASK | FLASK | Feedback Bench |\n",
"|-----------------------|--------------|---------------|------------|---------------|------------|---------------|-----------|----------------|\n",
"| | GPT-4-1106 | Claude-3-Opus | GPT-4-1106 | Claude-3-Opus | GPT-4-1106 | Claude-3-Opus | Humans | GPT-4-0613 |\n",
"| LLAMA2-CHAT 7B | 0.183 | 0.203 | 0.065 | 0.070 | 0.229 | 0.186 | 0.211 | 0.419 |\n",
"| LLAMA2-CHAT 13B | 0.145 | 0.146 | -0.019 | 0.037 | 0.160 | 0.174 | 0.174 | 0.453 |\n",
"| LLAMA2-CHAT 70B | 0.282 | 0.382 | 0.150 | 0.196 | 0.310 | 0.310 | 0.221 | 0.487 |\n",
"| MISTRAL-INSTRUCT-7B | 0.314 | 0.391 | 0.208 | 0.281 | 0.395 | 0.384 | 0.287 | 0.454 |\n",
"| MIXTRAL-INSTRUCT-8X7B | 0.395 | 0.468 | 0.433 | 0.419 | 0.410 | 0.408 | 0.304 | 0.551 |\n",
"| **PROMETHEUS-7B** | **0.405** | **0.425** | **0.290** | **0.263** | **0.282** | **0.251** | **0.236** | **0.770** |\n",
"| **PROMETHEUS-13B** | **0.397** | **0.434** | **0.299** | **0.352** | **0.365** | **0.352** | **0.299** | **0.793** |\n",
"| AUTO-J (13B) | 0.282 | 0.242 | 0.303 | 0.272 | 0.312 | 0.282 | 0.312 | 0.515 |\n",
"| **PROMETHEUS-2-7B** | **0.515** | **0.478** | **0.458** | **0.421** | **0.500** | **0.454** | **0.376** | **0.773** |\n",
"| **PROMETHEUS-2-8X7B** | **0.559** | **0.515** | **0.535** | **0.483** | **0.526** | **0.507** | **0.388** | **0.800** |\n",
"| GPT-3.5-TURBO-0613 | 0.255 | 0.287 | 0.148 | 0.157 | 0.360 | 0.315 | 0.298 | 0.489 |\n",
"| GPT-4-1106 | / | 0.553 | / | 0.590 | / | 0.609 | 0.517 | 0.662 |\n",
"| CLAUDE-3-OPUS | 0.553 | / | 0.590 | / | 0.609 | / | 0.453 | 0.693 |"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}