From 06151a809effa6662bbed6f60b015eb1d1a707f5 Mon Sep 17 00:00:00 2001
From: rasbt
Date: Tue, 6 Aug 2024 19:48:30 -0500
Subject: [PATCH] note about logistic sigmoid

---
 ch06/02_bonus_additional-experiments/previous_chapters.py | 2 +-
 ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ch06/02_bonus_additional-experiments/previous_chapters.py b/ch06/02_bonus_additional-experiments/previous_chapters.py
index 46549e9..a3c89da 100644
--- a/ch06/02_bonus_additional-experiments/previous_chapters.py
+++ b/ch06/02_bonus_additional-experiments/previous_chapters.py
@@ -11,7 +11,7 @@ import numpy as np
 import tiktoken
 import torch
 import torch.nn as nn
-from torch.utils.data import Dataset, DataLoader
+from torch.utils.data import Dataset, DataLoader

 #####################################
 # Chapter 2
diff --git a/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb b/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb
index edf87da..d161e17 100644
--- a/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb
+++ b/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb
@@ -158,7 +158,7 @@
     "  - The $\\pi_{\\theta}$ variable is the so-called policy (a term borrowed from reinforcement learning) and represents the LLM we want to optimize; $\\pi_{ref}$ is a reference LLM, which is typically the original LLM before optimization (at the beginning of the training, $\\pi_{\\theta}$ and $\\pi_{ref}$ are typically the same)\n",
     "  - $\\beta$ is a hyperparameter to control the divergence between the $\\pi_{\\theta}$ and the reference model; increasing $\\beta$ increases the impact of the difference between\n",
     "$\\pi_{\\theta}$ and $\\pi_{ref}$ in terms of their log probabilities on the overall loss function, thereby increasing the divergence between the two models\n",
-    "  - the logistic sigmoid function, $\\log \\sigma(\\centerdot)$ transforms the log-odds of the preferred and rejected responses (the terms inside the logistic sigmoid function) into a log-probability score \n",
+    "  - the logistic sigmoid function, $\\sigma(\\centerdot)$ transforms the log-odds of the preferred and rejected responses (the terms inside the logistic sigmoid function) into a probability score \n",
     "- To avoid bloating the code notebook with a more detailed discussion, I may write a separate standalone article with more details on these concepts in the future\n",
     "- In the meantime, if you are interested in comparing RLHF and DPO, please see the section [2.2. RLHF vs Direct Preference Optimization (DPO)](https://magazine.sebastianraschka.com/i/142924793/rlhf-vs-direct-preference-optimization-dpo) in my article [Tips for LLM Pretraining and Evaluating Reward Models](https://magazine.sebastianraschka.com/p/tips-for-llm-pretraining-and-evaluating-rms)"
    ]
@@ -1815,7 +1815,7 @@
     "  - The $\\pi_{\\theta}$ variable is the so-called policy (a term borrowed from reinforcement learning) and represents the LLM we want to optimize; $\\pi_{ref}$ is a reference LLM, which is typically the original LLM before optimization (at the beginning of the training, $\\pi_{\\theta}$ and $\\pi_{ref}$ are typically the same)\n",
     "  - $\\beta$ is a hyperparameter to control the divergence between the $\\pi_{\\theta}$ and the reference model; increasing $\\beta$ increases the impact of the difference between\n",
     "$\\pi_{\\theta}$ and $\\pi_{ref}$ in terms of their log probabilities on the overall loss function, thereby increasing the divergence between the two models\n",
-    "  - the logistic sigmoid function, $\\log \\sigma(\\centerdot)$ transforms the log-odds of the preferred and rejected responses (the terms inside the logistic sigmoid function) into a log-probability score \n",
+    "  - the logistic sigmoid function, $\\sigma(\\centerdot)$ transforms the log-odds of the preferred and rejected responses (the terms inside the logistic sigmoid function) into a probability score \n",
     "- In code, we can implement the DPO loss as follows:"
    ]
   },
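
Note (not part of the patch itself): below is a minimal PyTorch sketch of the DPO loss that the notebook text above describes, showing how the logistic sigmoid sigma(.) maps the beta-scaled difference of policy-vs-reference log-ratios to a probability before the negative log is taken. The function name dpo_loss_sketch, its argument names, and the default beta=0.1 are illustrative assumptions and not the notebook's actual implementation.

# Minimal sketch (assumed names and shapes, not the notebook's exact code)
# of the DPO loss, illustrating the role of the logistic sigmoid sigma(.).
import torch
import torch.nn.functional as F

def dpo_loss_sketch(
    policy_chosen_logprob,    # log pi_theta(y_w | x), shape (batch,)
    policy_rejected_logprob,  # log pi_theta(y_l | x), shape (batch,)
    ref_chosen_logprob,       # log pi_ref(y_w | x), shape (batch,)
    ref_rejected_logprob,     # log pi_ref(y_l | x), shape (batch,)
    beta=0.1,                 # illustrative default; controls divergence from pi_ref
):
    # Log-ratios of the policy model vs. the reference model
    chosen_logratio = policy_chosen_logprob - ref_chosen_logprob
    rejected_logratio = policy_rejected_logprob - ref_rejected_logprob

    # Term inside sigma(.): beta-scaled difference of the two log-ratios (the log-odds)
    logits = beta * (chosen_logratio - rejected_logratio)

    # sigma(logits) is the probability that the preferred response is ranked above
    # the rejected one; the DPO loss is its negative log, averaged over the batch
    return -F.logsigmoid(logits).mean()

# Usage with random stand-in log-probabilities
torch.manual_seed(123)
loss = dpo_loss_sketch(torch.randn(4), torch.randn(4), torch.randn(4), torch.randn(4))
print(loss)

Increasing beta amplifies the term inside sigma(.), which is consistent with the notebook's note that a larger beta increases the impact of the log-probability difference between pi_theta and pi_ref on the loss.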