From 2b24a7ef308eba651bca22f17ac3d912ed01b19c Mon Sep 17 00:00:00 2001
From: Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
Date: Sat, 26 Oct 2024 04:08:06 +0200
Subject: [PATCH] minor fixes: Llama 3.2 standalone (#420)

* minor fixes

* reformat rope base as float

---------

Co-authored-by: rasbt
---
 ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb | 10 +++++-----
 ch05/07_gpt_to_llama/standalone-llama32.ipynb | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
index 431c8d3..eac9d58 100644
--- a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
+++ b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
@@ -907,7 +907,7 @@
 " \"n_layers\": 32, # Number of layers\n",
 " \"hidden_dim\": 14_336, # NEW: Larger size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # NEW: Key-Value groups for grouped-query attention\n",
- " \"rope_base\": 500_000, # NEW: The base in RoPE's \"theta\" was increased to 500_000\n",
+ " \"rope_base\": 500_000.0, # NEW: The base in RoPE's \"theta\" was increased to 500_000\n",
 " \"rope_freq\": None, # NEW: Additional configuration for adjusting the RoPE frequencies\n",
 " \"dtype\": torch.bfloat16 # Lower-precision dtype to save memory\n",
 "}"
@@ -2060,7 +2060,7 @@
 " \"n_layers\": 32, # Number of layers\n",
 " \"hidden_dim\": 14_336, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
- " \"rope_base\": 500_000, # The base in RoPE's \"theta\"\n",
+ " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
 " \"rope_freq\": None, # Additional configuration for adjusting the RoPE frequencies\n",
 " \"dtype\": torch.bfloat16 # Lower-precision dtype to save memory\n",
 "}\n",
@@ -2073,7 +2073,7 @@
 " \"n_layers\": 32, # Number of layers\n",
 " \"hidden_dim\": 14_336, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
- " \"rope_base\": 500_000, # The base in RoPE's \"theta\"\n",
+ " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
 " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
 " \"rope_freq\": { # NEW: RoPE frequency scaling\n",
 " \"factor\": 8.0,\n",
@@ -2447,7 +2447,7 @@
 " \"n_layers\": 32, # Number of layers\n",
 " \"hidden_dim\": 14_336, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
- " \"rope_base\": 500_000, # The base in RoPE's \"theta\"\n",
+ " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
 " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
 " \"rope_freq\": { # NEW: RoPE frequency scaling\n",
 " \"factor\": 8.0,\n",
@@ -2466,7 +2466,7 @@
 " \"n_layers\": 16, # NEW: Half the number of layers\n",
 " \"hidden_dim\": 8192, # NEW: Almost half the size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
- " \"rope_base\": 500_000, # The base in RoPE's \"theta\"\n",
+ " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
 " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
 " \"rope_freq\": { # RoPE frequency scaling\n",
 " \"factor\": 32.0, # NEW: Adjustment of the rescaling factor\n",
diff --git a/ch05/07_gpt_to_llama/standalone-llama32.ipynb b/ch05/07_gpt_to_llama/standalone-llama32.ipynb
index 8dfb940..e4c94c4 100644
--- a/ch05/07_gpt_to_llama/standalone-llama32.ipynb
+++ b/ch05/07_gpt_to_llama/standalone-llama32.ipynb
@@ -437,7 +437,7 @@
 " \"n_layers\": 16, # Number of layers\n",
 " \"hidden_dim\": 8192, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
- " \"rope_base\": 500_000, # The base in RoPE's \"theta\"\n",
+ " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
 " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
 " \"rope_freq\": { # RoPE frequency scaling\n",
 " \"factor\": 32.0,\n",
@@ -451,13 +451,13 @@
 "\n",
 "# LLAMA32_CONFIG = {\n",
 "# \"vocab_size\": 128_256, # Vocabulary size\n",
- "# \"context_length\": 131_000, # Context length\n",
+ "# \"context_length\": 131_072, # Context length\n",
 "# \"emb_dim\": 3072, # Embedding dimension\n",
 "# \"n_heads\": 24, # Number of attention heads\n",
 "# \"n_layers\": 28, # Number of layers\n",
 "# \"hidden_dim\": 8192, # Size of the intermediate dimension in FeedForward\n",
 "# \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
- "# \"rope_base\": 500_000, # The base in RoPE's \"theta\"\n",
+ "# \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
 "# \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
 "# \"rope_freq\": { # RoPE frequency scaling\n",
 "# \"factor\": 32.0,\n",
@@ -697,7 +697,6 @@
 " def __init__(self, model_path):\n",
 " assert os.path.isfile(model_path), f\"Model file {model_path} not found\"\n",
 " mergeable_ranks = load_tiktoken_bpe(model_path)\n",
- " num_base_tokens = len(mergeable_ranks)\n",
 "\n",
 " self.special_tokens = {\n",
 " \"<|begin_of_text|>\": 128000,\n",
@@ -1013,7 +1012,8 @@
 "\n",
 "\n",
 "load_weights_into_llama(model, LLAMA32_CONFIG, combined_weights)\n",
- "model.to(device);"
+ "model.to(device)\n",
+ "del combined_weights # free up memory"
 ]
 },
 {
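Note on the rope_base change: in a standard RoPE setup the base only enters the inverse-frequency computation, where it is raised to a fractional power, so 500_000 and 500_000.0 yield the same frequencies and the patch simply makes the config value an explicit float. (The other fix, 131_000 -> 131_072, matches 128 * 1024 tokens.) The snippet below is a minimal sketch of that frequency computation, not the notebooks' own helper; the function name rope_inv_freq and head_dim=64 are assumptions for illustration.

import torch

def rope_inv_freq(head_dim, rope_base):
    # theta_i = rope_base ** (-2i / head_dim) for i = 0, 1, ..., head_dim // 2 - 1
    exponents = torch.arange(0, head_dim, 2).float() / head_dim
    return 1.0 / (rope_base ** exponents)

# Integer and float base give identical frequencies (head_dim=64 is an arbitrary example)
print(torch.allclose(rope_inv_freq(64, 500_000), rope_inv_freq(64, 500_000.0)))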