diff --git a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb index 0feaced..ae4a9ef 100644 --- a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb +++ b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb @@ -453,7 +453,7 @@ " x2 = x[..., head_dim // 2 :] # Second half\n", "\n", " # Adjust sin and cos shapes\n", - " cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim)\n", + " cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim // 2)\n", " sin = sin[:seq_len, :].unsqueeze(0).unsqueeze(0)\n", "\n", " # Apply the rotary transformation\n", diff --git a/ch05/07_gpt_to_llama/standalone-llama32.ipynb b/ch05/07_gpt_to_llama/standalone-llama32.ipynb index afb27c2..6f7ca60 100644 --- a/ch05/07_gpt_to_llama/standalone-llama32.ipynb +++ b/ch05/07_gpt_to_llama/standalone-llama32.ipynb @@ -202,7 +202,7 @@ " x2 = x[..., head_dim // 2 :] # Second half\n", "\n", " # Adjust sin and cos shapes\n", - " cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim)\n", + " cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim // 2)\n", " sin = sin[:seq_len, :].unsqueeze(0).unsqueeze(0)\n", "\n", " # Apply the rotary transformation\n", diff --git a/ch05/11_qwen3/standalone-qwen3.ipynb b/ch05/11_qwen3/standalone-qwen3.ipynb index 9dcff2f..a6126d5 100644 --- a/ch05/11_qwen3/standalone-qwen3.ipynb +++ b/ch05/11_qwen3/standalone-qwen3.ipynb @@ -226,7 +226,7 @@ " x2 = x[..., head_dim // 2 :] # Second half\n", "\n", " # Adjust sin and cos shapes\n", - " cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim)\n", + " cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim // 2)\n", " sin = sin[:seq_len, :].unsqueeze(0).unsqueeze(0)\n", "\n", " # Apply the rotary transformation\n", @@ -1201,7 +1201,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.10.16" } }, "nbformat": 4, diff --git a/pkg/llms_from_scratch/kv_cache/llama3.py b/pkg/llms_from_scratch/kv_cache/llama3.py index 70258d0..74cabdd 100644 --- a/pkg/llms_from_scratch/kv_cache/llama3.py +++ b/pkg/llms_from_scratch/kv_cache/llama3.py @@ -292,7 +292,7 @@ def apply_rope(x, cos, sin, offset=0): x2 = x[..., head_dim // 2:] # Second half # Adjust sin and cos shapes - cos = cos[offset:offset + seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim) + cos = cos[offset:offset + seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim // 2) sin = sin[offset:offset + seq_len, :].unsqueeze(0).unsqueeze(0) # Apply the rotary transformation diff --git a/pkg/llms_from_scratch/kv_cache/qwen3.py b/pkg/llms_from_scratch/kv_cache/qwen3.py index cb60112..4d842d9 100644 --- a/pkg/llms_from_scratch/kv_cache/qwen3.py +++ b/pkg/llms_from_scratch/kv_cache/qwen3.py @@ -236,7 +236,7 @@ def apply_rope(x, cos, sin, offset=0): x2 = x[..., head_dim // 2:] # Second half # Adjust sin and cos shapes - cos = cos[offset:offset + seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim) + cos = cos[offset:offset + seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim // 2) sin = sin[offset:offset + seq_len, :].unsqueeze(0).unsqueeze(0) # Apply the rotary transformation diff --git a/pkg/llms_from_scratch/llama3.py b/pkg/llms_from_scratch/llama3.py index ddd4cde..585c174 100644 --- a/pkg/llms_from_scratch/llama3.py +++ b/pkg/llms_from_scratch/llama3.py @@ -260,7 +260,7 @@ def apply_rope(x, cos, sin): x2 = x[..., head_dim // 2:] # Second half # Adjust sin and cos shapes - cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim) + cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim // 2) sin = sin[:seq_len, :].unsqueeze(0).unsqueeze(0) # Apply the rotary transformation diff --git a/pkg/llms_from_scratch/qwen3.py b/pkg/llms_from_scratch/qwen3.py index 33cf047..71a1e3b 100644 --- a/pkg/llms_from_scratch/qwen3.py +++ b/pkg/llms_from_scratch/qwen3.py @@ -288,7 +288,7 @@ def apply_rope(x, cos, sin): x2 = x[..., head_dim // 2:] # Second half # Adjust sin and cos shapes - cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim) + cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim // 2) sin = sin[:seq_len, :].unsqueeze(0).unsqueeze(0) # Apply the rotary transformation