Mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2025-08-28 18:40:01 +00:00)
Fix qk_norm comment (#769)

commit e9c1c1da38
parent b14325e56d
@@ -436,7 +436,7 @@
 "    \"n_layers\": 28,                 # Number of layers\n",
 "    \"hidden_dim\": 3072,             # Size of the intermediate dimension in FeedForward\n",
 "    \"head_dim\": 128,                # Size of the heads in GQA\n",
-"    \"qk_norm\": True,                # Whether to normalize queries and values in GQA\n",
+"    \"qk_norm\": True,                # Whether to normalize queries and keys in GQA\n",
 "    \"n_kv_groups\": 8,               # Key-Value groups for grouped-query attention\n",
 "    \"rope_base\": 1_000_000.0,       # The base in RoPE's \"theta\"\n",
 "    \"dtype\": torch.bfloat16,        # Lower-precision dtype to reduce memory usage\n",
@@ -22,7 +22,7 @@ QWEN_CONFIG_06_B = {
     "n_layers": 28,                 # Number of layers
     "hidden_dim": 3072,             # Size of the intermediate dimension in FeedForward
     "head_dim": 128,                # Size of the heads in GQA
-    "qk_norm": True,                # Whether to normalize queries and values in GQA
+    "qk_norm": True,                # Whether to normalize queries and keys in GQA
     "n_kv_groups": 8,               # Key-Value groups for grouped-query attention
     "rope_base": 1_000_000.0,       # The base in RoPE's "theta"
     "dtype": torch.bfloat16,        # Lower-precision dtype to reduce memory usage
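For reference, here is a minimal sketch of the behavior the corrected comment describes: with qk_norm enabled, a Qwen3-style GQA block applies RMSNorm to the query and key heads before attention, while the value heads are left unnormalized (which is why the old wording "queries and values" was misleading). The RMSNorm class, head counts, and tensor shapes below are illustrative assumptions, not the repository's exact code.

import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    # Simplified RMSNorm; assumed here for illustration, not copied from the repo.
    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return x * rms * self.weight

head_dim = 128              # matches "head_dim" in QWEN_CONFIG_06_B
q_norm = RMSNorm(head_dim)  # applied to queries
k_norm = RMSNorm(head_dim)  # applied to keys

# Shapes: (batch, n_heads, seq_len, head_dim). Under GQA there are fewer
# key/value heads than query heads (controlled by "n_kv_groups"); 16 query
# heads are assumed here for illustration.
queries = torch.randn(1, 16, 6, head_dim)
keys = torch.randn(1, 8, 6, head_dim)
values = torch.randn(1, 8, 6, head_dim)

queries = q_norm(queries)  # normalized
keys = k_norm(keys)        # normalized
# values stay untouched -- exactly the distinction the comment fix clarifies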