mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2025-12-25 14:12:12 +00:00
Qwen3 tokenizer sanity checks (#730)
This commit is contained in:
parent
21c41721cc
commit
b8c8237251
@ -279,3 +279,11 @@ def test_tokenizer_equivalence():
|
||||
|
||||
assert tokenizer_ref.eos_token_id == tokenizer.eos_token_id
|
||||
assert tokenizer_ref.pad_token_id == tokenizer.pad_token_id
|
||||
|
||||
assert tokenizer.encode("<|endoftext|>") == [tokenizer._special_to_id["<|endoftext|>"]]
|
||||
assert tokenizer.encode("<|im_end|>") == [tokenizer._special_to_id["<|im_end|>"]]
|
||||
|
||||
expected_eos_token = "<|im_end|>" if "Base" not in repo_id else "<|endoftext|>"
|
||||
expected_pad_token = "<|endoftext|>"
|
||||
assert tokenizer.decode([tokenizer.eos_token_id]) == expected_eos_token
|
||||
assert tokenizer.decode([tokenizer.pad_token_id]) == expected_pad_token
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user