Fix eos token usage in Qwen3 tokenizer

This commit is contained in:
rasbt 2025-08-05 13:42:18 -05:00
parent 5febcf8a1b
commit 06aa6d470a
No known key found for this signature in database

View File

@@ -536,14 +536,14 @@ class Qwen3Tokenizer:
self._special_to_id = {t: self._tok.token_to_id(t) for t in self._SPECIALS} self._special_to_id = {t: self._tok.token_to_id(t) for t in self._SPECIALS}
self.pad_token_id = self._special_to_id.get("<|endoftext|>") self.pad_token_id = self._special_to_id.get("<|endoftext|>")
self.eos_token_id = self.pad_token_id
if repo_id and "Base" not in repo_id: # Match HF behavior: chat model → <|im_end|>, base model → <|endoftext|>
eos_token = "<|im_end|>" fname = tok_file.name.lower()
if "base" in fname and "reasoning" not in fname:
self.eos_token = "<|endoftext|>"
else: else:
eos_token = "<|endoftext|>" self.eos_token = "<|im_end|>"
if eos_token in self._special_to_id: self.eos_token_id = self._special_to_id.get(self.eos_token)
self.eos_token_id = self._special_to_id[eos_token]
def encode(self, text, chat_wrapped=None): def encode(self, text, chat_wrapped=None):
if chat_wrapped is None: if chat_wrapped is None: