Improve BPE vocabulary saving and pair frequency handling (#539)

This commit is contained in:
Kasen 2025-02-19 23:51:04 +08:00 committed by GitHub
parent 58aabe7dd8
commit af4b73ca7b

View File

@ -629,7 +629,7 @@
" \"\"\"\n",
" # Save vocabulary\n",
" with open(vocab_path, \"w\", encoding=\"utf-8\") as file:\n",
" json.dump({k: v for k, v in self.vocab.items()}, file, ensure_ascii=False, indent=2)\n",
" json.dump(self.vocab, file, ensure_ascii=False, indent=2)\n",
"\n",
" # Save BPE merges as a list of dictionaries\n",
" with open(bpe_merges_path, \"w\", encoding=\"utf-8\") as file:\n",
@ -667,6 +667,9 @@
" def find_freq_pair(token_ids, mode=\"most\"):\n",
" pairs = Counter(zip(token_ids, token_ids[1:]))\n",
"\n",
" if not pairs:\n",
" return None\n",
"\n",
" if mode == \"most\":\n",
" return max(pairs.items(), key=lambda x: x[1])[0]\n",
" elif mode == \"least\":\n",