From af4b73ca7ba70a49f44ea47aaa5da6284deb7c0e Mon Sep 17 00:00:00 2001 From: Kasen <18170166+imkasen@users.noreply.github.com> Date: Wed, 19 Feb 2025 23:51:04 +0800 Subject: [PATCH] Improve BPE vocabulary saving and pair frequency handling (#539) --- ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb index 16ce548..859cc78 100644 --- a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb +++ b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb @@ -629,7 +629,7 @@ " \"\"\"\n", " # Save vocabulary\n", " with open(vocab_path, \"w\", encoding=\"utf-8\") as file:\n", - " json.dump({k: v for k, v in self.vocab.items()}, file, ensure_ascii=False, indent=2)\n", + " json.dump(self.vocab, file, ensure_ascii=False, indent=2)\n", "\n", " # Save BPE merges as a list of dictionaries\n", " with open(bpe_merges_path, \"w\", encoding=\"utf-8\") as file:\n", @@ -667,6 +667,9 @@ " def find_freq_pair(token_ids, mode=\"most\"):\n", " pairs = Counter(zip(token_ids, token_ids[1:]))\n", "\n", + " if not pairs:\n", + " return None\n", + "\n", " if mode == \"most\":\n", " return max(pairs.items(), key=lambda x: x[1])[0]\n", " elif mode == \"least\":\n",