[compatibility issue] Support open-source LLM models for prompt tuning (#505)

Compatibility update: support non-OpenAI models for prompt tuning

Co-authored-by: Alonso Guevara <alonsog@microsoft.com>
This commit is contained in:
Kylin 2024-07-12 02:03:30 +08:00 committed by GitHub
parent 7a9c9071c1
commit c7da7f1afb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 12 additions and 1 deletions

View File

@ -0,0 +1,4 @@
{
"type": "patch",
    "description": "support non-OpenAI model config to prompt tune"
}

View File

@ -2,10 +2,12 @@
# Licensed under the MIT License
"""Utilities for working with tokens."""
import logging
import tiktoken
DEFAULT_ENCODING_NAME = "cl100k_base"
log = logging.getLogger(__name__)
def num_tokens_from_string(
@ -13,7 +15,12 @@ def num_tokens_from_string(
) -> int:
"""Return the number of tokens in a text string."""
if model is not None:
encoding = tiktoken.encoding_for_model(model)
try:
encoding = tiktoken.encoding_for_model(model)
except KeyError as e:
log.error(f"Failed to get encoding for {model} when getting num_tokens_from_string, "
f"fall back to default encoding {DEFAULT_ENCODING_NAME}")
encoding = tiktoken.get_encoding(DEFAULT_ENCODING_NAME)
else:
encoding = tiktoken.get_encoding(encoding_name or DEFAULT_ENCODING_NAME)
return len(encoding.encode(string))