From 6eb251d1f09a2cbc38ceb41abc085dc19d71a660 Mon Sep 17 00:00:00 2001 From: Farzad E <372719+dnetguru@users.noreply.github.com> Date: Fri, 12 May 2023 10:02:12 -0700 Subject: [PATCH] fix: Support for gpt-4-32k (#4825) * Add step to look up tokenizers by prefix in openai_utils * Updated tiktoken min version + openai_utils test * Added test case for GPT-4 and Azure model naming * Broken down tests * Added default case --------- Co-authored-by: ZanSara --- haystack/utils/openai_utils.py | 17 +++++++++---- pyproject.toml | 2 +- test/utils/test_openai_utils.py | 45 ++++++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 7 deletions(-) diff --git a/haystack/utils/openai_utils.py b/haystack/utils/openai_utils.py index ad6638f71..1bb51d37a 100644 --- a/haystack/utils/openai_utils.py +++ b/haystack/utils/openai_utils.py @@ -34,7 +34,7 @@ if sys.version_info >= (3, 8) and (machine in ["amd64", "x86_64"] or (machine == if USE_TIKTOKEN: import tiktoken # pylint: disable=import-error - from tiktoken.model import MODEL_TO_ENCODING + from tiktoken.model import MODEL_TO_ENCODING, MODEL_PREFIX_TO_ENCODING else: logger.warning( "OpenAI tiktoken module is not available for Python < 3.8,Linux ARM64 and AARCH64. Falling back to GPT2TokenizerFast." ) @@ -97,11 +97,18 @@ def _openai_text_completion_tokenization_details(model_name: str): """ tokenizer_name = "gpt2" max_tokens_limit = 2049 # Based on this ref: https://platform.openai.com/docs/models/gpt-3 - model_tokenizer = MODEL_TO_ENCODING.get(model_name) if USE_TIKTOKEN else None - # covering the lack of support in Tiktoken. https://github.com/openai/tiktoken/pull/72 - if model_name == "gpt-35-turbo" and USE_TIKTOKEN: - model_tokenizer = "cl100k_base" + if USE_TIKTOKEN: + if model_name == "gpt-35-turbo": + # covering the lack of support in Tiktoken. 
https://github.com/openai/tiktoken/pull/72 + model_tokenizer = "cl100k_base" + elif model_name in MODEL_TO_ENCODING: + model_tokenizer = MODEL_TO_ENCODING[model_name] + else: + for model_prefix, tokenizer in MODEL_PREFIX_TO_ENCODING.items(): + if model_name.startswith(model_prefix): + model_tokenizer = tokenizer + break if model_tokenizer: # Based on OpenAI models page, 'davinci' considers have 2049 tokens, diff --git a/pyproject.toml b/pyproject.toml index 74b9667f7..9499ac9c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ dependencies = [ "sentence-transformers>=2.2.0", # OpenAI tokenizer - "tiktoken>=0.3.0; python_version >= '3.8' and (platform_machine == 'AMD64' or platform_machine == 'amd64' or platform_machine == 'x86_64' or (platform_machine == 'arm64' and platform_system == 'Darwin'))", + "tiktoken>=0.3.2; python_version >= '3.8' and (platform_machine == 'AMD64' or platform_machine == 'amd64' or platform_machine == 'x86_64' or (platform_machine == 'arm64' and platform_system == 'Darwin'))", # Schema validation "jsonschema", diff --git a/test/utils/test_openai_utils.py b/test/utils/test_openai_utils.py index cc2594fbc..fe11acd2e 100644 --- a/test/utils/test_openai_utils.py +++ b/test/utils/test_openai_utils.py @@ -1,10 +1,53 @@ +import pytest from unittest.mock import patch import pytest from tenacity import wait_none from haystack.errors import OpenAIError, OpenAIRateLimitError, OpenAIUnauthorizedError -from haystack.utils.openai_utils import openai_request +from haystack.utils.openai_utils import openai_request, _openai_text_completion_tokenization_details + + +@pytest.mark.unit +def test_openai_text_completion_tokenization_details_gpt_default(): + tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="text-ada-001") + assert tokenizer_name == "r50k_base" + assert max_tokens_limit == 2049 + + +@pytest.mark.unit +def test_openai_text_completion_tokenization_details_gpt_davinci(): + tokenizer_name, 
max_tokens_limit = _openai_text_completion_tokenization_details(model_name="text-davinci-003") + assert tokenizer_name == "p50k_base" + assert max_tokens_limit == 4097 + + +@pytest.mark.unit +def test_openai_text_completion_tokenization_details_gpt3_5_azure(): + tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="gpt-35-turbo") + assert tokenizer_name == "cl100k_base" + assert max_tokens_limit == 4096 + + +@pytest.mark.unit +def test_openai_text_completion_tokenization_details_gpt3_5(): + tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="gpt-3.5-turbo") + assert tokenizer_name == "cl100k_base" + assert max_tokens_limit == 4096 + + +@pytest.mark.unit +def test_openai_text_completion_tokenization_details_gpt_4(): + tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="gpt-4") + assert tokenizer_name == "cl100k_base" + assert max_tokens_limit == 8192 + + +@pytest.mark.unit +def test_openai_text_completion_tokenization_details_gpt_4_32k(): + tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="gpt-4-32k") + assert tokenizer_name == "cl100k_base" + assert max_tokens_limit == 32768 @pytest.mark.unit