From 6eb251d1f09a2cbc38ceb41abc085dc19d71a660 Mon Sep 17 00:00:00 2001 From: Farzad E <372719+dnetguru@users.noreply.github.com> Date: Fri, 12 May 2023 10:02:12 -0700 Subject: [PATCH] fix: Support for gpt-4-32k (#4825) * Add step to look up tokenizers by prefix in openai_utils * Updated tiktoken min version + openai_utils test * Added test case for GPT-4 and Azure model naming * Broken down tests * Added default case --------- Co-authored-by: ZanSara --- haystack/utils/openai_utils.py | 17 +++++++++---- pyproject.toml | 2 +- test/utils/test_openai_utils.py | 45 ++++++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 7 deletions(-) diff --git a/haystack/utils/openai_utils.py b/haystack/utils/openai_utils.py index ad6638f71..1bb51d37a 100644 --- a/haystack/utils/openai_utils.py +++ b/haystack/utils/openai_utils.py @@ -34,7 +34,7 @@ if sys.version_info >= (3, 8) and (machine in ["amd64", "x86_64"] or (machine == if USE_TIKTOKEN: import tiktoken # pylint: disable=import-error - from tiktoken.model import MODEL_TO_ENCODING + from tiktoken.model import MODEL_TO_ENCODING, MODEL_PREFIX_TO_ENCODING else: logger.warning( "OpenAI tiktoken module is not available for Python < 3.8,Linux ARM64 and AARCH64. Falling back to GPT2TokenizerFast." ) @@ -97,11 +97,18 @@ def _openai_text_completion_tokenization_details(model_name: str): """ tokenizer_name = "gpt2" max_tokens_limit = 2049 # Based on this ref: https://platform.openai.com/docs/models/gpt-3 - model_tokenizer = MODEL_TO_ENCODING.get(model_name) if USE_TIKTOKEN else None - # covering the lack of support in Tiktoken. https://github.com/openai/tiktoken/pull/72 - if model_name == "gpt-35-turbo" and USE_TIKTOKEN: - model_tokenizer = "cl100k_base" + if USE_TIKTOKEN: + if model_name == "gpt-35-turbo": + # covering the lack of support in Tiktoken. 
https://github.com/openai/tiktoken/pull/72 + model_tokenizer = "cl100k_base" + elif model_name in MODEL_TO_ENCODING: + model_tokenizer = MODEL_TO_ENCODING[model_name] + else: + for model_prefix, tokenizer in MODEL_PREFIX_TO_ENCODING.items(): + if model_name.startswith(model_prefix): + model_tokenizer = tokenizer + break if model_tokenizer: # Based on OpenAI models page, 'davinci' considers have 2049 tokens, diff --git a/pyproject.toml b/pyproject.toml index 74b9667f7..9499ac9c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ dependencies = [ "sentence-transformers>=2.2.0", # OpenAI tokenizer - "tiktoken>=0.3.0; python_version >= '3.8' and (platform_machine == 'AMD64' or platform_machine == 'amd64' or platform_machine == 'x86_64' or (platform_machine == 'arm64' and platform_system == 'Darwin'))", + "tiktoken>=0.3.2; python_version >= '3.8' and (platform_machine == 'AMD64' or platform_machine == 'amd64' or platform_machine == 'x86_64' or (platform_machine == 'arm64' and platform_system == 'Darwin'))", # Schema validation "jsonschema", diff --git a/test/utils/test_openai_utils.py b/test/utils/test_openai_utils.py index cc2594fbc..fe11acd2e 100644 --- a/test/utils/test_openai_utils.py +++ b/test/utils/test_openai_utils.py @@ -1,10 +1,53 @@ +import pytest from unittest.mock import patch import pytest from tenacity import wait_none from haystack.errors import OpenAIError, OpenAIRateLimitError, OpenAIUnauthorizedError -from haystack.utils.openai_utils import openai_request +from haystack.utils.openai_utils import openai_request, _openai_text_completion_tokenization_details + + +@pytest.mark.unit +def test_openai_text_completion_tokenization_details_gpt_default(): + tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="text-ada-001") + assert tokenizer_name == "r50k_base" + assert max_tokens_limit == 2049 + + +@pytest.mark.unit +def test_openai_text_completion_tokenization_details_gpt_davinci(): + tokenizer_name, 
max_tokens_limit = _openai_text_completion_tokenization_details(model_name="text-davinci-003") + assert tokenizer_name == "p50k_base" + assert max_tokens_limit == 4097 + + +@pytest.mark.unit +def test_openai_text_completion_tokenization_details_gpt3_5_azure(): + tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="gpt-35-turbo") + assert tokenizer_name == "cl100k_base" + assert max_tokens_limit == 4096 + + +@pytest.mark.unit +def test_openai_text_completion_tokenization_details_gpt3_5(): + tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="gpt-3.5-turbo") + assert tokenizer_name == "cl100k_base" + assert max_tokens_limit == 4096 + + +@pytest.mark.unit +def test_openai_text_completion_tokenization_details_gpt_4(): + tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="gpt-4") + assert tokenizer_name == "cl100k_base" + assert max_tokens_limit == 8192 + + +@pytest.mark.unit +def test_openai_text_completion_tokenization_details_gpt_4_32k(): + tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="gpt-4-32k") + assert tokenizer_name == "cl100k_base" + assert max_tokens_limit == 32768 @pytest.mark.unit