fix: Support for gpt-4-32k (#4825)

* Add step to look up tokenizers by prefix in openai_utils

* Updated tiktoken min version + openai_utils test

* Added test case for GPT-4 and Azure model naming

* Broken down tests

* Added default case

---------

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
This commit is contained in:
Farzad E 2023-05-12 10:02:12 -07:00 committed by GitHub
parent 179e9cea08
commit 6eb251d1f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 57 additions and 7 deletions

View File

@ -34,7 +34,7 @@ if sys.version_info >= (3, 8) and (machine in ["amd64", "x86_64"] or (machine ==
if USE_TIKTOKEN:
    import tiktoken  # pylint: disable=import-error
from tiktoken.model import MODEL_TO_ENCODING from tiktoken.model import MODEL_TO_ENCODING, MODEL_PREFIX_TO_ENCODING
else:
    logger.warning(
"OpenAI tiktoken module is not available for Python < 3.8,Linux ARM64 and AARCH64. Falling back to GPT2TokenizerFast." "OpenAI tiktoken module is not available for Python < 3.8,Linux ARM64 and AARCH64. Falling back to GPT2TokenizerFast."
@ -97,11 +97,18 @@ def _openai_text_completion_tokenization_details(model_name: str):
""" """
tokenizer_name = "gpt2" tokenizer_name = "gpt2"
max_tokens_limit = 2049 # Based on this ref: https://platform.openai.com/docs/models/gpt-3 max_tokens_limit = 2049 # Based on this ref: https://platform.openai.com/docs/models/gpt-3
model_tokenizer = MODEL_TO_ENCODING.get(model_name) if USE_TIKTOKEN else None
# covering the lack of support in Tiktoken. https://github.com/openai/tiktoken/pull/72 if USE_TIKTOKEN:
if model_name == "gpt-35-turbo" and USE_TIKTOKEN: if model_name == "gpt-35-turbo":
model_tokenizer = "cl100k_base" # covering the lack of support in Tiktoken. https://github.com/openai/tiktoken/pull/72
model_tokenizer = "cl100k_base"
elif model_name in MODEL_TO_ENCODING:
model_tokenizer = MODEL_TO_ENCODING[model_name]
else:
for model_prefix, tokenizer in MODEL_PREFIX_TO_ENCODING.items():
if model_name.startswith(model_prefix):
model_tokenizer = tokenizer
break
    if model_tokenizer:
        # Based on OpenAI models page, 'davinci' considers have 2049 tokens,

View File

@ -76,7 +76,7 @@ dependencies = [
"sentence-transformers>=2.2.0", "sentence-transformers>=2.2.0",
# OpenAI tokenizer # OpenAI tokenizer
"tiktoken>=0.3.0; python_version >= '3.8' and (platform_machine == 'AMD64' or platform_machine == 'amd64' or platform_machine == 'x86_64' or (platform_machine == 'arm64' and platform_system == 'Darwin'))", "tiktoken>=0.3.2; python_version >= '3.8' and (platform_machine == 'AMD64' or platform_machine == 'amd64' or platform_machine == 'x86_64' or (platform_machine == 'arm64' and platform_system == 'Darwin'))",
    # Schema validation
    "jsonschema",

View File

@ -1,10 +1,53 @@
import pytest
from unittest.mock import patch
import pytest
from tenacity import wait_none
from haystack.errors import OpenAIError, OpenAIRateLimitError, OpenAIUnauthorizedError
from haystack.utils.openai_utils import openai_request from haystack.utils.openai_utils import openai_request, _openai_text_completion_tokenization_details
@pytest.mark.unit
def test_openai_text_completion_tokenization_details_gpt_default():
tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="text-ada-001")
assert tokenizer_name == "r50k_base"
assert max_tokens_limit == 2049
@pytest.mark.unit
def test_openai_text_completion_tokenization_details_gpt_davinci():
tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="text-davinci-003")
assert tokenizer_name == "p50k_base"
assert max_tokens_limit == 4097
@pytest.mark.unit
def test_openai_text_completion_tokenization_details_gpt3_5_azure():
tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="gpt-35-turbo")
assert tokenizer_name == "cl100k_base"
assert max_tokens_limit == 4096
@pytest.mark.unit
def test_openai_text_completion_tokenization_details_gpt3_5():
tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="gpt-3.5-turbo")
assert tokenizer_name == "cl100k_base"
assert max_tokens_limit == 4096
@pytest.mark.unit
def test_openai_text_completion_tokenization_details_gpt_4():
tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="gpt-4")
assert tokenizer_name == "cl100k_base"
assert max_tokens_limit == 8192
@pytest.mark.unit
def test_openai_text_completion_tokenization_details_gpt_4_32k():
tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="gpt-4-32k")
assert tokenizer_name == "cl100k_base"
assert max_tokens_limit == 32768
@pytest.mark.unit