fix: Support for gpt-4-32k (#4825)

* Add step to look up tokenizers by prefix in openai_utils

* Updated tiktoken min version + openai_utils test

* Added test case for GPT-4 and Azure model naming

* Broken down tests

* Added default case

---------

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
This commit is contained in:
Farzad E 2023-05-12 10:02:12 -07:00 committed by GitHub
parent 179e9cea08
commit 6eb251d1f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 57 additions and 7 deletions

View File

@ -34,7 +34,7 @@ if sys.version_info >= (3, 8) and (machine in ["amd64", "x86_64"] or (machine ==
if USE_TIKTOKEN:
import tiktoken # pylint: disable=import-error
from tiktoken.model import MODEL_TO_ENCODING
from tiktoken.model import MODEL_TO_ENCODING, MODEL_PREFIX_TO_ENCODING
else:
logger.warning(
"OpenAI tiktoken module is not available for Python < 3.8,Linux ARM64 and AARCH64. Falling back to GPT2TokenizerFast."
@ -97,11 +97,18 @@ def _openai_text_completion_tokenization_details(model_name: str):
"""
tokenizer_name = "gpt2"
max_tokens_limit = 2049 # Based on this ref: https://platform.openai.com/docs/models/gpt-3
model_tokenizer = MODEL_TO_ENCODING.get(model_name) if USE_TIKTOKEN else None
# covering the lack of support in Tiktoken. https://github.com/openai/tiktoken/pull/72
if model_name == "gpt-35-turbo" and USE_TIKTOKEN:
model_tokenizer = "cl100k_base"
if USE_TIKTOKEN:
if model_name == "gpt-35-turbo":
# covering the lack of support in Tiktoken. https://github.com/openai/tiktoken/pull/72
model_tokenizer = "cl100k_base"
elif model_name in MODEL_TO_ENCODING:
model_tokenizer = MODEL_TO_ENCODING[model_name]
else:
for model_prefix, tokenizer in MODEL_PREFIX_TO_ENCODING.items():
if model_name.startswith(model_prefix):
model_tokenizer = tokenizer
break
if model_tokenizer:
# Based on OpenAI models page, 'davinci' considers have 2049 tokens,

View File

@ -76,7 +76,7 @@ dependencies = [
"sentence-transformers>=2.2.0",
# OpenAI tokenizer
"tiktoken>=0.3.0; python_version >= '3.8' and (platform_machine == 'AMD64' or platform_machine == 'amd64' or platform_machine == 'x86_64' or (platform_machine == 'arm64' and platform_system == 'Darwin'))",
"tiktoken>=0.3.2; python_version >= '3.8' and (platform_machine == 'AMD64' or platform_machine == 'amd64' or platform_machine == 'x86_64' or (platform_machine == 'arm64' and platform_system == 'Darwin'))",
# Schema validation
"jsonschema",

View File

@ -1,10 +1,53 @@
import pytest
from unittest.mock import patch
import pytest
from tenacity import wait_none
from haystack.errors import OpenAIError, OpenAIRateLimitError, OpenAIUnauthorizedError
from haystack.utils.openai_utils import openai_request
from haystack.utils.openai_utils import openai_request, _openai_text_completion_tokenization_details
@pytest.mark.unit
def test_openai_text_completion_tokenization_details_gpt_default():
    """A model with no dedicated tiktoken mapping resolves to the r50k_base
    tokenizer and the default 2049-token limit."""
    details = _openai_text_completion_tokenization_details(model_name="text-ada-001")
    assert details == ("r50k_base", 2049)
@pytest.mark.unit
def test_openai_text_completion_tokenization_details_gpt_davinci():
    """text-davinci-003 resolves to the p50k_base tokenizer with a
    4097-token limit."""
    details = _openai_text_completion_tokenization_details(model_name="text-davinci-003")
    assert details == ("p50k_base", 4097)
@pytest.mark.unit
def test_openai_text_completion_tokenization_details_gpt3_5_azure():
    """The Azure model name 'gpt-35-turbo' (no dot) is special-cased to the
    cl100k_base tokenizer with a 4096-token limit."""
    details = _openai_text_completion_tokenization_details(model_name="gpt-35-turbo")
    assert details == ("cl100k_base", 4096)
@pytest.mark.unit
def test_openai_text_completion_tokenization_details_gpt3_5():
    """The canonical OpenAI name 'gpt-3.5-turbo' resolves to cl100k_base
    with a 4096-token limit."""
    details = _openai_text_completion_tokenization_details(model_name="gpt-3.5-turbo")
    assert details == ("cl100k_base", 4096)
@pytest.mark.unit
def test_openai_text_completion_tokenization_details_gpt_4():
    """gpt-4 resolves to the cl100k_base tokenizer with an 8192-token limit."""
    details = _openai_text_completion_tokenization_details(model_name="gpt-4")
    assert details == ("cl100k_base", 8192)
@pytest.mark.unit
def test_openai_text_completion_tokenization_details_gpt_4_32k():
    """gpt-4-32k is matched by prefix ('gpt-4-') to cl100k_base and gets the
    extended 32768-token limit."""
    details = _openai_text_completion_tokenization_details(model_name="gpt-4-32k")
    assert details == ("cl100k_base", 32768)
@pytest.mark.unit