fix: model_tokenizer in openai text completion tokenization details (#5104)

* fix: model_tokenizer

* Update test

---------

Co-authored-by: Sebastian Husch Lee <sjrl423@gmail.com>
This commit is contained in:
Michael Feil 2023-06-22 14:23:19 +02:00 committed by GitHub
parent 6a5fbb7118
commit cfd703fa3e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 3 additions and 2 deletions

View File

@@ -65,6 +65,7 @@ def _openai_text_completion_tokenization_details(model_name: str):
"""
tokenizer_name = "gpt2"
max_tokens_limit = 2049 # Based on this ref: https://platform.openai.com/docs/models/gpt-3
model_tokenizer = None
if model_name == "gpt-35-turbo":
# covering the lack of support in Tiktoken. https://github.com/openai/tiktoken/pull/72

View File

@@ -16,8 +16,8 @@ from haystack.utils.openai_utils import (
@pytest.mark.unit
def test_openai_text_completion_tokenization_details_gpt_default():
tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="text-ada-001")
assert tokenizer_name == "r50k_base"
tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="not-recognized-name")
assert tokenizer_name == "gpt2"
assert max_tokens_limit == 2049