From cfd703fa3efaca1d708aa6589ff363d12ff77a33 Mon Sep 17 00:00:00 2001
From: Michael Feil <63565275+michaelfeil@users.noreply.github.com>
Date: Thu, 22 Jun 2023 14:23:19 +0200
Subject: [PATCH] fix: model_tokenizer in openai text completion tokenization
 details (#5104)

* fix: model_tokenizer

* Update test

---------

Co-authored-by: Sebastian Husch Lee
---
 haystack/utils/openai_utils.py  | 1 +
 test/utils/test_openai_utils.py | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/haystack/utils/openai_utils.py b/haystack/utils/openai_utils.py
index c733c0ae6..b866523eb 100644
--- a/haystack/utils/openai_utils.py
+++ b/haystack/utils/openai_utils.py
@@ -65,6 +65,7 @@ def _openai_text_completion_tokenization_details(model_name: str):
     """
     tokenizer_name = "gpt2"
     max_tokens_limit = 2049  # Based on this ref: https://platform.openai.com/docs/models/gpt-3
+    model_tokenizer = None
 
     if model_name == "gpt-35-turbo":
         # covering the lack of support in Tiktoken. https://github.com/openai/tiktoken/pull/72
diff --git a/test/utils/test_openai_utils.py b/test/utils/test_openai_utils.py
index c9d49a747..4896d5aef 100644
--- a/test/utils/test_openai_utils.py
+++ b/test/utils/test_openai_utils.py
@@ -16,8 +16,8 @@ from haystack.utils.openai_utils import (
 )
 
 
 @pytest.mark.unit
 def test_openai_text_completion_tokenization_details_gpt_default():
-    tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="text-ada-001")
-    assert tokenizer_name == "r50k_base"
+    tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="not-recognized-name")
+    assert tokenizer_name == "gpt2"
     assert max_tokens_limit == 2049
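
Note for reviewers (not part of the patch): the one-line fix initializes
model_tokenizer before the branch logic so the variable is always bound when
it is checked later. Below is a minimal sketch of the failure mode, assuming
the function's shape matches the hunk above; the helper name
_tokenization_details and the "cl100k_base"/"p50k_base" branches are
illustrative stand-ins, not the actual haystack implementation.

    from typing import Optional, Tuple

    def _tokenization_details(model_name: str) -> Tuple[str, int]:
        tokenizer_name = "gpt2"
        max_tokens_limit = 2049  # per https://platform.openai.com/docs/models/gpt-3

        # The patched line: without it, an unrecognized model_name leaves
        # model_tokenizer unbound, and the check below raises UnboundLocalError.
        model_tokenizer: Optional[str] = None

        if model_name == "gpt-35-turbo":
            # tiktoken lacks a mapping for the Azure-style name, so map it by hand
            model_tokenizer = "cl100k_base"
        elif model_name.startswith("text-davinci"):
            model_tokenizer = "p50k_base"
        # unknown names fall through without assigning model_tokenizer

        if model_tokenizer:  # safe now that model_tokenizer is always defined
            tokenizer_name = model_tokenizer
        return tokenizer_name, max_tokens_limit

    # Mirrors the updated unit test: unknown names fall back to the defaults.
    assert _tokenization_details("not-recognized-name") == ("gpt2", 2049)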