ragflow/rag/llm/chat_model.py

#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
from zhipuai import ZhipuAI
from dashscope import Generation
from abc import ABC
from openai import OpenAI
import openai
from ollama import Client
from rag.nlp import is_english
from rag.utils import num_tokens_from_string


class Base(ABC):
    """Common interface shared by the chat-model wrappers in this module.

    Concrete subclasses override :meth:`chat` to talk to a specific backend.
    """

    def __init__(self, key, model_name):
        """Accept the constructor arguments; the base class stores nothing.

        Args:
            key: Credential for the backend (unused here).
            model_name: Name of the model to use (unused here).
        """
        pass

    def chat(self, system, history, gen_conf):
        """Produce a reply for a conversation; must be overridden.

        Args:
            system: System prompt text.
            history: List of chat messages.
            gen_conf: Generation settings.

        Raises:
            NotImplementedError: Always, because the base class has no backend.
        """
        # The message names the method that is actually missing ("chat");
        # it previously said "encode", copied from another interface.
        raise NotImplementedError("Please implement chat method!")


class GptTurbo(Base):
    """Chat wrapper around the OpenAI chat-completions client."""

    def __init__(self, key, model_name="gpt-3.5-turbo", base_url="https://api.openai.com/v1"):
        """Create the OpenAI client.

        Args:
            key: API key passed to the OpenAI client.
            model_name: Model identifier sent with every request.
            base_url: Endpoint URL; a falsy value (``None`` or ``""``) falls
                back to the public OpenAI endpoint.
        """
        if not base_url:
            base_url = "https://api.openai.com/v1"
        self.client = OpenAI(api_key=key, base_url=base_url)
        self.model_name = model_name

    def chat(self, system, history, gen_conf):
        """Send the conversation to the model and return its reply.

        Args:
            system: System prompt; when truthy it is sent as the first message.
            history: List of ``{"role": ..., "content": ...}`` messages. The
                list is copied, never modified.
            gen_conf: Extra keyword arguments forwarded to
                ``chat.completions.create``.

        Returns:
            A ``(answer, completion_tokens)`` tuple. On ``openai.APIError`` the
            answer is ``"**ERROR**: <message>"`` and the token count is 0.
        """
        # Work on a copy: inserting into the caller's list would stack a new
        # system message on every call that reuses the same history.
        messages = list(history)
        if system:
            messages.insert(0, {"role": "system", "content": system})
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                **gen_conf)
            choice = response.choices[0]
            # The message content is nullable; treat a missing body as empty
            # text instead of failing on ``None.strip()``.
            ans = (choice.message.content or "").strip()
            if choice.finish_reason == "length":
                # The reply hit the token limit; append a notice in the
                # language picked by is_english.
                if is_english([ans]):
                    ans += "...\nFor the content length reason, it stopped, continue?"
                else:
                    ans += "······\n由于长度的原因，回答被截断了，要继续吗？"
            return ans, response.usage.completion_tokens
        except openai.APIError as e:
            return "**ERROR**: " + str(e), 0


class MoonshotChat(GptTurbo):
    """Chat wrapper for Moonshot, reached through the OpenAI client.

    Only the defaults differ from :class:`GptTurbo`; ``chat`` is inherited
    rather than duplicated, since the two implementations were identical.
    """

    def __init__(self, key, model_name="moonshot-v1-8k", base_url="https://api.moonshot.cn/v1"):
        """Create the client pointed at the Moonshot endpoint.

        Args:
            key: API key passed to the OpenAI client.
            model_name: Model identifier sent with every request.
            base_url: Endpoint URL; a falsy value falls back to the Moonshot
                endpoint.
        """
        # Resolve the Moonshot default here, before delegating: the parent
        # would otherwise substitute the OpenAI URL for a falsy base_url.
        if not base_url:
            base_url = "https://api.moonshot.cn/v1"
        super().__init__(key, model_name, base_url)


class QWenChat(Base):
    """Chat wrapper around DashScope's ``Generation`` API."""

    def __init__(self, key, model_name=Generation.Models.qwen_turbo, **kwargs):
        """Configure DashScope.

        Args:
            key: API key; assigned to the module-level ``dashscope.api_key``.
            model_name: Model identifier passed to ``Generation.call``.
            **kwargs: Accepted and ignored.
        """
        import dashscope
        dashscope.api_key = key
        self.model_name = model_name

    def chat(self, system, history, gen_conf):
        """Send the conversation to the model and return its reply.

        Args:
            system: System prompt; when truthy it is sent as the first message.
            history: List of ``{"role": ..., "content": ...}`` messages. The
                list is copied, never modified.
            gen_conf: Extra keyword arguments forwarded to ``Generation.call``.

        Returns:
            A ``(answer, total_tokens)`` tuple. When the response status is not
            OK the answer is ``"**ERROR**: <message>"`` and the count is 0.
        """
        from http import HTTPStatus

        # Work on a copy so a reused history does not accumulate system
        # messages across calls.
        messages = list(history)
        if system:
            messages.insert(0, {"role": "system", "content": system})
        response = Generation.call(
            self.model_name,
            messages=messages,
            result_format='message',
            **gen_conf
        )
        if response.status_code != HTTPStatus.OK:
            # str() keeps the concatenation safe if the message is not a str.
            return "**ERROR**: " + str(response.message), 0

        choice = response.output.choices[0]
        ans = "" + choice['message']['content']
        tk_count = response.usage.total_tokens
        if choice.get("finish_reason", "") == "length":
            # The reply hit the token limit; append a notice in the language
            # picked by is_english.
            if is_english([ans]):
                ans += "...\nFor the content length reason, it stopped, continue?"
            else:
                ans += "······\n由于长度的原因，回答被截断了，要继续吗？"
        return ans, tk_count


class ZhipuChat(Base):
    """Chat wrapper around the ZhipuAI client."""

    def __init__(self, key, model_name="glm-3-turbo", **kwargs):
        """Create the ZhipuAI client.

        Args:
            key: API key passed to the ZhipuAI client.
            model_name: Model identifier sent with every request.
            **kwargs: Accepted and ignored.
        """
        self.client = ZhipuAI(api_key=key)
        self.model_name = model_name

    def chat(self, system, history, gen_conf):
        """Send the conversation to the model and return its reply.

        Args:
            system: System prompt; when truthy it is sent as the first message.
            history: List of ``{"role": ..., "content": ...}`` messages. The
                list is copied, never modified.
            gen_conf: Generation settings. ``presence_penalty`` and
                ``frequency_penalty`` are left out of the request; the dict
                itself is not modified.

        Returns:
            A ``(answer, completion_tokens)`` tuple. On any exception the
            answer is ``"**ERROR**: <message>"`` and the token count is 0.
        """
        # Work on a copy so a reused history does not accumulate system
        # messages across calls.
        messages = list(history)
        if system:
            messages.insert(0, {"role": "system", "content": system})
        try:
            # Build a filtered copy instead of deleting keys in place:
            # the caller may reuse the same gen_conf with another model
            # and must not lose its penalty settings.
            conf = {
                k: v for k, v in gen_conf.items()
                if k not in ("presence_penalty", "frequency_penalty")
            }
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                **conf
            )
            choice = response.choices[0]
            ans = choice.message.content.strip()
            if choice.finish_reason == "length":
                # The reply hit the token limit; append a notice in the
                # language picked by is_english.
                if is_english([ans]):
                    ans += "...\nFor the content length reason, it stopped, continue?"
                else:
                    ans += "······\n由于长度的原因，回答被截断了，要继续吗？"
            return ans, response.usage.completion_tokens
        except Exception as e:
            return "**ERROR**: " + str(e), 0


class OllamaChat(Base):
    """Chat wrapper around the Ollama client."""

    def __init__(self, key, model_name, **kwargs):
        """Create the Ollama client.

        Args:
            key: Unused by this wrapper.
            model_name: Model identifier sent with every request.
            **kwargs: Must contain ``base_url``, the Ollama server address.

        Raises:
            KeyError: If ``base_url`` is not supplied.
        """
        self.client = Client(host=kwargs["base_url"])
        self.model_name = model_name

    def chat(self, system, history, gen_conf):
        """Send the conversation to the model and return its reply.

        Args:
            system: System prompt; when truthy it is sent as the first message.
            history: List of ``{"role": ..., "content": ...}`` messages. The
                list is copied, never modified.
            gen_conf: Generation settings; ``temperature``, ``max_tokens``,
                ``top_p``, ``presence_penalty`` and ``frequency_penalty`` are
                read, each with a fallback default.

        Returns:
            A ``(answer, eval_count)`` tuple. On any exception the answer is
            ``"**ERROR**: <message>"`` and the token count is 0.
        """
        # Work on a copy so a reused history does not accumulate system
        # messages across calls.
        messages = list(history)
        if system:
            messages.insert(0, {"role": "system", "content": system})
        try:
            options = {
                "temperature": gen_conf.get("temperature", 0.1),
                # Ollama names the generation length limit "num_predict".
                "num_predict": gen_conf.get("max_tokens", 128),
                # Send top_p under its own key. It was previously sent as
                # "top_k", a different sampling option.
                "top_p": gen_conf.get("top_p", 0.3),
                "presence_penalty": gen_conf.get("presence_penalty", 0.4),
                "frequency_penalty": gen_conf.get("frequency_penalty", 0.7),
            }
            response = self.client.chat(
                model=self.model_name,
                messages=messages,
                options=options
            )
            ans = response["message"]["content"].strip()
            return ans, response["eval_count"]
        except Exception as e:
            return "**ERROR**: " + str(e), 0


class LocalLLM(Base):
    """Chat wrapper that forwards requests to a local model server over RPC."""

    class RPCProxy:
        """Minimal RPC client over ``multiprocessing.connection``.

        Any public attribute access yields a callable that sends
        ``(name, args, kwargs)`` to the server and returns the reply.
        """

        def __init__(self, host, port):
            """Remember the server address and open the first connection.

            Args:
                host: Server host name or address.
                port: Server port; converted with ``int()``.
            """
            self.host = host
            self.port = int(port)
            self.__conn()

        def __conn(self):
            """Open a new connection and store it as ``self._connection``."""
            from multiprocessing.connection import Client
            # NOTE(security): the authkey is hard-coded in source; anyone with
            # this file can authenticate to the server. Consider loading it
            # from configuration.
            self._connection = Client(
                (self.host, self.port), authkey=b'infiniflow-token4kevinhu')

        def __reconnect(self):
            """Close the current connection, if any, then open a new one."""
            stale = self.__dict__.pop("_connection", None)
            if stale is not None:
                try:
                    stale.close()
                except Exception:
                    # Best effort: the connection is already broken, so a
                    # failure to close it changes nothing for the retry.
                    pass
            self.__conn()

        def __getattr__(self, name):
            """Return a callable that performs the remote call ``name``.

            Raises:
                AttributeError: For names starting with an underscore, so that
                    missing private attributes and dunder probes are not
                    turned into remote calls.
            """
            import pickle

            if name.startswith("_"):
                raise AttributeError(name)

            def do_rpc(*args, **kwargs):
                """Send the call and return the reply, trying up to 3 times.

                Raises:
                    Exception: "RPC connection lost!" once all attempts have
                        failed. An error raised while reconnecting propagates
                        directly.
                """
                last_err = None
                for _ in range(3):
                    try:
                        self._connection.send(
                            pickle.dumps((name, args, kwargs)))
                        # NOTE(security): pickle.loads runs on bytes received
                        # from the network; this is only safe if the peer is
                        # trusted.
                        return pickle.loads(self._connection.recv())
                    except Exception as e:
                        last_err = e
                        self.__reconnect()
                # Keep the last failure attached as the cause for debugging.
                raise Exception("RPC connection lost!") from last_err

            return do_rpc

    def __init__(self, *args, **kwargs):
        """Connect to the local model server at 127.0.0.1:7860.

        All positional and keyword arguments are accepted and ignored.
        """
        self.client = LocalLLM.RPCProxy("127.0.0.1", 7860)

    def chat(self, system, history, gen_conf):
        """Send the conversation to the local server and return its reply.

        Args:
            system: System prompt; when truthy it is sent as the first message.
            history: List of ``{"role": ..., "content": ...}`` messages. The
                list is copied, never modified.
            gen_conf: Generation settings, passed to the server unchanged.

        Returns:
            A ``(answer, token_count)`` tuple where the count comes from
            ``num_tokens_from_string(answer)``. On any exception the answer is
            ``"**ERROR**: <message>"`` and the token count is 0.
        """
        # Work on a copy so a reused history does not accumulate system
        # messages across calls.
        messages = list(history)
        if system:
            messages.insert(0, {"role": "system", "content": system})
        try:
            ans = self.client.chat(
                messages,
                gen_conf
            )
            return ans, num_tokens_from_string(ans)
        except Exception as e:
            return "**ERROR**: " + str(e), 0
build python version rag-flow (#21) * clean rust version project * clean rust version project * build python version rag-flow 2024-01-15 08:46:22 +08:00			`#`
llm configuation refine and trievalTest API refine (#40) 2024-01-19 19:51:57 +08:00			`# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.`
build python version rag-flow (#21) * clean rust version project * clean rust version project * build python version rag-flow 2024-01-15 08:46:22 +08:00			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`from zhipuai import ZhipuAI`
			`from dashscope import Generation`
use minio to store uploaded files; build dialog server; (#16) * format code * use minio to store uploaded files; build dialog server; 2023-12-25 19:05:59 +08:00			`from abc import ABC`
add llm API (#19) * add llm API * refine llm API 2023-12-28 13:50:13 +08:00			`from openai import OpenAI`
refine admin initialization (#75) 2024-02-27 14:57:34 +08:00			`import openai`
Support Ollama (#261) ### What problem does this PR solve? Issue link:#221 ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2024-04-08 19:20:57 +08:00			`from ollama import Client`
deal with stop reason being length problem (#109) 2024-03-07 16:12:01 +08:00			`from rag.nlp import is_english`
add local llm implementation (#119) 2024-03-12 11:57:08 +08:00			`from rag.utils import num_tokens_from_string`
deal with stop reason being length problem (#109) 2024-03-07 16:12:01 +08:00
add llm API (#19) * add llm API * refine llm API 2023-12-28 13:50:13 +08:00
use minio to store uploaded files; build dialog server; (#16) * format code * use minio to store uploaded files; build dialog server; 2023-12-25 19:05:59 +08:00			`class Base(ABC):`
Test APIs and fix bugs (#41) 2024-01-22 19:51:38 +08:00			`def __init__(self, key, model_name):`
			`pass`

use minio to store uploaded files; build dialog server; (#16) * format code * use minio to store uploaded files; build dialog server; 2023-12-25 19:05:59 +08:00			`def chat(self, system, history, gen_conf):`
			`raise NotImplementedError("Please implement encode method!")`


			`class GptTurbo(Base):`
add base url for OpenAI (#166) 2024-03-28 19:15:16 +08:00			`def __init__(self, key, model_name="gpt-3.5-turbo", base_url="https://api.openai.com/v1"):`
			`if not base_url: base_url="https://api.openai.com/v1"`
			`self.client = OpenAI(api_key=key, base_url=base_url)`
Test APIs and fix bugs (#41) 2024-01-22 19:51:38 +08:00			`self.model_name = model_name`
use minio to store uploaded files; build dialog server; (#16) * format code * use minio to store uploaded files; build dialog server; 2023-12-25 19:05:59 +08:00
			`def chat(self, system, history, gen_conf):`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`if system:`
			`history.insert(0, {"role": "system", "content": system})`
refine admin initialization (#75) 2024-02-27 14:57:34 +08:00			`try:`
deal with stop reason being length problem (#109) 2024-03-07 16:12:01 +08:00			`response = self.client.chat.completions.create(`
refine admin initialization (#75) 2024-02-27 14:57:34 +08:00			`model=self.model_name,`
			`messages=history,`
			`**gen_conf)`
refine OpenAi Api (#159) 2024-03-27 17:55:45 +08:00			`ans = response.choices[0].message.content.strip()`
			`if response.choices[0].finish_reason == "length":`
deal with stop reason being length problem (#109) 2024-03-07 16:12:01 +08:00			`ans += "...\nFor the content length reason, it stopped, continue?" if is_english(`
			`[ans]) else "······\n由于长度的原因，回答被截断了，要继续吗？"`
			`return ans, response.usage.completion_tokens`
refine admin initialization (#75) 2024-02-27 14:57:34 +08:00			`except openai.APIError as e:`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`return "ERROR: " + str(e), 0`
use minio to store uploaded files; build dialog server; (#16) * format code * use minio to store uploaded files; build dialog server; 2023-12-25 19:05:59 +08:00

add dockerfile for cuda envirement. Refine table search strategy, (#123) 2024-03-14 19:45:29 +08:00			`class MoonshotChat(GptTurbo):`
add base url for OpenAI (#166) 2024-03-28 19:15:16 +08:00			`def __init__(self, key, model_name="moonshot-v1-8k", base_url="https://api.moonshot.cn/v1"):`
			`if not base_url: base_url="https://api.moonshot.cn/v1"`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`self.client = OpenAI(`
add base url for OpenAI (#166) 2024-03-28 19:15:16 +08:00			`api_key=key, base_url=base_url)`
add dockerfile for cuda envirement. Refine table search strategy, (#123) 2024-03-14 19:45:29 +08:00			`self.model_name = model_name`

add Moonshot, debug my_llm (#126) 2024-03-15 18:59:00 +08:00			`def chat(self, system, history, gen_conf):`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`if system:`
			`history.insert(0, {"role": "system", "content": system})`
add Moonshot, debug my_llm (#126) 2024-03-15 18:59:00 +08:00			`try:`
			`response = self.client.chat.completions.create(`
			`model=self.model_name,`
			`messages=history,`
			`**gen_conf)`
			`ans = response.choices[0].message.content.strip()`
			`if response.choices[0].finish_reason == "length":`
			`ans += "...\nFor the content length reason, it stopped, continue?" if is_english(`
			`[ans]) else "······\n由于长度的原因，回答被截断了，要继续吗？"`
			`return ans, response.usage.completion_tokens`
			`except openai.APIError as e:`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`return "ERROR: " + str(e), 0`
add Moonshot, debug my_llm (#126) 2024-03-15 18:59:00 +08:00
add dockerfile for cuda envirement. Refine table search strategy, (#123) 2024-03-14 19:45:29 +08:00
add llm API (#19) * add llm API * refine llm API 2023-12-28 13:50:13 +08:00			`class QWenChat(Base):`
add base url for OpenAI (#166) 2024-03-28 19:15:16 +08:00			`def __init__(self, key, model_name=Generation.Models.qwen_turbo, **kwargs):`
Test APIs and fix bugs (#41) 2024-01-22 19:51:38 +08:00			`import dashscope`
			`dashscope.api_key = key`
			`self.model_name = model_name`

use minio to store uploaded files; build dialog server; (#16) * format code * use minio to store uploaded files; build dialog server; 2023-12-25 19:05:59 +08:00			`def chat(self, system, history, gen_conf):`
			`from http import HTTPStatus`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`if system:`
			`history.insert(0, {"role": "system", "content": system})`
use minio to store uploaded files; build dialog server; (#16) * format code * use minio to store uploaded files; build dialog server; 2023-12-25 19:05:59 +08:00			`response = Generation.call(`
Test APIs and fix bugs (#41) 2024-01-22 19:51:38 +08:00			`self.model_name,`
add llm API (#19) * add llm API * refine llm API 2023-12-28 13:50:13 +08:00			`messages=history,`
Refine resume parts and fix bugs in retrival using sql (#66) 2024-02-19 19:22:17 +08:00			`result_format='message',`
			`**gen_conf`
use minio to store uploaded files; build dialog server; (#16) * format code * use minio to store uploaded files; build dialog server; 2023-12-25 19:05:59 +08:00			`)`
deal with stop reason being length problem (#109) 2024-03-07 16:12:01 +08:00			`ans = ""`
			`tk_count = 0`
use minio to store uploaded files; build dialog server; (#16) * format code * use minio to store uploaded files; build dialog server; 2023-12-25 19:05:59 +08:00			`if response.status_code == HTTPStatus.OK:`
deal with stop reason being length problem (#109) 2024-03-07 16:12:01 +08:00			`ans += response.output.choices[0]['message']['content']`
refine log format (#312) ### What problem does this PR solve? Issue link:#264 ### Type of change - [x] Documentation Update - [x] Refactoring 2024-04-11 10:13:43 +08:00			`tk_count += response.usage.total_tokens`
deal with stop reason being length problem (#109) 2024-03-07 16:12:01 +08:00			`if response.output.choices[0].get("finish_reason", "") == "length":`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`ans += "...\nFor the content length reason, it stopped, continue?" if is_english(`
			`[ans]) else "······\n由于长度的原因，回答被截断了，要继续吗？"`
deal with stop reason being length problem (#109) 2024-03-07 16:12:01 +08:00			`return ans, tk_count`

			`return "ERROR: " + response.message, tk_count`
refactor retieval_test, add SQl retrieval methods (#61) 2024-02-08 17:01:01 +08:00

			`class ZhipuChat(Base):`
add base url for OpenAI (#166) 2024-03-28 19:15:16 +08:00			`def __init__(self, key, model_name="glm-3-turbo", **kwargs):`
refactor retieval_test, add SQl retrieval methods (#61) 2024-02-08 17:01:01 +08:00			`self.client = ZhipuAI(api_key=key)`
			`self.model_name = model_name`

			`def chat(self, system, history, gen_conf):`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`if system:`
			`history.insert(0, {"role": "system", "content": system})`
deal with stop reason being length problem (#109) 2024-03-07 16:12:01 +08:00			`try:`
remove presence_penalty for chatglm (#268) ### What problem does this PR solve? Issue link:#265 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2024-04-09 09:24:08 +08:00			`if "presence_penalty" in gen_conf: del gen_conf["presence_penalty"]`
resolve issure to call ZH?IPUAI (#277) ### What problem does this PR solve? Issue link:#265 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2024-04-09 16:16:10 +08:00			`if "frequency_penalty" in gen_conf: del gen_conf["frequency_penalty"]`
deal with stop reason being length problem (#109) 2024-03-07 16:12:01 +08:00			`response = self.client.chat.completions.create(`
refine OpenAi Api (#159) 2024-03-27 17:55:45 +08:00			`model=self.model_name,`
deal with stop reason being length problem (#109) 2024-03-07 16:12:01 +08:00			`messages=history,`
			`**gen_conf`
			`)`
refine OpenAi Api (#159) 2024-03-27 17:55:45 +08:00			`ans = response.choices[0].message.content.strip()`
			`if response.choices[0].finish_reason == "length":`
deal with stop reason being length problem (#109) 2024-03-07 16:12:01 +08:00			`ans += "...\nFor the content length reason, it stopped, continue?" if is_english(`
			`[ans]) else "······\n由于长度的原因，回答被截断了，要继续吗？"`
			`return ans, response.usage.completion_tokens`
			`except Exception as e:`
add local llm implementation (#119) 2024-03-12 11:57:08 +08:00			`return "ERROR: " + str(e), 0`

apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00
Support Ollama (#261) ### What problem does this PR solve? Issue link:#221 ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2024-04-08 19:20:57 +08:00			`class OllamaChat(Base):`
			`def __init__(self, key, model_name, **kwargs):`
			`self.client = Client(host=kwargs["base_url"])`
			`self.model_name = model_name`

			`def chat(self, system, history, gen_conf):`
			`if system:`
			`history.insert(0, {"role": "system", "content": system})`
			`try:`
			`options = {"temperature": gen_conf.get("temperature", 0.1),`
			`"num_predict": gen_conf.get("max_tokens", 128),`
			`"top_k": gen_conf.get("top_p", 0.3),`
			`"presence_penalty": gen_conf.get("presence_penalty", 0.4),`
			`"frequency_penalty": gen_conf.get("frequency_penalty", 0.7),`
			`}`
			`response = self.client.chat(`
			`model=self.model_name,`
			`messages=history,`
			`options=options`
			`)`
			`ans = response["message"]["content"].strip()`
			`return ans, response["eval_count"]`
			`except Exception as e:`
			`return "ERROR: " + str(e), 0`


add local llm implementation (#119) 2024-03-12 11:57:08 +08:00			`class LocalLLM(Base):`
			`class RPCProxy:`
			`def __init__(self, host, port):`
			`self.host = host`
			`self.port = int(port)`
			`self.__conn()`

			`def __conn(self):`
			`from multiprocessing.connection import Client`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`self._connection = Client(`
			`(self.host, self.port), authkey=b'infiniflow-token4kevinhu')`
add local llm implementation (#119) 2024-03-12 11:57:08 +08:00
			`def __getattr__(self, name):`
			`import pickle`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00
add local llm implementation (#119) 2024-03-12 11:57:08 +08:00			`def do_rpc(args, *kwargs):`
			`for _ in range(3):`
			`try:`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`self._connection.send(`
			`pickle.dumps((name, args, kwargs)))`
add local llm implementation (#119) 2024-03-12 11:57:08 +08:00			`return pickle.loads(self._connection.recv())`
			`except Exception as e:`
			`self.__conn()`
			`raise Exception("RPC connection lost!")`

			`return do_rpc`

refine error response, add set api-key MD (#178) 2024-03-31 19:09:42 +08:00			`def __init__(self, args, *kwargs):`
add local llm implementation (#119) 2024-03-12 11:57:08 +08:00			`self.client = LocalLLM.RPCProxy("127.0.0.1", 7860)`

			`def chat(self, system, history, gen_conf):`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`if system:`
			`history.insert(0, {"role": "system", "content": system})`
add local llm implementation (#119) 2024-03-12 11:57:08 +08:00			`try:`
			`ans = self.client.chat(`
			`history,`
			`gen_conf`
			`)`
			`return ans, num_tokens_from_string(ans)`
			`except Exception as e:`
			`return "ERROR: " + str(e), 0`
fix docker compose issue (#238) ### What problem does this PR solve? _Briefly describe what this PR aims to solve. Include background context that will help reviewers understand the purpose of the PR._ Issue link:#[[Link the issue here](https://github.com/infiniflow/ragflow/issues/226)] ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2024-04-07 09:04:32 +08:00