#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
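"""Vision (image-to-text) model wrappers.

Each class adapts one provider's multimodal chat API (OpenAI, Azure OpenAI, xAI,
DashScope/Qwen, ZHIPU-AI, Ollama, Gemini, NVIDIA, Anthropic, Tencent Hunyuan,
Google Cloud, and several OpenAI-compatible gateways) to a common interface:
describe(), describe_with_prompt(), chat(), and chat_streamly().
"""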
import base64
import io
import json
import os
from abc import ABC
from io import BytesIO
from urllib.parse import urljoin

import requests
from ollama import Client
from openai import OpenAI
from openai.lib.azure import AzureOpenAI
from PIL import Image
from zhipuai import ZhipuAI

from api.utils import get_uuid
from api.utils.file_utils import get_project_base_directory
from rag.nlp import is_english
from rag.prompts import vision_llm_describe_prompt
from rag.utils import num_tokens_from_string
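

# Shared plumbing for all vision wrappers: prompt construction, base64 image
# encoding, and OpenAI-style chat()/chat_streamly() loops that subclasses
# reuse through self.client and self.model_name.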
class Base(ABC):
    def __init__(self, key, model_name):
        pass

    def describe(self, image):
        raise NotImplementedError("Please implement the describe method!")

    def describe_with_prompt(self, image, prompt=None):
        raise NotImplementedError("Please implement the describe_with_prompt method!")

    def chat(self, system, history, gen_conf, image=""):
        if system:
            history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]
        try:
            for his in history:
                if his["role"] == "user":
                    his["content"] = self.chat_prompt(his["content"], image)

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=history,
                temperature=gen_conf.get("temperature", 0.3),
                top_p=gen_conf.get("top_p", 0.7),
            )
            return response.choices[0].message.content.strip(), response.usage.total_tokens
        except Exception as e:
            return "**ERROR**: " + str(e), 0

    def chat_streamly(self, system, history, gen_conf, image=""):
        if system:
            history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]

        ans = ""
        tk_count = 0
        try:
            for his in history:
                if his["role"] == "user":
                    his["content"] = self.chat_prompt(his["content"], image)

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=history,
                temperature=gen_conf.get("temperature", 0.3),
                top_p=gen_conf.get("top_p", 0.7),
                stream=True,
            )
            for resp in response:
                if not resp.choices[0].delta.content:
                    continue
                delta = resp.choices[0].delta.content
                ans += delta
                if resp.choices[0].finish_reason == "length":
                    ans += "...\nFor the content length reason, it stopped, continue?" if is_english([ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
                    tk_count = resp.usage.total_tokens
                if resp.choices[0].finish_reason == "stop":
                    tk_count = resp.usage.total_tokens
                yield ans
        except Exception as e:
            yield ans + "\n**ERROR**: " + str(e)

        yield tk_count

    def image2base64(self, image):
        if isinstance(image, bytes):
            return base64.b64encode(image).decode("utf-8")
        if isinstance(image, BytesIO):
            return base64.b64encode(image.getvalue()).decode("utf-8")
        buffered = BytesIO()
        try:
            image.save(buffered, format="JPEG")
        except Exception:
            image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    def prompt(self, b64):
        return [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                    },
                    {
                        "text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。"
                        if self.lang.lower() == "chinese"
                        else "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
                    },
                ],
            }
        ]

    def vision_llm_prompt(self, b64, prompt=None):
        return [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                    },
                    {
                        "type": "text",
                        "text": prompt if prompt else vision_llm_describe_prompt(),
                    },
                ],
            }
        ]

    def chat_prompt(self, text, b64):
        return [
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{b64}",
                },
            },
            {"type": "text", "text": text},
        ]
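

# OpenAI-compatible wrapper. Several providers below (xAI, LocalAI, OpenRouter,
# StepFun, LM-Studio, VLLM/OpenAI-API-Compatible, TogetherAI, 01.AI,
# SILICONFLOW, GPUStack) subclass it and only change endpoint and credentials.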
class GptV4(Base):
    _FACTORY_NAME = "OpenAI"

    def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese", base_url="https://api.openai.com/v1"):
        if not base_url:
            base_url = "https://api.openai.com/v1"
        self.client = OpenAI(api_key=key, base_url=base_url)
        self.model_name = model_name
        self.lang = lang

    def describe(self, image):
        b64 = self.image2base64(image)
        prompt = self.prompt(b64)
        for i in range(len(prompt)):
            for c in prompt[i]["content"]:
                if "text" in c:
                    c["type"] = "text"

        res = self.client.chat.completions.create(
            model=self.model_name,
            messages=prompt,
        )
        return res.choices[0].message.content.strip(), res.usage.total_tokens

    def describe_with_prompt(self, image, prompt=None):
        b64 = self.image2base64(image)
        vision_prompt = self.vision_llm_prompt(b64, prompt) if prompt else self.vision_llm_prompt(b64)
        res = self.client.chat.completions.create(
            model=self.model_name,
            messages=vision_prompt,
        )
        return res.choices[0].message.content.strip(), res.usage.total_tokens


class AzureGptV4(Base):
    _FACTORY_NAME = "Azure-OpenAI"

    def __init__(self, key, model_name, lang="Chinese", **kwargs):
        api_key = json.loads(key).get("api_key", "")
        api_version = json.loads(key).get("api_version", "2024-02-01")
        self.client = AzureOpenAI(api_key=api_key, azure_endpoint=kwargs["base_url"], api_version=api_version)
        self.model_name = model_name
        self.lang = lang

    def describe(self, image):
        b64 = self.image2base64(image)
        prompt = self.prompt(b64)
        for i in range(len(prompt)):
            for c in prompt[i]["content"]:
                if "text" in c:
                    c["type"] = "text"

        res = self.client.chat.completions.create(model=self.model_name, messages=prompt)
        return res.choices[0].message.content.strip(), res.usage.total_tokens

    def describe_with_prompt(self, image, prompt=None):
        b64 = self.image2base64(image)
        vision_prompt = self.vision_llm_prompt(b64, prompt) if prompt else self.vision_llm_prompt(b64)
        res = self.client.chat.completions.create(
            model=self.model_name,
            messages=vision_prompt,
        )
        return res.choices[0].message.content.strip(), res.usage.total_tokens


class xAICV(GptV4):
    _FACTORY_NAME = "xAI"

    def __init__(self, key, model_name="grok-3", base_url=None, **kwargs):
        if not base_url:
            base_url = "https://api.x.ai/v1"
        super().__init__(key, model_name, base_url=base_url, **kwargs)
        return
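

# DashScope/Qwen multimodal API takes images as file:// or remote URLs inside
# the message content, so prompt() spills the binary to a temp file first.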
class QWenCV(Base):
    _FACTORY_NAME = "Tongyi-Qianwen"

    def __init__(self, key, model_name="qwen-vl-chat-v1", lang="Chinese", **kwargs):
        import dashscope

        dashscope.api_key = key
        self.model_name = model_name
        self.lang = lang

    def prompt(self, binary):
        # stupid as hell
        tmp_dir = get_project_base_directory("tmp")
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir, exist_ok=True)
        path = os.path.join(tmp_dir, "%s.jpg" % get_uuid())
        Image.open(io.BytesIO(binary)).save(path)
        return [
            {
                "role": "user",
                "content": [
                    {"image": f"file://{path}"},
                    {
                        "text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。"
                        if self.lang.lower() == "chinese"
                        else "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
                    },
                ],
            }
        ]

    def vision_llm_prompt(self, binary, prompt=None):
        # stupid as hell
        tmp_dir = get_project_base_directory("tmp")
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir, exist_ok=True)
        path = os.path.join(tmp_dir, "%s.jpg" % get_uuid())
        Image.open(io.BytesIO(binary)).save(path)
        return [
            {
                "role": "user",
                "content": [
                    {"image": f"file://{path}"},
                    {
                        "text": prompt if prompt else vision_llm_describe_prompt(),
                    },
                ],
            }
        ]

    def chat_prompt(self, text, b64):
        return [
            {"image": f"{b64}"},
            {"text": text},
        ]

    def describe(self, image):
        from http import HTTPStatus

        from dashscope import MultiModalConversation

        response = MultiModalConversation.call(model=self.model_name, messages=self.prompt(image))
        if response.status_code == HTTPStatus.OK:
            return response.output.choices[0]["message"]["content"][0]["text"], response.usage.output_tokens
        return response.message, 0

    def describe_with_prompt(self, image, prompt=None):
        from http import HTTPStatus

        from dashscope import MultiModalConversation

        vision_prompt = self.vision_llm_prompt(image, prompt) if prompt else self.vision_llm_prompt(image)
        response = MultiModalConversation.call(model=self.model_name, messages=vision_prompt)
        if response.status_code == HTTPStatus.OK:
            return response.output.choices[0]["message"]["content"][0]["text"], response.usage.output_tokens
        return response.message, 0

    def chat(self, system, history, gen_conf, image=""):
        from http import HTTPStatus

        from dashscope import MultiModalConversation

        if system:
            history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]
        for his in history:
            if his["role"] == "user":
                his["content"] = self.chat_prompt(his["content"], image)
        response = MultiModalConversation.call(
            model=self.model_name,
            messages=history,
            temperature=gen_conf.get("temperature", 0.3),
            top_p=gen_conf.get("top_p", 0.7),
        )

        ans = ""
        tk_count = 0
        if response.status_code == HTTPStatus.OK:
            ans = response.output.choices[0]["message"]["content"]
            if isinstance(ans, list):
                ans = ans[0]["text"] if ans else ""
            tk_count += response.usage.total_tokens
            if response.output.choices[0].get("finish_reason", "") == "length":
                ans += "...\nFor the content length reason, it stopped, continue?" if is_english([ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
            return ans, tk_count
        return "**ERROR**: " + response.message, tk_count

    def chat_streamly(self, system, history, gen_conf, image=""):
        from http import HTTPStatus

        from dashscope import MultiModalConversation

        if system:
            history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]
        for his in history:
            if his["role"] == "user":
                his["content"] = self.chat_prompt(his["content"], image)

        ans = ""
        tk_count = 0
        try:
            response = MultiModalConversation.call(
                model=self.model_name,
                messages=history,
                temperature=gen_conf.get("temperature", 0.3),
                top_p=gen_conf.get("top_p", 0.7),
                stream=True,
            )
            for resp in response:
                if resp.status_code == HTTPStatus.OK:
                    cnt = resp.output.choices[0]["message"]["content"]
                    if isinstance(cnt, list):
                        cnt = cnt[0]["text"] if cnt else ""
                    ans += cnt
                    tk_count = resp.usage.total_tokens
                    if resp.output.choices[0].get("finish_reason", "") == "length":
                        ans += "...\nFor the content length reason, it stopped, continue?" if is_english([ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
                    yield ans
                else:
                    yield ans + "\n**ERROR**: " + resp.message if str(resp.message).find("Access") < 0 else "Out of credit. Please set the API key in **settings > Model providers.**"
        except Exception as e:
            yield ans + "\n**ERROR**: " + str(e)

        yield tk_count


class Zhipu4V(Base):
    _FACTORY_NAME = "ZHIPU-AI"

    def __init__(self, key, model_name="glm-4v", lang="Chinese", **kwargs):
        self.client = ZhipuAI(api_key=key)
        self.model_name = model_name
        self.lang = lang

    def describe(self, image):
        b64 = self.image2base64(image)

        prompt = self.prompt(b64)
        prompt[0]["content"][1]["type"] = "text"

        res = self.client.chat.completions.create(
            model=self.model_name,
            messages=prompt,
        )
        return res.choices[0].message.content.strip(), res.usage.total_tokens

    def describe_with_prompt(self, image, prompt=None):
        b64 = self.image2base64(image)
        vision_prompt = self.vision_llm_prompt(b64, prompt) if prompt else self.vision_llm_prompt(b64)
        res = self.client.chat.completions.create(model=self.model_name, messages=vision_prompt)
        return res.choices[0].message.content.strip(), res.usage.total_tokens

    def chat(self, system, history, gen_conf, image=""):
        if system:
            history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]
        try:
            for his in history:
                if his["role"] == "user":
                    his["content"] = self.chat_prompt(his["content"], image)
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=history,
                temperature=gen_conf.get("temperature", 0.3),
                top_p=gen_conf.get("top_p", 0.7),
            )
            return response.choices[0].message.content.strip(), response.usage.total_tokens
        except Exception as e:
            return "**ERROR**: " + str(e), 0

    def chat_streamly(self, system, history, gen_conf, image=""):
        if system:
            history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]

        ans = ""
        tk_count = 0
        try:
            for his in history:
                if his["role"] == "user":
                    his["content"] = self.chat_prompt(his["content"], image)
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=history,
                temperature=gen_conf.get("temperature", 0.3),
                top_p=gen_conf.get("top_p", 0.7),
                stream=True,
            )
            for resp in response:
                if not resp.choices[0].delta.content:
                    continue
                delta = resp.choices[0].delta.content
                ans += delta
                if resp.choices[0].finish_reason == "length":
                    ans += "...\nFor the content length reason, it stopped, continue?" if is_english([ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
                    tk_count = resp.usage.total_tokens
                if resp.choices[0].finish_reason == "stop":
                    tk_count = resp.usage.total_tokens
                yield ans
        except Exception as e:
            yield ans + "\n**ERROR**: " + str(e)

        yield tk_count
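

# Ollama's chat API takes raw image bytes via the per-message "images" field
# and generation parameters via an "options" dict rather than top-level kwargs.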
class OllamaCV(Base):
    _FACTORY_NAME = "Ollama"

    def __init__(self, key, model_name, lang="Chinese", **kwargs):
        self.client = Client(host=kwargs["base_url"])
        self.model_name = model_name
        self.lang = lang

    def describe(self, image):
        prompt = self.prompt("")
        try:
            response = self.client.generate(
                model=self.model_name,
                prompt=prompt[0]["content"][1]["text"],
                images=[image],
            )
            ans = response["response"].strip()
            return ans, 128
        except Exception as e:
            return "**ERROR**: " + str(e), 0

    def describe_with_prompt(self, image, prompt=None):
        vision_prompt = self.vision_llm_prompt("", prompt) if prompt else self.vision_llm_prompt("")
        try:
            response = self.client.generate(
                model=self.model_name,
                prompt=vision_prompt[0]["content"][1]["text"],
                images=[image],
            )
            ans = response["response"].strip()
            return ans, 128
        except Exception as e:
            return "**ERROR**: " + str(e), 0

    def chat(self, system, history, gen_conf, image=""):
        if system:
            history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]

        try:
            for his in history:
                if his["role"] == "user":
                    his["images"] = [image]
            options = {}
            if "temperature" in gen_conf:
                options["temperature"] = gen_conf["temperature"]
            if "top_p" in gen_conf:
                options["top_p"] = gen_conf["top_p"]
            if "presence_penalty" in gen_conf:
                options["presence_penalty"] = gen_conf["presence_penalty"]
            if "frequency_penalty" in gen_conf:
                options["frequency_penalty"] = gen_conf["frequency_penalty"]
            response = self.client.chat(
                model=self.model_name,
                messages=history,
                options=options,
                keep_alive=-1,
            )
            ans = response["message"]["content"].strip()
            return ans, response["eval_count"] + response.get("prompt_eval_count", 0)
        except Exception as e:
            return "**ERROR**: " + str(e), 0

    def chat_streamly(self, system, history, gen_conf, image=""):
        if system:
            history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]

        for his in history:
            if his["role"] == "user":
                his["images"] = [image]
        options = {}
        if "temperature" in gen_conf:
            options["temperature"] = gen_conf["temperature"]
        if "top_p" in gen_conf:
            options["top_p"] = gen_conf["top_p"]
        if "presence_penalty" in gen_conf:
            options["presence_penalty"] = gen_conf["presence_penalty"]
        if "frequency_penalty" in gen_conf:
            options["frequency_penalty"] = gen_conf["frequency_penalty"]
        ans = ""
        try:
            response = self.client.chat(
                model=self.model_name,
                messages=history,
                stream=True,
                options=options,
                keep_alive=-1,
            )
            for resp in response:
                if resp["done"]:
                    yield resp.get("prompt_eval_count", 0) + resp.get("eval_count", 0)
                ans += resp["message"]["content"]
                yield ans
        except Exception as e:
            yield ans + "\n**ERROR**: " + str(e)
        yield 0


class LocalAICV(GptV4):
    _FACTORY_NAME = "LocalAI"

    def __init__(self, key, model_name, base_url, lang="Chinese"):
        if not base_url:
            raise ValueError("Local cv model url cannot be None")
        base_url = urljoin(base_url, "v1")
        self.client = OpenAI(api_key="empty", base_url=base_url)
        self.model_name = model_name.split("___")[0]
        self.lang = lang


class XinferenceCV(Base):
    _FACTORY_NAME = "Xinference"

    def __init__(self, key, model_name="", lang="Chinese", base_url=""):
        base_url = urljoin(base_url, "v1")
        self.client = OpenAI(api_key=key, base_url=base_url)
        self.model_name = model_name
        self.lang = lang

    def describe(self, image):
        b64 = self.image2base64(image)
        res = self.client.chat.completions.create(model=self.model_name, messages=self.prompt(b64))
        return res.choices[0].message.content.strip(), res.usage.total_tokens

    def describe_with_prompt(self, image, prompt=None):
        b64 = self.image2base64(image)
        vision_prompt = self.vision_llm_prompt(b64, prompt) if prompt else self.vision_llm_prompt(b64)
        res = self.client.chat.completions.create(
            model=self.model_name,
            messages=vision_prompt,
        )
        return res.choices[0].message.content.strip(), res.usage.total_tokens
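

# Gemini (google-generativeai SDK): chat history uses "model"/"user" roles with
# "parts" lists instead of OpenAI-style "content" strings.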
class GeminiCV(Base):
    _FACTORY_NAME = "Gemini"

    def __init__(self, key, model_name="gemini-1.0-pro-vision-latest", lang="Chinese", **kwargs):
        from google.generativeai import GenerativeModel, client

        client.configure(api_key=key)
        _client = client.get_default_generative_client()
        self.model_name = model_name
        self.model = GenerativeModel(model_name=self.model_name)
        self.model._client = _client
        self.lang = lang

    def describe(self, image):
        from PIL.Image import open

        prompt = (
            "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。"
            if self.lang.lower() == "chinese"
            else "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out."
        )
        b64 = self.image2base64(image)
        img = open(BytesIO(base64.b64decode(b64)))
        input = [prompt, img]
        res = self.model.generate_content(input)
        return res.text, res.usage_metadata.total_token_count

    def describe_with_prompt(self, image, prompt=None):
        from PIL.Image import open

        b64 = self.image2base64(image)
        vision_prompt = self.vision_llm_prompt(b64, prompt) if prompt else self.vision_llm_prompt(b64)
        img = open(BytesIO(base64.b64decode(b64)))
        input = [vision_prompt, img]
        res = self.model.generate_content(
            input,
        )
        return res.text, res.usage_metadata.total_token_count

    def chat(self, system, history, gen_conf, image=""):
        from google.generativeai.types import GenerationConfig

        if system:
            history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]
        try:
            for his in history:
                if his["role"] == "assistant":
                    his["role"] = "model"
                    his["parts"] = [his["content"]]
                    his.pop("content")
                if his["role"] == "user":
                    his["parts"] = [his["content"]]
                    his.pop("content")
            history[-1]["parts"].append("data:image/jpeg;base64," + image)

            response = self.model.generate_content(history, generation_config=GenerationConfig(temperature=gen_conf.get("temperature", 0.3), top_p=gen_conf.get("top_p", 0.7)))
            ans = response.text
            return ans, response.usage_metadata.total_token_count
        except Exception as e:
            return "**ERROR**: " + str(e), 0

    def chat_streamly(self, system, history, gen_conf, image=""):
        from google.generativeai.types import GenerationConfig

        if system:
            history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]

        ans = ""
        try:
            for his in history:
                if his["role"] == "assistant":
                    his["role"] = "model"
                    his["parts"] = [his["content"]]
                    his.pop("content")
                if his["role"] == "user":
                    his["parts"] = [his["content"]]
                    his.pop("content")
            history[-1]["parts"].append("data:image/jpeg;base64," + image)

            response = self.model.generate_content(
                history,
                generation_config=GenerationConfig(temperature=gen_conf.get("temperature", 0.3), top_p=gen_conf.get("top_p", 0.7)),
                stream=True,
            )
            for resp in response:
                if not resp.text:
                    continue
                ans += resp.text
                yield ans
        except Exception as e:
            yield ans + "\n**ERROR**: " + str(e)

        yield response._chunks[-1].usage_metadata.total_token_count


class OpenRouterCV(GptV4):
    _FACTORY_NAME = "OpenRouter"

    def __init__(
        self,
        key,
        model_name,
        lang="Chinese",
        base_url="https://openrouter.ai/api/v1",
    ):
        if not base_url:
            base_url = "https://openrouter.ai/api/v1"
        self.client = OpenAI(api_key=key, base_url=base_url)
        self.model_name = model_name
        self.lang = lang


class LocalCV(Base):
    _FACTORY_NAME = "Moonshot"

    def __init__(self, key, model_name="glm-4v", lang="Chinese", **kwargs):
        pass

    def describe(self, image):
        return "", 0
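

# NVIDIA's VLM endpoints are called over raw REST; the image is inlined into
# the prompt text as an HTML <img> tag carrying a base64 data URL.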
class NvidiaCV(Base):
    _FACTORY_NAME = "NVIDIA"

    def __init__(
        self,
        key,
        model_name,
        lang="Chinese",
        base_url="https://ai.api.nvidia.com/v1/vlm",
    ):
        if not base_url:
            base_url = "https://ai.api.nvidia.com/v1/vlm"
        self.lang = lang
        factory, llm_name = model_name.split("/")
        if factory != "liuhaotian":
            self.base_url = urljoin(base_url, f"{factory}/{llm_name}")
        else:
            self.base_url = urljoin(f"{base_url}/community", llm_name.replace("-v1.6", "16"))
        self.key = key

    def describe(self, image):
        b64 = self.image2base64(image)
        response = requests.post(
            url=self.base_url,
            headers={
                "accept": "application/json",
                "content-type": "application/json",
                "Authorization": f"Bearer {self.key}",
            },
            json={"messages": self.prompt(b64)},
        )
        response = response.json()
        return (
            response["choices"][0]["message"]["content"].strip(),
            response["usage"]["total_tokens"],
        )

    def describe_with_prompt(self, image, prompt=None):
        b64 = self.image2base64(image)
        vision_prompt = self.vision_llm_prompt(b64, prompt) if prompt else self.vision_llm_prompt(b64)
        response = requests.post(
            url=self.base_url,
            headers={
                "accept": "application/json",
                "content-type": "application/json",
                "Authorization": f"Bearer {self.key}",
            },
            json={
                "messages": vision_prompt,
            },
        )
        response = response.json()
        return (
            response["choices"][0]["message"]["content"].strip(),
            response["usage"]["total_tokens"],
        )

    def prompt(self, b64):
        return [
            {
                "role": "user",
                "content": (
                    "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。"
                    if self.lang.lower() == "chinese"
                    else "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out."
                )
                + f' <img src="data:image/jpeg;base64,{b64}"/>',
            }
        ]

    def vision_llm_prompt(self, b64, prompt=None):
        return [
            {
                "role": "user",
                "content": (prompt if prompt else vision_llm_describe_prompt()) + f' <img src="data:image/jpeg;base64,{b64}"/>',
            }
        ]

    def chat_prompt(self, text, b64):
        return [
            {
                "role": "user",
                "content": text + f' <img src="data:image/jpeg;base64,{b64}"/>',
            }
        ]


class StepFunCV(GptV4):
    _FACTORY_NAME = "StepFun"

    def __init__(self, key, model_name="step-1v-8k", lang="Chinese", base_url="https://api.stepfun.com/v1"):
        if not base_url:
            base_url = "https://api.stepfun.com/v1"
        self.client = OpenAI(api_key=key, base_url=base_url)
        self.model_name = model_name
        self.lang = lang


class LmStudioCV(GptV4):
    _FACTORY_NAME = "LM-Studio"

    def __init__(self, key, model_name, lang="Chinese", base_url=""):
        if not base_url:
            raise ValueError("Local llm url cannot be None")
        base_url = urljoin(base_url, "v1")
        self.client = OpenAI(api_key="lm-studio", base_url=base_url)
        self.model_name = model_name
        self.lang = lang


class OpenAI_APICV(GptV4):
    _FACTORY_NAME = ["VLLM", "OpenAI-API-Compatible"]

    def __init__(self, key, model_name, lang="Chinese", base_url=""):
        if not base_url:
            raise ValueError("url cannot be None")
        base_url = urljoin(base_url, "v1")
        self.client = OpenAI(api_key=key, base_url=base_url)
        self.model_name = model_name.split("___")[0]
        self.lang = lang


class TogetherAICV(GptV4):
    _FACTORY_NAME = "TogetherAI"

    def __init__(self, key, model_name, lang="Chinese", base_url="https://api.together.xyz/v1"):
        if not base_url:
            base_url = "https://api.together.xyz/v1"
        super().__init__(key, model_name, lang, base_url)


class YiCV(GptV4):
    _FACTORY_NAME = "01.AI"

    def __init__(
        self,
        key,
        model_name,
        lang="Chinese",
        base_url="https://api.lingyiwanwu.com/v1",
    ):
        if not base_url:
            base_url = "https://api.lingyiwanwu.com/v1"
        super().__init__(key, model_name, lang, base_url)


class SILICONFLOWCV(GptV4):
    _FACTORY_NAME = "SILICONFLOW"

    def __init__(
        self,
        key,
        model_name,
        lang="Chinese",
        base_url="https://api.siliconflow.cn/v1",
    ):
        if not base_url:
            base_url = "https://api.siliconflow.cn/v1"
        super().__init__(key, model_name, lang, base_url)
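

# Tencent Hunyuan SDK builds requests from JSON with capitalized keys
# ("Role", "Contents", "ImageUrl") rather than OpenAI-style lowercase ones.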
class HunyuanCV(Base):
    _FACTORY_NAME = "Tencent Hunyuan"

    def __init__(self, key, model_name, lang="Chinese", base_url=None):
        from tencentcloud.common import credential
        from tencentcloud.hunyuan.v20230901 import hunyuan_client

        key = json.loads(key)
        sid = key.get("hunyuan_sid", "")
        sk = key.get("hunyuan_sk", "")
        cred = credential.Credential(sid, sk)
        self.model_name = model_name
        self.client = hunyuan_client.HunyuanClient(cred, "")
        self.lang = lang

    def describe(self, image):
        from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
            TencentCloudSDKException,
        )
        from tencentcloud.hunyuan.v20230901 import models

        b64 = self.image2base64(image)
        req = models.ChatCompletionsRequest()
        params = {"Model": self.model_name, "Messages": self.prompt(b64)}
        req.from_json_string(json.dumps(params))
        ans = ""
        try:
            response = self.client.ChatCompletions(req)
            ans = response.Choices[0].Message.Content
            return ans, response.Usage.TotalTokens
        except TencentCloudSDKException as e:
            return ans + "\n**ERROR**: " + str(e), 0

    def describe_with_prompt(self, image, prompt=None):
        from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
        from tencentcloud.hunyuan.v20230901 import models

        b64 = self.image2base64(image)
        vision_prompt = self.vision_llm_prompt(b64, prompt) if prompt else self.vision_llm_prompt(b64)
        req = models.ChatCompletionsRequest()
        params = {"Model": self.model_name, "Messages": vision_prompt}
        req.from_json_string(json.dumps(params))
        ans = ""
        try:
            response = self.client.ChatCompletions(req)
            ans = response.Choices[0].Message.Content
            return ans, response.Usage.TotalTokens
        except TencentCloudSDKException as e:
            return ans + "\n**ERROR**: " + str(e), 0

    def prompt(self, b64):
        return [
            {
                "Role": "user",
                "Contents": [
                    {
                        "Type": "image_url",
                        "ImageUrl": {"Url": f"data:image/jpeg;base64,{b64}"},
                    },
                    {
                        "Type": "text",
                        "Text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。"
                        if self.lang.lower() == "chinese"
                        else "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
                    },
                ],
            }
        ]
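

# Anthropic Messages API: images travel as base64 "source" blocks, and streamed
# "thinking" deltas are wrapped in <think> tags for downstream rendering.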
class AnthropicCV(Base):
    _FACTORY_NAME = "Anthropic"

    def __init__(self, key, model_name, base_url=None, lang="Chinese"):
        import anthropic

        self.client = anthropic.Anthropic(api_key=key)
        self.model_name = model_name
        self.lang = lang
        self.system = ""
        self.max_tokens = 8192
        if "haiku" in self.model_name or "opus" in self.model_name:
            self.max_tokens = 4096

    def prompt(self, b64, prompt):
        return [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/jpeg",
                            "data": b64,
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]

    def describe(self, image):
        b64 = self.image2base64(image)
        prompt = self.prompt(
            b64,
            "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。"
            if self.lang.lower() == "chinese"
            else "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
        )
        response = self.client.messages.create(model=self.model_name, max_tokens=self.max_tokens, messages=prompt).to_dict()
        return response["content"][0]["text"].strip(), response["usage"]["input_tokens"] + response["usage"]["output_tokens"]

    def describe_with_prompt(self, image, prompt=None):
        b64 = self.image2base64(image)
        prompt = self.prompt(b64, prompt if prompt else vision_llm_describe_prompt())
        response = self.client.messages.create(model=self.model_name, max_tokens=self.max_tokens, messages=prompt).to_dict()
        return response["content"][0]["text"].strip(), response["usage"]["input_tokens"] + response["usage"]["output_tokens"]

    def chat(self, system, history, gen_conf):
        if "presence_penalty" in gen_conf:
            del gen_conf["presence_penalty"]
        if "frequency_penalty" in gen_conf:
            del gen_conf["frequency_penalty"]
        gen_conf["max_tokens"] = self.max_tokens

        ans = ""
        try:
            response = self.client.messages.create(
                model=self.model_name,
                messages=history,
                system=system,
                stream=False,
                **gen_conf,
            ).to_dict()
            ans = response["content"][0]["text"]
            if response["stop_reason"] == "max_tokens":
                ans += "...\nFor the content length reason, it stopped, continue?" if is_english([ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
            return (
                ans,
                response["usage"]["input_tokens"] + response["usage"]["output_tokens"],
            )
        except Exception as e:
            return ans + "\n**ERROR**: " + str(e), 0

    def chat_streamly(self, system, history, gen_conf):
        if "presence_penalty" in gen_conf:
            del gen_conf["presence_penalty"]
        if "frequency_penalty" in gen_conf:
            del gen_conf["frequency_penalty"]
        gen_conf["max_tokens"] = self.max_tokens

        ans = ""
        total_tokens = 0
        try:
            response = self.client.messages.create(
                model=self.model_name,
                messages=history,
                system=system,
                stream=True,
                **gen_conf,
            )
            for res in response:
                if res.type == "content_block_delta":
                    if res.delta.type == "thinking_delta" and res.delta.thinking:
                        if ans.find("<think>") < 0:
                            ans += "<think>"
                        ans = ans.replace("</think>", "")
                        ans += res.delta.thinking + "</think>"
                    else:
                        text = res.delta.text
                        ans += text
                        total_tokens += num_tokens_from_string(text)
                    yield ans
        except Exception as e:
            yield ans + "\n**ERROR**: " + str(e)

        yield total_tokens


class GPUStackCV(GptV4):
    _FACTORY_NAME = "GPUStack"

    def __init__(self, key, model_name, lang="Chinese", base_url=""):
        if not base_url:
            raise ValueError("Local llm url cannot be None")
        base_url = urljoin(base_url, "v1")
        self.client = OpenAI(api_key=key, base_url=base_url)
        self.model_name = model_name
        self.lang = lang
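

# Google Cloud (Vertex AI): Claude model names route through AnthropicVertex,
# everything else through vertexai.generative_models (Gemini).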
class GoogleCV(Base):
    _FACTORY_NAME = "Google Cloud"

    def __init__(self, key, model_name, lang="Chinese", base_url=None, **kwargs):
        import base64

        from google.oauth2 import service_account

        key = json.loads(key)
        access_token = json.loads(base64.b64decode(key.get("google_service_account_key", "")))
        project_id = key.get("google_project_id", "")
        region = key.get("google_region", "")
        scopes = ["https://www.googleapis.com/auth/cloud-platform"]
        self.model_name = model_name
        self.lang = lang

        if "claude" in self.model_name:
            from anthropic import AnthropicVertex
            from google.auth.transport.requests import Request

            if access_token:
                credits = service_account.Credentials.from_service_account_info(access_token, scopes=scopes)
                request = Request()
                credits.refresh(request)
                token = credits.token
                self.client = AnthropicVertex(region=region, project_id=project_id, access_token=token)
            else:
                self.client = AnthropicVertex(region=region, project_id=project_id)
        else:
            import vertexai.generative_models as glm
            from google.cloud import aiplatform

            if access_token:
                credits = service_account.Credentials.from_service_account_info(access_token)
                aiplatform.init(credentials=credits, project=project_id, location=region)
            else:
                aiplatform.init(project=project_id, location=region)
            self.client = glm.GenerativeModel(model_name=self.model_name)

    def describe(self, image):
        prompt = (
            "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。"
            if self.lang.lower() == "chinese"
            else "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out."
        )
        if "claude" in self.model_name:
            b64 = self.image2base64(image)
            vision_prompt = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/jpeg",
                                "data": b64,
                            },
                        },
                        {"type": "text", "text": prompt},
                    ],
                }
            ]
            response = self.client.messages.create(
                model=self.model_name,
                max_tokens=8192,
                messages=vision_prompt,
            )
            return response.content[0].text.strip(), response.usage.input_tokens + response.usage.output_tokens
        else:
            import vertexai.generative_models as glm

            b64 = self.image2base64(image)
            # Create proper image part for Gemini
            image_part = glm.Part.from_data(data=base64.b64decode(b64), mime_type="image/jpeg")
            input = [prompt, image_part]
            res = self.client.generate_content(input)
            return res.text, res.usage_metadata.total_token_count

    def describe_with_prompt(self, image, prompt=None):
        if "claude" in self.model_name:
            b64 = self.image2base64(image)
            vision_prompt = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/jpeg",
                                "data": b64,
                            },
                        },
                        {"type": "text", "text": prompt if prompt else vision_llm_describe_prompt()},
                    ],
                }
            ]
            response = self.client.messages.create(model=self.model_name, max_tokens=8192, messages=vision_prompt)
            return response.content[0].text.strip(), response.usage.input_tokens + response.usage.output_tokens
        else:
            import vertexai.generative_models as glm

            b64 = self.image2base64(image)
            vision_prompt = prompt if prompt else vision_llm_describe_prompt()
            # Create proper image part for Gemini
            image_part = glm.Part.from_data(data=base64.b64decode(b64), mime_type="image/jpeg")
            input = [vision_prompt, image_part]
            res = self.client.generate_content(input)
            return res.text, res.usage_metadata.total_token_count

    def chat(self, system, history, gen_conf, image=""):
        if "claude" in self.model_name:
            if system:
                history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]
            try:
                for his in history:
                    if his["role"] == "user":
                        his["content"] = [
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": "image/jpeg",
                                    "data": image,
                                },
                            },
                            {"type": "text", "text": his["content"]},
                        ]
                response = self.client.messages.create(model=self.model_name, max_tokens=8192, messages=history, temperature=gen_conf.get("temperature", 0.3), top_p=gen_conf.get("top_p", 0.7))
                return response.content[0].text.strip(), response.usage.input_tokens + response.usage.output_tokens
            except Exception as e:
                return "**ERROR**: " + str(e), 0
        else:
            import vertexai.generative_models as glm

            if system:
                history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]
            try:
                for his in history:
                    if his["role"] == "assistant":
                        his["role"] = "model"
                        his["parts"] = [his["content"]]
                        his.pop("content")
                    if his["role"] == "user":
                        his["parts"] = [his["content"]]
                        his.pop("content")

                # Create proper image part for Gemini
                img_bytes = base64.b64decode(image)
                image_part = glm.Part.from_data(data=img_bytes, mime_type="image/jpeg")
                history[-1]["parts"].append(image_part)

                response = self.client.generate_content(history, generation_config=glm.GenerationConfig(temperature=gen_conf.get("temperature", 0.3), top_p=gen_conf.get("top_p", 0.7)))
                ans = response.text
                return ans, response.usage_metadata.total_token_count
            except Exception as e:
                return "**ERROR**: " + str(e), 0
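

if __name__ == "__main__":
    # Minimal smoke-test sketch (illustrative only). It assumes a valid OpenAI
    # API key in the OPENAI_API_KEY environment variable and a local
    # "example.jpg" file; neither is provided or required by this module.
    cv_mdl = GptV4(os.environ.get("OPENAI_API_KEY", ""), model_name="gpt-4o-mini", lang="English")
    with open("example.jpg", "rb") as f:
        description, tokens = cv_mdl.describe(f.read())
    print(f"[{tokens} tokens] {description}")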