2025-02-26 15:40:52 +08:00
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import json
import logging
import re
from collections import defaultdict
2025-03-18 14:52:20 +08:00
2025-02-26 15:40:52 +08:00
import json_repair
2025-03-18 14:52:20 +08:00
2025-03-14 09:54:38 +08:00
from api import settings
2025-02-26 15:40:52 +08:00
from api . db import LLMType
from rag . settings import TAG_FLD
2025-03-18 14:52:20 +08:00
from rag . utils import encoder , num_tokens_from_string
2025-02-26 15:40:52 +08:00
2025-02-26 19:45:22 +08:00
def chunks_format(reference):
    """Normalize the chunks of a retrieval *reference* dict into a stable schema.

    Each chunk may carry either internal field names (``chunk_id``,
    ``content_with_weight``, ``doc_id`` ...) or external ones (``id``,
    ``content``, ``document_id`` ...); for each pair the first available
    value wins. Missing fields come through as None.
    """

    def _pick(chunk, primary, fallback):
        # Prefer the primary key; fall back to the alternate spelling (or None).
        return chunk.get(primary, chunk.get(fallback))

    formatted = []
    for chunk in reference.get("chunks", []):
        formatted.append(
            {
                "id": _pick(chunk, "chunk_id", "id"),
                "content": _pick(chunk, "content", "content_with_weight"),
                "document_id": _pick(chunk, "doc_id", "document_id"),
                "document_name": _pick(chunk, "docnm_kwd", "document_name"),
                "dataset_id": _pick(chunk, "kb_id", "dataset_id"),
                "image_id": _pick(chunk, "image_id", "img_id"),
                "positions": _pick(chunk, "positions", "position_int"),
                "url": chunk.get("url"),
                "similarity": chunk.get("similarity"),
                "vector_similarity": chunk.get("vector_similarity"),
                "term_similarity": chunk.get("term_similarity"),
                "doc_type": chunk.get("doc_type_kwd"),
            }
        )
    return formatted
2025-02-26 19:45:22 +08:00
2025-02-26 15:40:52 +08:00
def llm_id2llm_type(llm_id):
    """Resolve the model type (e.g. chat vs image2text) for *llm_id*.

    Scans the factory LLM catalogue from ``settings.FACTORY_LLM_INFOS`` and
    returns a value derived from the matching entry's ``model_type``.
    Returns None implicitly when no catalogue entry matches.
    """
    from api.db.services.llm_service import TenantLLMService

    # The id may carry a factory suffix; keep only the bare model name.
    llm_id, *_ = TenantLLMService.split_model_name_and_factory(llm_id)

    llm_factories = settings.FACTORY_LLM_INFOS
    for llm_factory in llm_factories:
        for llm in llm_factory["llm"]:
            if llm_id == llm["llm_name"]:
                # NOTE(review): `[-1]` indexes the LAST CHARACTER of the
                # stripped model_type string, not the last comma-separated
                # field. Looks like a missing `.split(",")` before the index —
                # confirm against the catalogue's model_type format.
                return llm["model_type"].strip(",")[-1]
def message_fit_in(msg, max_length=4000):
    """Trim a chat message list so its total token count fits *max_length*.

    Returns ``(token_count, msg)``. The list and its message dicts may be
    mutated in place. Strategy, in order:
      1. If everything already fits, return unchanged.
      2. Drop all non-system messages except the very last one.
      3. Truncate whichever remaining message dominates the token budget.

    Args:
        msg: list of ``{"role": ..., "content": ...}`` dicts.
        max_length: token budget for the whole conversation.
    """

    def count():
        # Recount tokens over the (possibly rebound) outer `msg`.
        nonlocal msg
        tks_cnts = []
        for m in msg:
            tks_cnts.append({"role": m["role"], "count": num_tokens_from_string(m["content"])})
        total = 0
        for m in tks_cnts:
            total += m["count"]
        return total

    c = count()
    # Fast path: already within budget.
    if c < max_length:
        return c, msg

    # Keep only system message(s) plus the latest turn.
    msg_ = [m for m in msg if m["role"] == "system"]
    if len(msg) > 1:
        msg_.append(msg[-1])
    msg = msg_
    c = count()
    if c < max_length:
        return c, msg

    ll = num_tokens_from_string(msg_[0]["content"])
    ll2 = num_tokens_from_string(msg_[-1]["content"])
    if ll / (ll + ll2) > 0.8:
        # System prompt dominates: truncate it, leaving room for the tail.
        m = msg_[0]["content"]
        m = encoder.decode(encoder.encode(m)[: max_length - ll2])
        msg[0]["content"] = m
        return max_length, msg

    # Otherwise truncate the last message.
    # NOTE(review): this branch also subtracts ll2 (the tail's own size)
    # rather than ll (the system prompt's size) — verify the intended budget.
    m = msg_[-1]["content"]
    m = encoder.decode(encoder.encode(m)[: max_length - ll2])
    msg[-1]["content"] = m
    return max_length, msg
def kb_prompt(kbinfos, max_tokens):
    """Render retrieved chunks as per-document knowledge sections for a prompt.

    Chunks are admitted in order until ~97% of *max_tokens* is consumed, then
    grouped by source document together with the document's meta fields.

    Args:
        kbinfos: retrieval result; each entry of ``kbinfos["chunks"]`` carries
            "content_with_weight", "doc_id", "docnm_kwd" and optionally "url".
        max_tokens: token budget of the target chat model.

    Returns:
        list[str]: one formatted text section per source document.
    """
    from api.db.services.document_service import DocumentService

    knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
    # Remember the full retrieval size before any truncation, for logging.
    total_chunks = len(knowledges)
    used_token_count = 0
    chunks_num = 0
    for i, c in enumerate(knowledges):
        used_token_count += num_tokens_from_string(c)
        chunks_num += 1
        if max_tokens * 0.97 < used_token_count:
            # BUGFIX: previously `knowledges` was truncated to `[:i]` BEFORE
            # logging, so the denominator always read the truncated count
            # ((i+1)/i). The truncated list was dead anyway — it is rebuilt
            # below from doc2chunks — so the truncation is dropped and the
            # original total is logged instead.
            logging.warning(f"Not all the retrieval into prompt: {i + 1}/{total_chunks}")
            break

    docs = DocumentService.get_by_ids([ck["doc_id"] for ck in kbinfos["chunks"][:chunks_num]])
    docs = {d.id: d.meta_fields for d in docs}

    # Group the admitted chunks by their source document name.
    doc2chunks = defaultdict(lambda: {"chunks": [], "meta": []})
    for i, ck in enumerate(kbinfos["chunks"][:chunks_num]):
        cnt = f"---\nID: {i}\n" + (f"URL: {ck['url']}\n" if "url" in ck else "")
        cnt += ck["content_with_weight"]
        doc2chunks[ck["docnm_kwd"]]["chunks"].append(cnt)
        doc2chunks[ck["docnm_kwd"]]["meta"] = docs.get(ck["doc_id"], {})

    knowledges = []
    for nm, cks_meta in doc2chunks.items():
        txt = f"\nDocument: {nm} \n"
        for k, v in cks_meta["meta"].items():
            txt += f"{k}: {v}\n"
        txt += "Relevant fragments as following:\n"
        for i, chunk in enumerate(cks_meta["chunks"], 1):
            txt += f"{chunk}\n"
        knowledges.append(txt)
    return knowledges
2025-03-11 19:56:21 +08:00
def citation_prompt():
    """Return the system-prompt section instructing the LLM how to cite chunks.

    The prompt demands the ``[ID:i]`` citation format and includes a worked
    example grounded in a small mock knowledge base.
    """
    # BUGFIX: removed a leftover debug statement `print("USE PROMPT", flush=True)`
    # that polluted stdout on every call; use logging if tracing is ever needed.
    return """

# Citation requirements:

- Use a uniform citation format of like [ID:i] [ID:j], where "i" and "j" are the document ID enclosed in square brackets. Separate multiple IDs with spaces (e.g., [ID:0] [ID:1]).
- Citation markers must be placed at the end of a sentence, separated by a space from the final punctuation (e.g., period, question mark). A maximum of 4 citations are allowed per sentence.
- DO NOT insert CITATION in the answer if the content is not from retrieved chunks.
- DO NOT use standalone Document IDs (e.g., '#ID#').
- Citations ALWAYS in the "[ID:i]" format.
- STRICTLY prohibit the use of strikethrough symbols (e.g., ~~) or any other non-standard formatting syntax.
- Any failure to adhere to the above rules, including but not limited to incorrect formatting, use of prohibited styles, or unsupported citations, will be considered an error, and no citation will be added for that sentence.

--- Example START ---
<SYSTEM>: Here is the knowledge base:

Document: Elon Musk Breaks Silence on Crypto, Warns Against Dogecoin ...
URL: https://blockworks.co/news/elon-musk-crypto-dogecoin
ID: 0
The Tesla co-founder advised against going all-in on dogecoin, but Elon Musk said it's still his favorite crypto...

Document: Elon Musk's Dogecoin tweet sparks social media frenzy
ID: 1
Musk said he is 'willing to serve' D.O.G.E. - shorthand for Dogecoin.

Document: Causal effect of Elon Musk tweets on Dogecoin price
ID: 2
If you think of Dogecoin - the cryptocurrency based on a meme - you can't help but also think of Elon Musk...

Document: Elon Musk's Tweet Ignites Dogecoin's Future In Public Services
ID: 3
The market is heating up after Elon Musk's announcement about Dogecoin. Is this a new era for crypto?...

      The above is the knowledge base.

<USER>: What's the Elon's view on dogecoin?

<ASSISTANT>: Musk has consistently expressed his fondness for Dogecoin, often citing its humor and the inclusion of dogs in its branding. He has referred to it as his favorite cryptocurrency [ID:0] [ID:1].
Recently, Musk has hinted at potential future roles for Dogecoin. His tweets have sparked speculation about Dogecoin's potential integration into public services [ID:3].
Overall, while Musk enjoys Dogecoin and often promotes it, he also warns against over-investing in it, reflecting both his personal amusement and caution regarding its speculative nature.

--- Example END ---

"""
2025-02-26 15:40:52 +08:00
def keyword_extraction(chat_mdl, content, topn=3):
    """Ask *chat_mdl* for the top-N keywords/phrases of *content*.

    Returns the model's comma-separated keyword string, or "" when the model
    reports an error.
    """
    prompt = f"""
Role: You're a text analyzer.
Task: extract the most important keywords/phrases of a given piece of text content.
Requirements:
  - Summarize the text content, and give top {topn} important keywords/phrases.
  - The keywords MUST be in language of the given piece of text content.
  - The keywords are delimited by ENGLISH COMMA.
  - Keywords ONLY in output.

### Text Content
{content}

"""
    messages = [{"role": "system", "content": prompt}, {"role": "user", "content": "Output: "}]
    _, messages = message_fit_in(messages, chat_mdl.max_length)
    answer = chat_mdl.chat(prompt, messages[1:], {"temperature": 0.2})
    if isinstance(answer, tuple):
        answer = answer[0]
    # Strip any leading chain-of-thought block emitted by reasoning models.
    answer = re.sub(r"^.*</think>", "", answer, flags=re.DOTALL)
    return "" if "**ERROR**" in answer else answer
def question_proposal(chat_mdl, content, topn=3):
    """Ask *chat_mdl* to propose the top-N questions answerable from *content*.

    Returns the model's newline-separated questions, or "" when the model
    reports an error.
    """
    prompt = f"""
Role: You're a text analyzer.
Task: propose {topn} questions about a given piece of text content.
Requirements:
  - Understand and summarize the text content, and propose top {topn} important questions.
  - The questions SHOULD NOT have overlapping meanings.
  - The questions SHOULD cover the main content of the text as much as possible.
  - The questions MUST be in language of the given piece of text content.
  - One question per line.
  - Question ONLY in output.

### Text Content
{content}

"""
    messages = [{"role": "system", "content": prompt}, {"role": "user", "content": "Output: "}]
    _, messages = message_fit_in(messages, chat_mdl.max_length)
    answer = chat_mdl.chat(prompt, messages[1:], {"temperature": 0.2})
    if isinstance(answer, tuple):
        answer = answer[0]
    # Strip any leading chain-of-thought block emitted by reasoning models.
    answer = re.sub(r"^.*</think>", "", answer, flags=re.DOTALL)
    return "" if "**ERROR**" in answer else answer
2025-03-07 13:48:13 +08:00
def full_question(tenant_id, llm_id, messages, language=None):
    """Rewrite the latest user question into a standalone ("full") question.

    Uses the conversation history so the refined question makes sense without
    context, and resolves relative dates (e.g. "yesterday") to absolute ones.

    Args:
        tenant_id: tenant owning the LLM binding.
        llm_id: model id; image2text models are bound with the matching type.
        messages: chat history; only "user"/"assistant" turns are used.
        language: optional language the generated question must be in.

    Returns:
        The refined question, or the last message's content if the model errored.
    """
    from api.db.services.llm_service import LLMBundle

    if llm_id2llm_type(llm_id) == "image2text":
        chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)
    else:
        chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id)
    # Flatten the history into "ROLE: content" lines for the prompt.
    conv = []
    for m in messages:
        if m["role"] not in ["user", "assistant"]:
            continue
        conv.append("{}: {}".format(m["role"].upper(), m["content"]))
    conv = "\n".join(conv)
    # Anchor dates so the model can resolve relative references deterministically.
    today = datetime.date.today().isoformat()
    yesterday = (datetime.date.today() - datetime.timedelta(days=1)).isoformat()
    tomorrow = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
    prompt = f"""
Role: A helpful assistant

Task and steps:
    1. Generate a full user question that would follow the conversation.
    2. If the user's question involves relative date, you need to convert it into absolute date based on the current date, which is {today}. For example: 'yesterday' would be converted to {yesterday}.

Requirements & Restrictions:
  - If the user's latest question is completely, don't do anything, just return the original question.
  - DON'T generate anything except a refined question."""
    if language:
        prompt += f"""
  - Text generated MUST be in {language}."""
    else:
        prompt += """
  - Text generated MUST be in the same language of the original user's question.
"""
    prompt += f"""

######################
-Examples-
######################

# Example 1
## Conversation
USER: What is the name of Donald Trump's father?
ASSISTANT: Fred Trump.
USER: And his mother?
###############
Output: What's the name of Donald Trump's mother?

------------
# Example 2
## Conversation
USER: What is the name of Donald Trump's father?
ASSISTANT: Fred Trump.
USER: And his mother?
ASSISTANT: Mary Trump.
User: What's her full name?
###############
Output: What's the full name of Donald Trump's mother Mary Trump?

------------
# Example 3
## Conversation
USER: What's the weather today in London?
ASSISTANT: Cloudy.
USER: What's about tomorrow in Rochester?
###############
Output: What's the weather in Rochester on {tomorrow}?

######################
# Real Data
## Conversation
{conv}
###############
"""
    ans = chat_mdl.chat(prompt, [{"role": "user", "content": "Output: "}], {"temperature": 0.2})
    # Strip any leading chain-of-thought block emitted by reasoning models.
    ans = re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
    # Fall back to the raw last message when the model reports an error.
    return ans if ans.find("**ERROR**") < 0 else messages[-1]["content"]
2025-05-29 10:03:51 +08:00
2025-05-09 15:32:02 +08:00
def cross_languages(tenant_id, llm_id, query, languages=None):
    """Translate *query* into each of *languages* with the tenant's LLM.

    Args:
        tenant_id: tenant owning the LLM binding.
        llm_id: model id; image2text models are bound with the matching type.
        query: source text to translate.
        languages: target language names; None/empty means no targets.

    Returns:
        The translations joined by newlines, or the original *query* when the
        model reports an error.
    """
    from api.db.services.llm_service import LLMBundle

    # BUGFIX: `languages=[]` was a shared mutable default argument.
    languages = languages or []

    if llm_id and llm_id2llm_type(llm_id) == "image2text":
        chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)
    else:
        chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id)

    sys_prompt = """
Act as a streamlined multilingual translator. Strictly output translations separated by ### without any explanations or formatting. Follow these rules:

1. Accept batch translation requests in format:
[source text]
===
[target languages separated by commas]

2. Always maintain:
- Original formatting (tables/lists/spacing)
- Technical terminology accuracy
- Cultural context appropriateness

3. Output format:
[language1 translation]
###
[language2 translation]

**Examples:**

Input:
Hello World! Let's discuss AI safety.
===
Chinese, French, Japanese

Output:
你好世界！让我们讨论人工智能安全问题。
###
Bonjour le monde ! Parlons de la sécurité de l'IA.
###
こんにちは世界！AIの安全性について話し合いましょう。
"""
    user_prompt = f"""
Input:
{query}
===
{", ".join(languages)}

Output:
"""
    ans = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_prompt}], {"temperature": 0.2})
    # Strip any leading chain-of-thought block emitted by reasoning models.
    ans = re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
    if ans.find("**ERROR**") >= 0:
        return query
    # BUGFIX: the answer was split on "===" although the system prompt
    # instructs the model to separate translations with "###", so the split
    # never fired; split on the separator the prompt actually demands.
    return "\n".join([a for a in re.sub(r"(^Output:|\n+)", "", ans, flags=re.DOTALL).split("###") if a.strip()])
2025-02-26 15:40:52 +08:00
def content_tagging(chat_mdl, content, all_tags, examples, topn=3):
    """Tag *content* with the top-N most relevant tags from *all_tags*.

    Builds a few-shot prompt from *examples* (each carrying text plus its
    TAG_FLD tag->score mapping), queries *chat_mdl*, and parses the JSON
    answer, repairing it if necessary.

    Args:
        chat_mdl: chat model bundle exposing ``chat`` and ``max_length``.
        content: text to tag.
        all_tags: the full allowed tag set.
        examples: few-shot examples with "content" and TAG_FLD keys.
        topn: number of tags to request.

    Returns:
        dict mapping tag name -> integer relevance score.

    Raises:
        Exception: when the model reports **ERROR** or JSON repair fails.
    """
    prompt = f"""
Role: You're a text analyzer.

Task: Tag (put on some labels) to a given piece of text content based on the examples and the entire tag set.

Steps::
  - Comprehend the tag/label set.
  - Comprehend examples which all consist of both text content and assigned tags with relevance score in format of JSON.
  - Summarize the text content, and tag it with top {topn} most relevant tags from the set of tag/label and the corresponding relevance score.

Requirements
  - The tags MUST be from the tag set.
  - The output MUST be in JSON format only, the key is tag and the value is its relevance score.
  - The relevance score must be range from 1 to 10.
  - Keywords ONLY in output.

# TAG SET
{", ".join(all_tags)}

"""
    # Append each few-shot example with its expected JSON output.
    for i, ex in enumerate(examples):
        prompt += """
# Examples {}
### Text Content
{}

Output:
{}

""".format(i, ex["content"], json.dumps(ex[TAG_FLD], indent=2, ensure_ascii=False))

    prompt += f"""
# Real Data
### Text Content
{content}

"""
    msg = [{"role": "system", "content": prompt}, {"role": "user", "content": "Output: "}]
    _, msg = message_fit_in(msg, chat_mdl.max_length)
    kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.5})
    if isinstance(kwd, tuple):
        kwd = kwd[0]
    # Strip any leading chain-of-thought block emitted by reasoning models.
    kwd = re.sub(r"^.*</think>", "", kwd, flags=re.DOTALL)
    if kwd.find("**ERROR**") >= 0:
        raise Exception(kwd)

    try:
        obj = json_repair.loads(kwd)
    except json_repair.JSONDecodeError:
        try:
            # Fallback: strip echoed prompt/roles, keep only the outermost
            # {...} span, and retry the repair.
            result = kwd.replace(prompt[:-1], "").replace("user", "").replace("model", "").strip()
            result = "{" + result.split("{")[1].split("}")[0] + "}"
            obj = json_repair.loads(result)
        except Exception as e:
            logging.exception(f"JSON parsing error: {result} -> {e}")
            raise e

    # Coerce scores to int; silently drop entries with non-numeric scores.
    res = {}
    for k, v in obj.items():
        try:
            res[str(k)] = int(v)
        except Exception:
            pass
    return res
2025-03-18 14:52:20 +08:00
def vision_llm_describe_prompt(page=None) -> str:
    """Build the transcription prompt for a vision LLM reading a PDF page image.

    Args:
        page: optional page number; when given, the model is asked to append
            a ``--- Page N ---`` divider at the end of its transcription.

    Returns:
        The complete instruction prompt as a single string.
    """
    parts = [
        """
INSTRUCTION:
Transcribe the content from the provided PDF page image into clean Markdown format.
- Only output the content transcribed from the image.
- Do NOT output this instruction or any other explanation.
- If the content is missing or you do not understand the input, return an empty string.

RULES:
1. Do NOT generate examples, demonstrations, or templates.
2. Do NOT output any extra text such as 'Example', 'Example Output', or similar.
3. Do NOT generate any tables, headings, or content that is not explicitly present in the image.
4. Transcribe content word-for-word. Do NOT modify, translate, or omit any content.
5. Do NOT explain Markdown or mention that you are using Markdown.
6. Do NOT wrap the output in ```markdown or ``` blocks.
7. Only apply Markdown structure to headings, paragraphs, lists, and tables, strictly based on the layout of the image. Do NOT create tables unless an actual table exists in the image.
8. Preserve the original language, information, and order exactly as shown in the image.
"""
    ]
    if page is not None:
        parts.append(f"\nAt the end of the transcription, add the page divider: `--- Page {page} ---`.")
    parts.append(
        """
FAILURE HANDLING:
- If you do not detect valid content in the image, return an empty string.
"""
    )
    return "".join(parts)
2025-03-20 09:39:32 +08:00
def vision_llm_figure_describe_prompt() -> str:
    """Return the analysis prompt for describing charts/figures with a vision LLM."""
    return """
You are an expert visual data analyst. Analyze the image and provide a comprehensive description of its content. Focus on identifying the type of visual data representation (e.g., bar chart, pie chart, line graph, table, flowchart), its structure, and any text captions or labels included in the image.

Tasks:
1. Describe the overall structure of the visual representation. Specify if it is a chart, graph, table, or diagram.
2. Identify and extract any axes, legends, titles, or labels present in the image. Provide the exact text where available.
3. Extract the data points from the visual elements (e.g., bar heights, line graph coordinates, pie chart segments, table rows and columns).
4. Analyze and explain any trends, comparisons, or patterns shown in the data.
5. Capture any annotations, captions, or footnotes, and explain their relevance to the image.
6. Only include details that are explicitly present in the image. If an element (e.g., axis, legend, or caption) does not exist or is not visible, do not mention it.

Output format (include only sections relevant to the image content):
- Visual Type: [Type]
- Title: [Title text, if available]
- Axes / Legends / Labels: [Details, if available]
- Data Points: [Extracted data]
- Trends / Insights: [Analysis and interpretation]
- Captions / Annotations: [Text and relevance, if available]

Ensure high accuracy, clarity, and completeness in your analysis, and includes only the information present in the image. Avoid unnecessary statements about missing elements.
"""