2025-02-26 15:40:52 +08:00
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import json
import logging
import re
from collections import defaultdict
2025-03-18 14:52:20 +08:00
2025-02-26 15:40:52 +08:00
import json_repair
2025-03-18 14:52:20 +08:00
2025-03-14 09:54:38 +08:00
from api import settings
2025-02-26 15:40:52 +08:00
from api . db import LLMType
from rag . settings import TAG_FLD
2025-03-18 14:52:20 +08:00
from rag . utils import encoder , num_tokens_from_string
2025-02-26 15:40:52 +08:00
2025-02-26 19:45:22 +08:00
def chunks_format(reference):
    """Normalize the chunks of a retrieval *reference* dict into a stable schema.

    Each chunk may carry either internal field names (``chunk_id``,
    ``content_with_weight``, ``doc_id`` ...) or external ones (``id``,
    ``content``, ``document_id`` ...); for each pair the first available
    value wins. Missing fields come through as None.
    """

    def _pick(chunk, primary, fallback):
        # Prefer the primary key; fall back to the alternate spelling (or None).
        return chunk.get(primary, chunk.get(fallback))

    formatted = []
    for chunk in reference.get("chunks", []):
        formatted.append(
            {
                "id": _pick(chunk, "chunk_id", "id"),
                "content": _pick(chunk, "content", "content_with_weight"),
                "document_id": _pick(chunk, "doc_id", "document_id"),
                "document_name": _pick(chunk, "docnm_kwd", "document_name"),
                "dataset_id": _pick(chunk, "kb_id", "dataset_id"),
                "image_id": _pick(chunk, "image_id", "img_id"),
                "positions": _pick(chunk, "positions", "position_int"),
                "url": chunk.get("url"),
                "similarity": chunk.get("similarity"),
                "vector_similarity": chunk.get("vector_similarity"),
                "term_similarity": chunk.get("term_similarity"),
                "doc_type": chunk.get("doc_type_kwd"),
            }
        )
    return formatted
2025-02-26 19:45:22 +08:00
2025-02-26 15:40:52 +08:00
def llm_id2llm_type(llm_id):
    """Resolve the model type (e.g. chat vs image2text) for *llm_id*.

    Scans the factory LLM catalogue from ``settings.FACTORY_LLM_INFOS`` and
    returns a value derived from the matching entry's ``model_type``.
    Returns None implicitly when no catalogue entry matches.
    """
    from api.db.services.llm_service import TenantLLMService

    # The id may carry a factory suffix; keep only the bare model name.
    llm_id, *_ = TenantLLMService.split_model_name_and_factory(llm_id)

    llm_factories = settings.FACTORY_LLM_INFOS
    for llm_factory in llm_factories:
        for llm in llm_factory["llm"]:
            if llm_id == llm["llm_name"]:
                # NOTE(review): `[-1]` indexes the LAST CHARACTER of the
                # stripped model_type string, not the last comma-separated
                # field. Looks like a missing `.split(",")` before the index —
                # confirm against the catalogue's model_type format.
                return llm["model_type"].strip(",")[-1]
def message_fit_in(msg, max_length=4000):
    """Trim a chat message list so its total token count fits *max_length*.

    Returns ``(token_count, msg)``. The list and its message dicts may be
    mutated in place. Strategy, in order:
      1. If everything already fits, return unchanged.
      2. Drop all non-system messages except the very last one.
      3. Truncate whichever remaining message dominates the token budget.

    Args:
        msg: list of ``{"role": ..., "content": ...}`` dicts.
        max_length: token budget for the whole conversation.
    """

    def count():
        # Recount tokens over the (possibly rebound) outer `msg`.
        nonlocal msg
        tks_cnts = []
        for m in msg:
            tks_cnts.append({"role": m["role"], "count": num_tokens_from_string(m["content"])})
        total = 0
        for m in tks_cnts:
            total += m["count"]
        return total

    c = count()
    # Fast path: already within budget.
    if c < max_length:
        return c, msg

    # Keep only system message(s) plus the latest turn.
    msg_ = [m for m in msg if m["role"] == "system"]
    if len(msg) > 1:
        msg_.append(msg[-1])
    msg = msg_
    c = count()
    if c < max_length:
        return c, msg

    ll = num_tokens_from_string(msg_[0]["content"])
    ll2 = num_tokens_from_string(msg_[-1]["content"])
    if ll / (ll + ll2) > 0.8:
        # System prompt dominates: truncate it, leaving room for the tail.
        m = msg_[0]["content"]
        m = encoder.decode(encoder.encode(m)[: max_length - ll2])
        msg[0]["content"] = m
        return max_length, msg

    # Otherwise truncate the last message.
    # NOTE(review): this branch also subtracts ll2 (the tail's own size)
    # rather than ll (the system prompt's size) — verify the intended budget.
    m = msg_[-1]["content"]
    m = encoder.decode(encoder.encode(m)[: max_length - ll2])
    msg[-1]["content"] = m
    return max_length, msg
def kb_prompt(kbinfos, max_tokens):
    """Render retrieved chunks as per-document knowledge sections for a prompt.

    Chunks are admitted in order until ~97% of *max_tokens* is consumed, then
    grouped by source document together with the document's meta fields.

    Args:
        kbinfos: retrieval result; each entry of ``kbinfos["chunks"]`` carries
            "content_with_weight", "doc_id", "docnm_kwd" and optionally "url".
        max_tokens: token budget of the target chat model.

    Returns:
        list[str]: one formatted text section per source document.
    """
    from api.db.services.document_service import DocumentService

    knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
    # Remember the full retrieval size before any truncation, for logging.
    total_chunks = len(knowledges)
    used_token_count = 0
    chunks_num = 0
    for i, c in enumerate(knowledges):
        used_token_count += num_tokens_from_string(c)
        chunks_num += 1
        if max_tokens * 0.97 < used_token_count:
            # BUGFIX: previously `knowledges` was truncated to `[:i]` BEFORE
            # logging, so the denominator always read the truncated count
            # ((i+1)/i). The truncated list was dead anyway — it is rebuilt
            # below from doc2chunks — so the truncation is dropped and the
            # original total is logged instead.
            logging.warning(f"Not all the retrieval into prompt: {i + 1}/{total_chunks}")
            break

    docs = DocumentService.get_by_ids([ck["doc_id"] for ck in kbinfos["chunks"][:chunks_num]])
    docs = {d.id: d.meta_fields for d in docs}

    # Group the admitted chunks by their source document name.
    doc2chunks = defaultdict(lambda: {"chunks": [], "meta": []})
    for i, ck in enumerate(kbinfos["chunks"][:chunks_num]):
        cnt = f"---\nID: {i}\n" + (f"URL: {ck['url']}\n" if "url" in ck else "")
        cnt += ck["content_with_weight"]
        doc2chunks[ck["docnm_kwd"]]["chunks"].append(cnt)
        doc2chunks[ck["docnm_kwd"]]["meta"] = docs.get(ck["doc_id"], {})

    knowledges = []
    for nm, cks_meta in doc2chunks.items():
        txt = f"\nDocument: {nm} \n"
        for k, v in cks_meta["meta"].items():
            txt += f"{k}: {v}\n"
        txt += "Relevant fragments as following:\n"
        for i, chunk in enumerate(cks_meta["chunks"], 1):
            txt += f"{chunk}\n"
        knowledges.append(txt)
    return knowledges
2025-03-11 19:56:21 +08:00
def citation_prompt():
    """Return the system-prompt section instructing the LLM how to cite chunks.

    The prompt demands the ``[ID:i]`` citation format and includes a worked
    example grounded in a small mock knowledge base.
    """
    # BUGFIX: removed a leftover debug statement `print("USE PROMPT", flush=True)`
    # that polluted stdout on every call; use logging if tracing is ever needed.
    return """

# Citation requirements:

- Use a uniform citation format of like [ID:i] [ID:j], where "i" and "j" are the document ID enclosed in square brackets. Separate multiple IDs with spaces (e.g., [ID:0] [ID:1]).
- Citation markers must be placed at the end of a sentence, separated by a space from the final punctuation (e.g., period, question mark). A maximum of 4 citations are allowed per sentence.
- DO NOT insert CITATION in the answer if the content is not from retrieved chunks.
- DO NOT use standalone Document IDs (e.g., '#ID#').
- Citations ALWAYS in the "[ID:i]" format.
- STRICTLY prohibit the use of strikethrough symbols (e.g., ~~) or any other non-standard formatting syntax.
- Any failure to adhere to the above rules, including but not limited to incorrect formatting, use of prohibited styles, or unsupported citations, will be considered an error, and no citation will be added for that sentence.

--- Example START ---
<SYSTEM>: Here is the knowledge base:

Document: Elon Musk Breaks Silence on Crypto, Warns Against Dogecoin ...
URL: https://blockworks.co/news/elon-musk-crypto-dogecoin
ID: 0
The Tesla co-founder advised against going all-in on dogecoin, but Elon Musk said it's still his favorite crypto...

Document: Elon Musk's Dogecoin tweet sparks social media frenzy
ID: 1
Musk said he is 'willing to serve' D.O.G.E. - shorthand for Dogecoin.

Document: Causal effect of Elon Musk tweets on Dogecoin price
ID: 2
If you think of Dogecoin - the cryptocurrency based on a meme - you can't help but also think of Elon Musk...

Document: Elon Musk's Tweet Ignites Dogecoin's Future In Public Services
ID: 3
The market is heating up after Elon Musk's announcement about Dogecoin. Is this a new era for crypto?...

      The above is the knowledge base.

<USER>: What's the Elon's view on dogecoin?

<ASSISTANT>: Musk has consistently expressed his fondness for Dogecoin, often citing its humor and the inclusion of dogs in its branding. He has referred to it as his favorite cryptocurrency [ID:0] [ID:1].
Recently, Musk has hinted at potential future roles for Dogecoin. His tweets have sparked speculation about Dogecoin's potential integration into public services [ID:3].
Overall, while Musk enjoys Dogecoin and often promotes it, he also warns against over-investing in it, reflecting both his personal amusement and caution regarding its speculative nature.

--- Example END ---

"""
2025-02-26 15:40:52 +08:00
def keyword_extraction(chat_mdl, content, topn=3):
    """Ask *chat_mdl* for the top-N keywords/phrases of *content*.

    Returns the model's comma-separated keyword string, or "" when the model
    reports an error.
    """
    prompt = f"""
Role: You're a text analyzer.
Task: extract the most important keywords/phrases of a given piece of text content.
Requirements:
  - Summarize the text content, and give top {topn} important keywords/phrases.
  - The keywords MUST be in language of the given piece of text content.
  - The keywords are delimited by ENGLISH COMMA.
  - Keywords ONLY in output.

### Text Content
{content}

"""
    messages = [{"role": "system", "content": prompt}, {"role": "user", "content": "Output: "}]
    _, messages = message_fit_in(messages, chat_mdl.max_length)
    answer = chat_mdl.chat(prompt, messages[1:], {"temperature": 0.2})
    if isinstance(answer, tuple):
        answer = answer[0]
    # Strip any leading chain-of-thought block emitted by reasoning models.
    answer = re.sub(r"^.*</think>", "", answer, flags=re.DOTALL)
    return "" if "**ERROR**" in answer else answer
def question_proposal(chat_mdl, content, topn=3):
    """Ask *chat_mdl* to propose the top-N questions answerable from *content*.

    Returns the model's newline-separated questions, or "" when the model
    reports an error.
    """
    prompt = f"""
Role: You're a text analyzer.
Task: propose {topn} questions about a given piece of text content.
Requirements:
  - Understand and summarize the text content, and propose top {topn} important questions.
  - The questions SHOULD NOT have overlapping meanings.
  - The questions SHOULD cover the main content of the text as much as possible.
  - The questions MUST be in language of the given piece of text content.
  - One question per line.
  - Question ONLY in output.

### Text Content
{content}

"""
    messages = [{"role": "system", "content": prompt}, {"role": "user", "content": "Output: "}]
    _, messages = message_fit_in(messages, chat_mdl.max_length)
    answer = chat_mdl.chat(prompt, messages[1:], {"temperature": 0.2})
    if isinstance(answer, tuple):
        answer = answer[0]
    # Strip any leading chain-of-thought block emitted by reasoning models.
    answer = re.sub(r"^.*</think>", "", answer, flags=re.DOTALL)
    return "" if "**ERROR**" in answer else answer
2025-03-07 13:48:13 +08:00
def full_question(tenant_id, llm_id, messages, language=None):
    """Rewrite the latest user question into a standalone ("full") question.

    Uses the conversation history so the refined question makes sense without
    context, and resolves relative dates (e.g. "yesterday") to absolute ones.

    Args:
        tenant_id: tenant owning the LLM binding.
        llm_id: model id; image2text models are bound with the matching type.
        messages: chat history; only "user"/"assistant" turns are used.
        language: optional language the generated question must be in.

    Returns:
        The refined question, or the last message's content if the model errored.
    """
    from api.db.services.llm_service import LLMBundle

    if llm_id2llm_type(llm_id) == "image2text":
        chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)
    else:
        chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id)
    # Flatten the history into "ROLE: content" lines for the prompt.
    conv = []
    for m in messages:
        if m["role"] not in ["user", "assistant"]:
            continue
        conv.append("{}: {}".format(m["role"].upper(), m["content"]))
    conv = "\n".join(conv)
    # Anchor dates so the model can resolve relative references deterministically.
    today = datetime.date.today().isoformat()
    yesterday = (datetime.date.today() - datetime.timedelta(days=1)).isoformat()
    tomorrow = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
    prompt = f"""
Role: A helpful assistant

Task and steps:
    1. Generate a full user question that would follow the conversation.
    2. If the user's question involves relative date, you need to convert it into absolute date based on the current date, which is {today}. For example: 'yesterday' would be converted to {yesterday}.

Requirements & Restrictions:
  - If the user's latest question is completely, don't do anything, just return the original question.
  - DON'T generate anything except a refined question."""
    if language:
        prompt += f"""
  - Text generated MUST be in {language}."""
    else:
        prompt += """
  - Text generated MUST be in the same language of the original user's question.
"""
    prompt += f"""

######################
-Examples-
######################

# Example 1
## Conversation
USER: What is the name of Donald Trump's father?
ASSISTANT: Fred Trump.
USER: And his mother?
###############
Output: What's the name of Donald Trump's mother?

------------
# Example 2
## Conversation
USER: What is the name of Donald Trump's father?
ASSISTANT: Fred Trump.
USER: And his mother?
ASSISTANT: Mary Trump.
User: What's her full name?
###############
Output: What's the full name of Donald Trump's mother Mary Trump?

------------
# Example 3
## Conversation
USER: What's the weather today in London?
ASSISTANT: Cloudy.
USER: What's about tomorrow in Rochester?
###############
Output: What's the weather in Rochester on {tomorrow}?

######################
# Real Data
## Conversation
{conv}
###############
"""
    ans = chat_mdl.chat(prompt, [{"role": "user", "content": "Output: "}], {"temperature": 0.2})
    # Strip any leading chain-of-thought block emitted by reasoning models.
    ans = re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
    # Fall back to the raw last message when the model reports an error.
    return ans if ans.find("**ERROR**") < 0 else messages[-1]["content"]
2025-05-29 10:03:51 +08:00
2025-05-09 15:32:02 +08:00
def cross_languages(tenant_id, llm_id, query, languages=None):
    """Translate *query* into each of *languages* with the tenant's LLM.

    Args:
        tenant_id: tenant owning the LLM binding.
        llm_id: model id; image2text models are bound with the matching type.
        query: source text to translate.
        languages: target language names; None/empty means no targets.

    Returns:
        The translations joined by newlines, or the original *query* when the
        model reports an error.
    """
    from api.db.services.llm_service import LLMBundle

    # BUGFIX: `languages=[]` was a shared mutable default argument.
    languages = languages or []

    if llm_id and llm_id2llm_type(llm_id) == "image2text":
        chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)
    else:
        chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id)

    sys_prompt = """
Act as a streamlined multilingual translator. Strictly output translations separated by ### without any explanations or formatting. Follow these rules:

1. Accept batch translation requests in format:
[source text]
===
[target languages separated by commas]

2. Always maintain:
- Original formatting (tables/lists/spacing)
- Technical terminology accuracy
- Cultural context appropriateness

3. Output format:
[language1 translation]
###
[language2 translation]

**Examples:**

Input:
Hello World! Let's discuss AI safety.
===
Chinese, French, Japanese

Output:
你好世界！让我们讨论人工智能安全问题。
###
Bonjour le monde ! Parlons de la sécurité de l'IA.
###
こんにちは世界！AIの安全性について話し合いましょう。
"""
    user_prompt = f"""
Input:
{query}
===
{", ".join(languages)}

Output:
"""
    ans = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_prompt}], {"temperature": 0.2})
    # Strip any leading chain-of-thought block emitted by reasoning models.
    ans = re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
    if ans.find("**ERROR**") >= 0:
        return query
    # BUGFIX: the answer was split on "===" although the system prompt
    # instructs the model to separate translations with "###", so the split
    # never fired; split on the separator the prompt actually demands.
    return "\n".join([a for a in re.sub(r"(^Output:|\n+)", "", ans, flags=re.DOTALL).split("###") if a.strip()])
2025-02-26 15:40:52 +08:00
def content_tagging(chat_mdl, content, all_tags, examples, topn=3):
    """Tag *content* with the top-N most relevant tags from *all_tags*.

    Builds a few-shot prompt from *examples* (each carrying text plus its
    TAG_FLD tag->score mapping), queries *chat_mdl*, and parses the JSON
    answer, repairing it if necessary.

    Args:
        chat_mdl: chat model bundle exposing ``chat`` and ``max_length``.
        content: text to tag.
        all_tags: the full allowed tag set.
        examples: few-shot examples with "content" and TAG_FLD keys.
        topn: number of tags to request.

    Returns:
        dict mapping tag name -> integer relevance score.

    Raises:
        Exception: when the model reports **ERROR** or JSON repair fails.
    """
    prompt = f"""
Role: You're a text analyzer.

Task: Tag (put on some labels) to a given piece of text content based on the examples and the entire tag set.

Steps::
  - Comprehend the tag/label set.
  - Comprehend examples which all consist of both text content and assigned tags with relevance score in format of JSON.
  - Summarize the text content, and tag it with top {topn} most relevant tags from the set of tag/label and the corresponding relevance score.

Requirements
  - The tags MUST be from the tag set.
  - The output MUST be in JSON format only, the key is tag and the value is its relevance score.
  - The relevance score must be range from 1 to 10.
  - Keywords ONLY in output.

# TAG SET
{", ".join(all_tags)}

"""
    # Append each few-shot example with its expected JSON output.
    for i, ex in enumerate(examples):
        prompt += """
# Examples {}
### Text Content
{}

Output:
{}

""".format(i, ex["content"], json.dumps(ex[TAG_FLD], indent=2, ensure_ascii=False))

    prompt += f"""
# Real Data
### Text Content
{content}

"""
    msg = [{"role": "system", "content": prompt}, {"role": "user", "content": "Output: "}]
    _, msg = message_fit_in(msg, chat_mdl.max_length)
    kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.5})
    if isinstance(kwd, tuple):
        kwd = kwd[0]
    # Strip any leading chain-of-thought block emitted by reasoning models.
    kwd = re.sub(r"^.*</think>", "", kwd, flags=re.DOTALL)
    if kwd.find("**ERROR**") >= 0:
        raise Exception(kwd)

    try:
        obj = json_repair.loads(kwd)
    except json_repair.JSONDecodeError:
        try:
            # Fallback: strip echoed prompt/roles, keep only the outermost
            # {...} span, and retry the repair.
            result = kwd.replace(prompt[:-1], "").replace("user", "").replace("model", "").strip()
            result = "{" + result.split("{")[1].split("}")[0] + "}"
            obj = json_repair.loads(result)
        except Exception as e:
            logging.exception(f"JSON parsing error: {result} -> {e}")
            raise e

    # Coerce scores to int; silently drop entries with non-numeric scores.
    res = {}
    for k, v in obj.items():
        try:
            res[str(k)] = int(v)
        except Exception:
            pass
    return res
2025-03-18 14:52:20 +08:00
def vision_llm_describe_prompt(page=None) -> str:
    """Build the transcription prompt for a vision LLM reading a PDF page image.

    Args:
        page: optional page number; when given, the model is asked to append
            a ``--- Page N ---`` divider at the end of its transcription.

    Returns:
        The complete instruction prompt as a single string.
    """
    parts = [
        """
INSTRUCTION:
Transcribe the content from the provided PDF page image into clean Markdown format.
- Only output the content transcribed from the image.
- Do NOT output this instruction or any other explanation.
- If the content is missing or you do not understand the input, return an empty string.

RULES:
1. Do NOT generate examples, demonstrations, or templates.
2. Do NOT output any extra text such as 'Example', 'Example Output', or similar.
3. Do NOT generate any tables, headings, or content that is not explicitly present in the image.
4. Transcribe content word-for-word. Do NOT modify, translate, or omit any content.
5. Do NOT explain Markdown or mention that you are using Markdown.
6. Do NOT wrap the output in ```markdown or ``` blocks.
7. Only apply Markdown structure to headings, paragraphs, lists, and tables, strictly based on the layout of the image. Do NOT create tables unless an actual table exists in the image.
8. Preserve the original language, information, and order exactly as shown in the image.
"""
    ]
    if page is not None:
        parts.append(f"\nAt the end of the transcription, add the page divider: `--- Page {page} ---`.")
    parts.append(
        """
FAILURE HANDLING:
- If you do not detect valid content in the image, return an empty string.
"""
    )
    return "".join(parts)
2025-03-20 09:39:32 +08:00
def vision_llm_figure_describe_prompt() -> str:
    """Return the analysis prompt for describing charts/figures with a vision LLM."""
    return """
You are an expert visual data analyst. Analyze the image and provide a comprehensive description of its content. Focus on identifying the type of visual data representation (e.g., bar chart, pie chart, line graph, table, flowchart), its structure, and any text captions or labels included in the image.

Tasks:
1. Describe the overall structure of the visual representation. Specify if it is a chart, graph, table, or diagram.
2. Identify and extract any axes, legends, titles, or labels present in the image. Provide the exact text where available.
3. Extract the data points from the visual elements (e.g., bar heights, line graph coordinates, pie chart segments, table rows and columns).
4. Analyze and explain any trends, comparisons, or patterns shown in the data.
5. Capture any annotations, captions, or footnotes, and explain their relevance to the image.
6. Only include details that are explicitly present in the image. If an element (e.g., axis, legend, or caption) does not exist or is not visible, do not mention it.

Output format (include only sections relevant to the image content):
- Visual Type: [Type]
- Title: [Title text, if available]
- Axes / Legends / Labels: [Details, if available]
- Data Points: [Extracted data]
- Trends / Insights: [Analysis and interpretation]
- Captions / Annotations: [Text and relevance, if available]

Ensure high accuracy, clarity, and completeness in your analysis, and includes only the information present in the image. Avoid unnecessary statements about missing elements.
"""