From 0283e4098f842aa6fbd2d6ccb372af2c596449fc Mon Sep 17 00:00:00 2001
From: Günter Lukas
Date: Fri, 10 Oct 2025 13:18:24 +0200
Subject: [PATCH] Fix #10408 (#10471)

### What problem does this PR solve?

The Google Cloud model provider does not work correctly with gemini-2.5 models.
Close #10408

### Type of change

- [X] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Kevin Hu
---
 api/db/db_models.py   |  6 +++++-
 rag/llm/chat_model.py | 20 +++++++++++++++-----
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/api/db/db_models.py b/api/db/db_models.py
index 7f2e35497..c1a5fd5ed 100644
--- a/api/db/db_models.py
+++ b/api/db/db_models.py
@@ -641,7 +641,7 @@ class TenantLLM(DataBaseModel):
     llm_factory = CharField(max_length=128, null=False, help_text="LLM factory name", index=True)
     model_type = CharField(max_length=128, null=True, help_text="LLM, Text Embedding, Image2Text, ASR", index=True)
     llm_name = CharField(max_length=128, null=True, help_text="LLM name", default="", index=True)
-    api_key = CharField(max_length=2048, null=True, help_text="API KEY", index=True)
+    api_key = TextField(null=True, help_text="API KEY")
     api_base = CharField(max_length=255, null=True, help_text="API Base")
     max_tokens = IntegerField(default=8192, index=True)
     used_tokens = IntegerField(default=0, index=True)
@@ -1142,4 +1142,8 @@ def migrate_db():
         migrate(migrator.add_column("knowledgebase", "mindmap_task_finish_at", CharField(null=True)))
     except Exception:
         pass
+    try:
+        migrate(migrator.alter_column_type("tenant_llm", "api_key", TextField(null=True, help_text="API KEY")))
+    except Exception:
+        pass
     logging.disable(logging.NOTSET)
diff --git a/rag/llm/chat_model.py b/rag/llm/chat_model.py
index 2a77a45eb..91507d147 100644
--- a/rag/llm/chat_model.py
+++ b/rag/llm/chat_model.py
@@ -145,7 +145,7 @@ class Base(ABC):
             logging.info(f"[INFO] {self.model_name} detected as reasoning model, using _chat_streamly")
 
             final_ans = ""
-            tol_token = 0
+            tol_token = 0
             for delta, tol in self._chat_streamly(history, gen_conf, with_reasoning=False, **kwargs):
                 if delta.startswith("<think>") or delta.endswith("</think>"):
                     continue
@@ -156,7 +156,7 @@ class Base(ABC):
                 final_ans = "**ERROR**: Empty response from reasoning model"
 
             return final_ans.strip(), tol_token
-
+
         if self.model_name.lower().find("qwen3") >= 0:
             kwargs["extra_body"] = {"enable_thinking": False}
 
@@ -1182,6 +1182,7 @@ class GoogleChat(Base):
         else:
             if "max_tokens" in gen_conf:
                 gen_conf["max_output_tokens"] = gen_conf["max_tokens"]
+                del gen_conf["max_tokens"]
             for k in list(gen_conf.keys()):
                 if k not in ["temperature", "top_p", "max_output_tokens"]:
                     del gen_conf[k]
@@ -1189,6 +1190,7 @@ class GoogleChat(Base):
 
     def _chat(self, history, gen_conf={}, **kwargs):
         system = history[0]["content"] if history and history[0]["role"] == "system" else ""
+        gen_conf = self._clean_conf(gen_conf)
         if "claude" in self.model_name:
             response = self.client.messages.create(
                 model=self.model_name,
@@ -1250,9 +1252,12 @@ class GoogleChat(Base):
                 yield total_tokens
 
         else:
+            response = None
+            total_tokens = 0
             self.client._system_instruction = system
             if "max_tokens" in gen_conf:
                 gen_conf["max_output_tokens"] = gen_conf["max_tokens"]
+                del gen_conf["max_tokens"]
             for k in list(gen_conf.keys()):
                 if k not in ["temperature", "top_p", "max_output_tokens"]:
                     del gen_conf[k]
@@ -1260,18 +1265,23 @@ class GoogleChat(Base):
                 if "role" in item and item["role"] == "assistant":
                     item["role"] = "model"
                 if "content" in item:
-                    item["parts"] = item.pop("content")
+                    item["parts"] = [
+                        {
+                            "text": item.pop("content"),
+                        }
+                    ]
             ans = ""
             try:
-                response = self.model.generate_content(history, generation_config=gen_conf, stream=True)
+                response = self.client.generate_content(history, generation_config=gen_conf, stream=True)
                 for resp in response:
                     ans = resp.text
+                    total_tokens += num_tokens_from_string(ans)
                     yield ans
 
             except Exception as e:
                 yield ans + "\n**ERROR**: " + str(e)
 
-            yield response._chunks[-1].usage_metadata.total_token_count
+            yield total_tokens
 
 
 class GPUStackChat(Base):
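For readers unfamiliar with the Gemini request shape, here is a minimal standalone sketch of the three behaviors the patched `GoogleChat` branch relies on. It is not part of the patch: the model name, API key, and prompt are placeholders, and it assumes the `google-generativeai` client that `chat_model.py` wraps.

```python
# Illustrative sketch only (not part of the patch). Assumes the google-generativeai
# client used by GoogleChat; model name, key, and prompt are placeholders.
import google.generativeai as genai

genai.configure(api_key="YOUR_API_KEY")  # placeholder key
model = genai.GenerativeModel("gemini-2.5-flash")

# 1. Gemini rejects the OpenAI-style "max_tokens" key; the patch renames it to
#    "max_output_tokens" and deletes the original key before filtering gen_conf.
gen_conf = {"temperature": 0.7, "max_tokens": 1024}
if "max_tokens" in gen_conf:
    gen_conf["max_output_tokens"] = gen_conf.pop("max_tokens")

# 2. Each history item carries its text as a "parts" list instead of a bare
#    "content" string, which is the shape gemini-2.5 expects.
history = [{"role": "user", "parts": [{"text": "Say hello."}]}]

# 3. Token usage is accumulated per streamed chunk rather than being read from
#    the private response._chunks attribute after streaming ends.
total_tokens = 0
for chunk in model.generate_content(history, generation_config=gen_conf, stream=True):
    print(chunk.text, end="")
    total_tokens += len(chunk.text.split())  # rough stand-in for num_tokens_from_string
```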