mirror of https://github.com/Cinnamon/kotaemon.git
synced 2025-06-26 23:19:56 +00:00

Feat/local endpoint llm (#148)

* serve local model in a different process from the app

Co-authored-by: albert <albert@cinnamon.is>
Co-authored-by: trducng <trungduc1992@gmail.com>

parent 2950e6ed02
commit df12dec732
.gitattributes (vendored, new file, 1 line)
@@ -0,0 +1 @@
+*.bat text eol=crlf
.gitignore (vendored, 3 changes)
@@ -466,4 +466,5 @@ examples/example1/assets
 storage/*

 # Conda and env storages
-install_dir/
+*install_dir/
+doc_env
@@ -5,7 +5,7 @@ from dataclasses import dataclass
 from enum import Enum
 from typing import Any, Dict, Literal, NamedTuple, Optional, Union

-from pydantic import Extra
+from pydantic import ConfigDict

 from kotaemon.base import LLMInterface
@@ -238,7 +238,7 @@ class AgentFinish(NamedTuple):
     log: str


-class AgentOutput(LLMInterface, extra=Extra.allow):  # type: ignore [call-arg]
+class AgentOutput(LLMInterface):
     """Output from an agent.

     Args:
@@ -248,6 +248,8 @@ class AgentOutput(LLMInterface, extra=Extra.allow):  # type: ignore [call-arg]
         error: The error message if any.
     """

+    model_config = ConfigDict(extra="allow")
+
     text: str
     type: str = "agent"
     agent_type: AgentType
@@ -1,4 +1,5 @@
 from .base import BaseEmbeddings
+from .endpoint_based import EndpointEmbeddings
 from .langchain_based import (
     AzureOpenAIEmbeddings,
     CohereEmbdeddings,
@@ -8,6 +9,7 @@ from .langchain_based import (

 __all__ = [
     "BaseEmbeddings",
+    "EndpointEmbeddings",
     "OpenAIEmbeddings",
     "AzureOpenAIEmbeddings",
     "CohereEmbdeddings",
libs/kotaemon/kotaemon/embeddings/endpoint_based.py (new file, 46 lines)
@@ -0,0 +1,46 @@
+import requests
+
+from kotaemon.base import Document, DocumentWithEmbedding
+
+from .base import BaseEmbeddings
+
+
+class EndpointEmbeddings(BaseEmbeddings):
+    """
+    An Embeddings component that uses an OpenAI API compatible endpoint.
+
+    Attributes:
+        endpoint_url (str): The url of an OpenAI API compatible endpoint.
+    """
+
+    endpoint_url: str
+
+    def run(
+        self, text: str | list[str] | Document | list[Document]
+    ) -> list[DocumentWithEmbedding]:
+        """
+        Generate embeddings from text
+
+        Args:
+            text (str | list[str] | Document | list[Document]): text to generate
+                embeddings from
+        Returns:
+            list[DocumentWithEmbedding]: embeddings
+        """
+        if not isinstance(text, list):
+            text = [text]
+
+        outputs = []
+
+        for item in text:
+            response = requests.post(
+                self.endpoint_url, json={"input": str(item)}
+            ).json()
+            outputs.append(
+                DocumentWithEmbedding(
+                    text=str(item),
+                    embedding=response["data"][0]["embedding"],
+                    total_tokens=response["usage"]["total_tokens"],
+                    prompt_tokens=response["usage"]["prompt_tokens"],
+                )
+            )
+
+        return outputs
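For reference, a minimal usage sketch of the new class, assuming a llama-cpp-python server is already listening locally (the URL mirrors the "local" embeddings entry in the settings change further down; it is not part of the class itself):

    from kotaemon.embeddings import EndpointEmbeddings

    embeddings = EndpointEmbeddings(
        endpoint_url="http://localhost:31415/v1/embeddings"
    )
    # one POST request is made per input item
    docs = embeddings.run(["hello world", "a second sentence"])
    print(docs[0].text, docs[0].embedding[:3])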
@@ -108,6 +108,9 @@ class CitationPipeline(BaseComponent):
             print(e)
             return None

+        if not llm_output.messages:
+            return None
+
         function_output = llm_output.messages[0].additional_kwargs["function_call"][
             "arguments"
         ]
@@ -126,6 +129,9 @@ class CitationPipeline(BaseComponent):
             print(e)
             return None

+        if not llm_output.messages:
+            return None
+
         function_output = llm_output.messages[0].additional_kwargs["function_call"][
             "arguments"
         ]
@@ -2,7 +2,7 @@ from kotaemon.base.schema import AIMessage, BaseMessage, HumanMessage, SystemMessage

 from .base import BaseLLM
 from .branching import GatedBranchingPipeline, SimpleBranchingPipeline
-from .chats import AzureChatOpenAI, ChatLLM, LlamaCppChat
+from .chats import AzureChatOpenAI, ChatLLM, EndpointChatLLM, LlamaCppChat
 from .completions import LLM, AzureOpenAI, LlamaCpp, OpenAI
 from .cot import ManualSequentialChainOfThought, Thought
 from .linear import GatedLinearPipeline, SimpleLinearPipeline
@@ -12,6 +12,7 @@ __all__ = [
     "BaseLLM",
     # chat-specific components
     "ChatLLM",
+    "EndpointChatLLM",
     "BaseMessage",
     "HumanMessage",
     "AIMessage",
@@ -1,5 +1,12 @@
 from .base import ChatLLM
+from .endpoint_based import EndpointChatLLM
 from .langchain_based import AzureChatOpenAI, LCChatMixin
 from .llamacpp import LlamaCppChat

-__all__ = ["ChatLLM", "AzureChatOpenAI", "LCChatMixin", "LlamaCppChat"]
+__all__ = [
+    "ChatLLM",
+    "EndpointChatLLM",
+    "AzureChatOpenAI",
+    "LCChatMixin",
+    "LlamaCppChat",
+]
libs/kotaemon/kotaemon/llms/chats/endpoint_based.py (new file, 85 lines)
@@ -0,0 +1,85 @@
+import requests
+
+from kotaemon.base import (
+    AIMessage,
+    BaseMessage,
+    HumanMessage,
+    LLMInterface,
+    SystemMessage,
+)
+
+from .base import ChatLLM
+
+
+class EndpointChatLLM(ChatLLM):
+    """
+    A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API
+    compatible endpoint.
+
+    Attributes:
+        endpoint_url (str): The url of an OpenAI API compatible endpoint.
+    """
+
+    endpoint_url: str
+
+    def run(
+        self, messages: str | BaseMessage | list[BaseMessage], **kwargs
+    ) -> LLMInterface:
+        """
+        Generate response from messages
+
+        Args:
+            messages (str | BaseMessage | list[BaseMessage]): history of messages to
+                generate response from
+            **kwargs: additional arguments to pass to the OpenAI API
+        Returns:
+            LLMInterface: generated response
+        """
+        if isinstance(messages, str):
+            input_ = [HumanMessage(content=messages)]
+        elif isinstance(messages, BaseMessage):
+            input_ = [messages]
+        else:
+            input_ = messages
+
+        def decide_role(message: BaseMessage):
+            if isinstance(message, SystemMessage):
+                return "system"
+            elif isinstance(message, AIMessage):
+                return "assistant"
+            else:
+                return "user"
+
+        request_json = {
+            "messages": [{"content": m.text, "role": decide_role(m)} for m in input_]
+        }
+
+        response = requests.post(self.endpoint_url, json=request_json).json()
+
+        content = ""
+        candidates = []
+        if response["choices"]:
+            candidates = [
+                each["message"]["content"]
+                for each in response["choices"]
+                if each["message"]["content"]
+            ]
+            content = candidates[0]
+
+        return LLMInterface(
+            content=content,
+            candidates=candidates,
+            completion_tokens=response["usage"]["completion_tokens"],
+            total_tokens=response["usage"]["total_tokens"],
+            prompt_tokens=response["usage"]["prompt_tokens"],
+        )
+
+    def invoke(
+        self, messages: str | BaseMessage | list[BaseMessage], **kwargs
+    ) -> LLMInterface:
+        """Same as run"""
+        return self.run(messages, **kwargs)
+
+    async def ainvoke(
+        self, messages: str | BaseMessage | list[BaseMessage], **kwargs
+    ) -> LLMInterface:
+        return self.invoke(messages, **kwargs)
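A minimal usage sketch for the chat counterpart, under the same assumption that the local server is running (the endpoint URL matches the "local" LLM entry in the settings change below):

    from kotaemon.base import HumanMessage, SystemMessage
    from kotaemon.llms import EndpointChatLLM

    llm = EndpointChatLLM(
        endpoint_url="http://localhost:31415/v1/chat/completions"
    )
    # run() accepts a plain string, a single message, or a message history
    result = llm.run(
        [SystemMessage(content="You are terse."), HumanMessage(content="What is 2 + 2?")]
    )
    print(result.content)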
@@ -12,7 +12,7 @@ user_cache_dir.mkdir(parents=True, exist_ok=True)

 COHERE_API_KEY = config("COHERE_API_KEY", default="")
 KH_MODE = "dev"
-KH_FEATURE_USER_MANAGEMENT = True
+KH_FEATURE_USER_MANAGEMENT = False
 KH_FEATURE_USER_MANAGEMENT_ADMIN = str(
     config("KH_FEATURE_USER_MANAGEMENT_ADMIN", default="admin")
 )
@@ -21,6 +21,8 @@ KH_FEATURE_USER_MANAGEMENT_PASSWORD = str(
 )
 KH_ENABLE_ALEMBIC = False
 KH_DATABASE = f"sqlite:///{user_cache_dir / 'sql.db'}"
+KH_FILESTORAGE_PATH = str(user_cache_dir / "files")
+
 KH_DOCSTORE = {
     "__type__": "kotaemon.storages.SimpleFileDocumentStore",
     "path": str(user_cache_dir / "docstore"),
@@ -29,51 +31,68 @@ KH_VECTORSTORE = {
     "__type__": "kotaemon.storages.ChromaVectorStore",
     "path": str(user_cache_dir / "vectorstore"),
 }
-KH_FILESTORAGE_PATH = str(user_cache_dir / "files")
 KH_LLMS = {
-    "gpt4": {
+    # example for using Azure OpenAI, the config variables can be set as environment
+    # variables or in the .env file
+    # "gpt4": {
+    #     "def": {
+    #         "__type__": "kotaemon.llms.AzureChatOpenAI",
+    #         "temperature": 0,
+    #         "azure_endpoint": config("AZURE_OPENAI_ENDPOINT", default=""),
+    #         "openai_api_key": config("AZURE_OPENAI_API_KEY", default=""),
+    #         "openai_api_version": config("OPENAI_API_VERSION", default=""),
+    #         "deployment_name": "<your deployment name>",
+    #         "stream": True,
+    #     },
+    #     "accuracy": 10,
+    #     "cost": 10,
+    #     "default": False,
+    # },
+    # "gpt35": {
+    #     "def": {
+    #         "__type__": "kotaemon.llms.AzureChatOpenAI",
+    #         "temperature": 0,
+    #         "azure_endpoint": config("AZURE_OPENAI_ENDPOINT", default=""),
+    #         "openai_api_key": config("AZURE_OPENAI_API_KEY", default=""),
+    #         "openai_api_version": config("OPENAI_API_VERSION", default=""),
+    #         "deployment_name": "<your deployment name>",
+    #         "request_timeout": 10,
+    #         "stream": False,
+    #     },
+    #     "accuracy": 5,
+    #     "cost": 5,
+    #     "default": False,
+    # },
+    "local": {
         "def": {
-            "__type__": "kotaemon.llms.AzureChatOpenAI",
-            "temperature": 0,
-            "azure_endpoint": config("AZURE_OPENAI_ENDPOINT", default=""),
-            "openai_api_key": config("AZURE_OPENAI_API_KEY", default=""),
-            "openai_api_version": config("OPENAI_API_VERSION", default=""),
-            "deployment_name": "dummy-q2",
-            "stream": True,
+            "__type__": "kotaemon.llms.EndpointChatLLM",
+            "endpoint_url": "http://localhost:31415/v1/chat/completions",
         },
         "accuracy": 10,
         "cost": 10,
         "default": False,
     },
     "gpt35": {
         "def": {
             "__type__": "kotaemon.llms.AzureChatOpenAI",
             "temperature": 0,
             "azure_endpoint": config("AZURE_OPENAI_ENDPOINT", default=""),
             "openai_api_key": config("AZURE_OPENAI_API_KEY", default=""),
             "openai_api_version": config("OPENAI_API_VERSION", default=""),
             "deployment_name": "dummy-q2",
             "request_timeout": 10,
             "stream": False,
         },
         "accuracy": 5,
         "cost": 5,
         "default": True,
     },
 }
 KH_EMBEDDINGS = {
-    "ada": {
+    # example for using Azure OpenAI, the config variables can be set as environment
+    # variables or in the .env file
+    # "ada": {
+    #     "def": {
+    #         "__type__": "kotaemon.embeddings.AzureOpenAIEmbeddings",
+    #         "model": "text-embedding-ada-002",
+    #         "azure_endpoint": config("AZURE_OPENAI_ENDPOINT", default=""),
+    #         "openai_api_key": config("AZURE_OPENAI_API_KEY", default=""),
+    #         "deployment": "<your deployment name>",
+    #         "chunk_size": 16,
+    #     },
+    #     "accuracy": 5,
+    #     "cost": 5,
+    #     "default": True,
+    # },
+    "local": {
         "def": {
-            "__type__": "kotaemon.embeddings.AzureOpenAIEmbeddings",
-            "model": "text-embedding-ada-002",
-            "azure_endpoint": config("AZURE_OPENAI_ENDPOINT", default=""),
-            "openai_api_key": config("AZURE_OPENAI_API_KEY", default=""),
-            "deployment": "dummy-q2-text-embedding",
-            "chunk_size": 16,
+            "__type__": "kotaemon.embeddings.EndpointEmbeddings",
+            "endpoint_url": "http://localhost:31415/v1/embeddings",
         },
         "accuracy": 5,
         "cost": 5,
-        "default": True,
+        "default": False,
     },
 }
 KH_REASONINGS = ["ktem.reasoning.simple.FullQAPipeline"]
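For orientation, a sketch of what the two new "local" declarations amount to once the "__type__" strings are resolved (the actual resolution goes through ktem's settings machinery, which is not part of this diff):

    from kotaemon.embeddings import EndpointEmbeddings
    from kotaemon.llms import EndpointChatLLM

    local_llm = EndpointChatLLM(
        endpoint_url="http://localhost:31415/v1/chat/completions"
    )
    local_embeddings = EndpointEmbeddings(
        endpoint_url="http://localhost:31415/v1/embeddings"
    )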
@@ -118,7 +118,7 @@ class DocumentRetrievalPipeline(BaseFileIndexRetriever):

         # rerank
         docs = self.vector_retrieval(text=text, top_k=top_k, **kwargs)
-        if self.get_from_path("reranker"):
+        if docs and self.get_from_path("reranker"):
             docs = self.reranker(docs, query=text)

         if not self.get_extra_table:
@@ -200,24 +200,37 @@ class AnswerWithContextPipeline(BaseComponent):
             lang=self.lang,
         )

-        citation_task = asyncio.create_task(
-            self.citation_pipeline.ainvoke(context=evidence, question=question)
-        )
-        print("Citation task created")
+        if evidence:
+            citation_task = asyncio.create_task(
+                self.citation_pipeline.ainvoke(context=evidence, question=question)
+            )
+            print("Citation task created")

         messages = []
         if self.system_prompt:
             messages.append(SystemMessage(content=self.system_prompt))
         messages.append(HumanMessage(content=prompt))

         output = ""
-        for text in self.llm.stream(messages):
-            output += text.text
-            self.report_output({"output": text.text})
-            await asyncio.sleep(0)
+        try:
+            # try streaming first
+            print("Trying LLM streaming")
+            for text in self.llm.stream(messages):
+                output += text.text
+                self.report_output({"output": text.text})
+                await asyncio.sleep(0)
+        except NotImplementedError:
+            print("Streaming is not supported, falling back to normal processing")
+            output = self.llm(messages).text
+            self.report_output({"output": output})

         # retrieve the citation
         print("Waiting for citation task")
-        citation = await citation_task
+        if evidence:
+            citation = await citation_task
+        else:
+            citation = None

         answer = Document(text=output, metadata={"citation": citation})

         return answer
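The streaming change above reduces to a simple pattern worth seeing in isolation (an illustrative sketch only; `llm` and `report` are hypothetical stand-ins for the pipeline's LLM component and its report_output callback):

    def generate_with_fallback(llm, messages, report):
        output = ""
        try:
            # endpoint-backed LLMs may not implement stream()
            for chunk in llm.stream(messages):
                output += chunk.text
                report(chunk.text)
        except NotImplementedError:
            # fall back to a single blocking call
            output = llm(messages).text
            report(output)
        return output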
@@ -2,4 +2,4 @@ from ktem.main import App

 app = App()
 demo = app.make()
-demo.queue().launch(favicon_path=app._favicon)
+demo.queue().launch(favicon_path=app._favicon, inbrowser=True)
@@ -12,23 +12,23 @@ function install_miniconda() {
     # Miniconda installer is limited to two main architectures: x86_64 and arm64
     local sys_arch=$(uname -m)
     case "${sys_arch}" in
-    x86_64*) sys_arch="x86_64";;
-    arm64*) sys_arch="aarch64";;
-    aarch64*) sys_arch="aarch64";;
-    *) {
-        echo "Unknown system architecture: ${sys_arch}! This script runs only on x86_64 or arm64"
-        exit 1
-    };;
+    x86_64*) sys_arch="x86_64" ;;
+    arm64*) sys_arch="aarch64" ;;
+    aarch64*) sys_arch="aarch64" ;;
+    *) {
+        echo "Unknown system architecture: ${sys_arch}! This script runs only on x86_64 or arm64"
+        exit 1
+    } ;;
     esac

     # if miniconda has not been installed, download and install it
-    if ! "${conda_root}/bin/conda" --version &>/dev/null ; then
+    if ! "${conda_root}/bin/conda" --version &>/dev/null; then
         if [ ! -d "$install_dir/miniconda_installer.sh" ]; then
             echo "Downloading Miniconda from $miniconda_url"
             local miniconda_url="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${sys_arch}.sh"

             mkdir -p "$install_dir"
-            curl -Lk "$miniconda_url" > "$install_dir/miniconda_installer.sh"
+            curl -Lk "$miniconda_url" >"$install_dir/miniconda_installer.sh"
         fi

         echo "Installing Miniconda to $conda_root"
@@ -64,7 +64,7 @@ function create_conda_env() {

 function activate_conda_env() {
     # deactivate the current env(s) to avoid conflicts
-    { conda deactivate && conda deactivate && conda deactivate; } 2> /dev/null
+    { conda deactivate && conda deactivate && conda deactivate; } 2>/dev/null

     # check if conda env is broken (because of interruption during creation)
     if [ ! -f "$env_dir/bin/python" ]; then
@@ -80,7 +80,7 @@ function activate_conda_env() {
     echo "Activate conda environment at $CONDA_PREFIX"
 }

-function deactivate_conda_env(){
+function deactivate_conda_env() {
     # Conda deactivate if we are in the right env
     if [ "$CONDA_PREFIX" == "$env_dir" ]; then
         conda deactivate
@@ -89,7 +89,7 @@ function deactivate_conda_env(){
 }

 function install_dependencies() {
-    if pip list 2> /dev/null | grep -q "kotaemon"; then
+    if pip list 2>/dev/null | grep -q "kotaemon"; then
         echo "Requirements are already installed"
     else
         local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]"
@@ -101,7 +101,7 @@ function install_dependencies() {
         echo "" && echo "Install ktem's requirements"
         python -m pip install -e "$ktem_root"

-        if ! pip list 2> /dev/null | grep -q "kotaemon"; then
+        if ! pip list 2>/dev/null | grep -q "kotaemon"; then
             echo "Installation failed. You may need to run the installer again."
             deactivate_conda_env
             exit 1
@@ -123,6 +123,10 @@ function install_dependencies() {
     fi
 }

+function setup_local_model() {
+    python $(pwd)/scripts/serve_local.py
+}
+
 function launch_ui() {
     gradio $(pwd)/libs/ktem/launch.py || {
         echo "" && echo "Will exit now..."
@@ -159,6 +163,9 @@ activate_conda_env
 print_highlight "Install requirements"
 install_dependencies

+print_highlight "Setting up a local model"
+setup_local_model
+
 print_highlight "Launching web UI. Please wait..."
 launch_ui
scripts/run_macos.sh (36 changes; mode: Normal file → Executable file)
@@ -12,22 +12,22 @@ function install_miniconda() {
     # Miniconda installer is limited to two main architectures: x86_64 and arm64
     local sys_arch=$(uname -m)
     case "${sys_arch}" in
-    x86_64*) sys_arch="x86_64";;
-    arm64*) sys_arch="arm64";;
-    *) {
-        echo "Unknown system architecture: ${sys_arch}! This script runs only on x86_64 or arm64"
-        exit 1
-    };;
+    x86_64*) sys_arch="x86_64" ;;
+    arm64*) sys_arch="arm64" ;;
+    *) {
+        echo "Unknown system architecture: ${sys_arch}! This script runs only on x86_64 or arm64"
+        exit 1
+    } ;;
     esac

     # if miniconda has not been installed, download and install it
-    if ! "${conda_root}/bin/conda" --version &>/dev/null ; then
+    if ! "${conda_root}/bin/conda" --version &>/dev/null; then
         if [ ! -d "$install_dir/miniconda_installer.sh" ]; then
             local miniconda_url="https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-${sys_arch}.sh"
             echo "Downloading Miniconda from $miniconda_url"

             mkdir -p "$install_dir"
-            curl -Lk "$miniconda_url" > "$install_dir/miniconda_installer.sh"
+            curl -Lk "$miniconda_url" >"$install_dir/miniconda_installer.sh"
         fi

         echo "Installing Miniconda to $conda_root"
@@ -63,7 +63,7 @@ function create_conda_env() {

 function activate_conda_env() {
     # deactivate the current env(s) to avoid conflicts
-    { conda deactivate && conda deactivate && conda deactivate; } 2> /dev/null
+    { conda deactivate && conda deactivate && conda deactivate; } 2>/dev/null

     # check if conda env is broken (because of interruption during creation)
     if [ ! -f "$env_dir/bin/python" ]; then
@@ -79,7 +79,7 @@ function activate_conda_env() {
     echo "Activate conda environment at $CONDA_PREFIX"
 }

-function deactivate_conda_env(){
+function deactivate_conda_env() {
     # Conda deactivate if we are in the right env
     if [[ "$CONDA_PREFIX" == "$env_dir" ]]; then
         conda deactivate
@@ -89,7 +89,7 @@ function deactivate_conda_env(){
 }

 function install_dependencies() {
     # check if the env is already setup by finding 'kotaemon' in 'pip list'
-    if pip list 2> /dev/null | grep -q "kotaemon"; then
+    if pip list 2>/dev/null | grep -q "kotaemon"; then
         echo "Requirements are already installed"
     else
         local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]"
@@ -101,7 +101,7 @@ function install_dependencies() {
         echo "" && echo "Install ktem's requirements"
         python -m pip install -e "$ktem_root"

-        if ! pip list 2> /dev/null | grep -q "kotaemon"; then
+        if ! pip list 2>/dev/null | grep -q "kotaemon"; then
             echo "Installation failed. You may need to run the installer again."
             deactivate_conda_env
             exit 1
@@ -124,6 +124,10 @@ function install_dependencies() {
     fi
 }

+function setup_local_model() {
+    python $(pwd)/scripts/serve_local.py
+}
+
 function launch_ui() {
     gradio $(pwd)/libs/ktem/launch.py || {
         echo "" && echo "Will exit now..."
@@ -141,7 +145,10 @@ function print_highlight() {
 # Main script execution

 # move two levels up from the dir where this script resides
-cd "$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" && cd ..
+cd "$(
+    cd -- "$(dirname "$0")" >/dev/null 2>&1
+    pwd -P
+)" && cd ..

 install_dir="$(pwd)/install_dir"
 conda_root="${install_dir}/conda"
@@ -160,6 +167,9 @@ activate_conda_env
 print_highlight "Install requirements"
 install_dependencies

+print_highlight "Setting up a local model"
+setup_local_model
+
 print_highlight "Launching web UI. Please wait..."
 launch_ui
@@ -14,6 +14,7 @@ IF %ERRORLEVEL% EQU 0 (
     ECHO The current workdir has whitespace which can lead to unintended behaviour. Please modify your path and continue later.
     GOTO :end
 )
+
 CALL :print_highlight "Setup Anaconda/Miniconda"
 CALL :download_and_install_miniconda
 :: check if function run fail, then exit the script
@@ -30,6 +31,10 @@ CALL :print_highlight "Install requirements"
 CALL :install_dependencies
 IF ERRORLEVEL 1 GOTO :end

+CALL :print_highlight "Setting up a local model"
+CALL :setup_local_model
+IF ERRORLEVEL 1 GOTO :end
+
 CALL :print_highlight "Launching web UI. Please wait..."
 CALL :launch_ui

@@ -126,6 +131,10 @@ IF %ERRORLEVEL% == 0 (
 )
 GOTO :eof

+:setup_local_model
+python "%CD%\scripts\serve_local.py"
+GOTO :eof
+
 :launch_ui
 CALL gradio "%CD%\libs\ktem\launch.py" || ( ECHO. && ECHO Will exit now... && GOTO :exit_func_with_error )
 GOTO :eof
scripts/serve_local.py (new file, 81 lines)
@@ -0,0 +1,81 @@
+import platform
+import subprocess
+from inspect import currentframe, getframeinfo
+from pathlib import Path
+
+import dotenv
+
+configs = dotenv.dotenv_values(".env")
+
+system_name = platform.system()
+
+cur_frame = currentframe()
+if cur_frame is None:
+    raise ValueError("Cannot get the current frame.")
+this_file = getframeinfo(cur_frame).filename
+this_dir = Path(this_file).parent
+
+
+def serve_llamacpp_python(local_model_file: Path, **kwargs):
+    def guess_chat_format(local_model_file):
+        model_name = local_model_file.stem
+
+        # handle known cases that the server backends handle incorrectly
+        # this is highly heuristic and should be expanded later
+        # server backends usually have logic for this but they could still be wrong
+        if "qwen" in model_name:
+            return "qwen"
+
+        return None
+
+    # default port
+    if "port" not in kwargs:
+        kwargs["port"] = 31415
+
+    chat_format = guess_chat_format(local_model_file)
+    if chat_format:
+        kwargs = {**kwargs, "chat_format": chat_format}
+
+    # these scripts create a separate conda env and run the server
+    if system_name == "Windows":
+        script_file = this_dir / "server_llamacpp_windows.bat"
+    elif system_name == "Linux":
+        script_file = this_dir / "server_llamacpp_linux.sh"
+    elif system_name == "Darwin":
+        script_file = this_dir / "server_llamacpp_macos.sh"
+    else:
+        raise ValueError(f"Unsupported system: {system_name}")
+
+    args = " ".join(f"--{k} {v}" for k, v in kwargs.items())
+
+    cmd = f"{script_file} --model {local_model_file} {args}"
+    subprocess.Popen(cmd, shell=True)
+
+
+def main():
+    local_model_file = configs.get("LOCAL_MODEL", "")
+
+    if not local_model_file:
+        print("LOCAL_MODEL not set in the `.env` file.")
+        return
+
+    local_model_file = Path(local_model_file)
+    if not local_model_file.exists():
+        print(f"Local model not found: {local_model_file}")
+        return
+
+    print(f"Local model found: {local_model_file}")
+    will_start_server = input("Do you want to use this local model? (y/n): ")
+
+    if will_start_server.lower().strip() not in ["y", "yes"]:
+        return
+
+    print("Starting the local server...")
+    if local_model_file.suffix == ".gguf":
+        serve_llamacpp_python(local_model_file)
+    else:
+        raise ValueError(f"Unsupported model file type: {local_model_file.suffix}")
+
+
+if __name__ == "__main__":
+    main()
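For illustration, what the script above ends up doing for a Qwen GGUF model on Linux (the model path here is hypothetical; the platform script, default port, and chat-format guess all come from the code above):

    from pathlib import Path

    model = Path("install_dir/models/qwen-7b-chat.gguf")  # hypothetical path
    serve_llamacpp_python(model)
    # spawns roughly:
    #   scripts/server_llamacpp_linux.sh --model install_dir/models/qwen-7b-chat.gguf \
    #       --port 31415 --chat_format qwen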
scripts/server_llamacpp_linux.sh (new Executable file, 95 lines)
@@ -0,0 +1,95 @@
+#!/bin/bash
+
+# functions used in the main code execution
+function print_highlight() {
+    local message="${1}"
+    echo "" && echo "******************************************************"
+    echo $message
+    echo "******************************************************" && echo ""
+}
+
+function path_sanity_check() {
+    echo "Path sanity checking"
+    if [[ $PWD =~ \ ]]; then
+        print_highlight "This script relies on Miniconda which can't be silently installed under a path with spaces. Please run it from a path without spaces."
+        exit 1
+    fi
+}
+
+function deactivate_environment() {
+    echo "Deactivate existing environment(s)"
+    # deactivate existing conda envs as needed to avoid conflicts
+    { conda deactivate && conda deactivate && conda deactivate; } 2>/dev/null
+}
+
+function check_conda_existence() {
+    echo "Check for conda existence"
+    conda_exists="F"
+
+    # figure out whether conda exists
+    if "$CONDA_ROOT_PREFIX/bin/conda" --version &>/dev/null; then conda_exists="T"; fi
+
+    # verify if conda is installed by the main app, if not then raise error
+    if [ "$conda_exists" == "F" ]; then
+        # test the conda binary
+        print_highlight "conda is not installed, seems like the app wasn't installed correctly."
+        exit
+    fi
+}
+
+function create_conda_environment() {
+    # create the environment if needed
+    if [ ! -e "$INSTALL_ENV_DIR" ]; then
+        echo "Create conda environment"
+        "$CONDA_ROOT_PREFIX/bin/conda" create -y -k --prefix "$INSTALL_ENV_DIR" python="$PYTHON_VERSION" || {
+            echo && print_highlight "Conda environment creation failed." && exit 1
+        }
+    fi
+
+    # check if conda environment was actually created
+    if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
+        print_highlight "Conda environment was not correctly created."
+        exit 1
+    fi
+}
+
+function isolate_environment() {
+    echo "Isolate environment"
+    export PYTHONNOUSERSITE=1
+    unset PYTHONPATH
+    unset PYTHONHOME
+}
+
+function activate_environment() {
+    echo "Activate conda environment"
+    source "$CONDA_ROOT_PREFIX/etc/profile.d/conda.sh" # otherwise conda complains about 'shell not initialized' (needed when running in a script)
+    conda activate "$INSTALL_ENV_DIR"
+}
+
+# main code execution
+
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+echo "Changed the current directory to: $(pwd)"
+
+path_sanity_check
+deactivate_environment
+
+# config
+ENV_NAME="llama-cpp-python-server"
+PYTHON_VERSION="3.10"
+CONDA_ROOT_PREFIX="$(pwd)/install_dir/conda"
+INSTALL_ENV_DIR="$(pwd)/install_dir/server_envs/${ENV_NAME}"
+
+check_conda_existence
+create_conda_environment
+isolate_environment
+activate_environment
+
+# install dependencies
+# ver 0.2.56 produces a segmentation fault for /embeddings on macOS
+python -m pip install llama-cpp-python[server]!=0.2.56
+
+# start the server with passed params
+python -m llama_cpp.server $@
+
+conda deactivate
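Once one of these server scripts is running, a quick smoke test against the OpenAI-compatible endpoints could look like the sketch below (it assumes the default port 31415 set by serve_local.py):

    import requests

    resp = requests.post(
        "http://localhost:31415/v1/chat/completions",
        json={"messages": [{"role": "user", "content": "ping"}]},
        timeout=60,
    )
    print(resp.json()["choices"][0]["message"]["content"])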
scripts/server_llamacpp_macos.sh (new Executable file, 96 lines)
@@ -0,0 +1,96 @@
+#!/bin/bash
+
+# functions used in the main code execution
+function print_highlight() {
+    local message="${1}"
+    echo "" && echo "******************************************************"
+    echo $message
+    echo "******************************************************" && echo ""
+}
+
+function path_sanity_check() {
+    echo "Path sanity checking"
+    if [[ "$(pwd)" =~ " " ]]; then
+        print_highlight "This script relies on Miniconda which can't be silently installed under a path with spaces. Please run it from a path without spaces."
+        exit 1
+    fi
+}
+
+function deactivate_environment() {
+    echo "Deactivate existing environment(s)"
+    # deactivate existing conda envs as needed to avoid conflicts
+    { conda deactivate && conda deactivate && conda deactivate; } 2>/dev/null
+}
+
+function check_conda_existence() {
+    echo "Check for conda existence"
+    conda_exists="F"
+
+    # figure out whether conda exists
+    if "$CONDA_ROOT_PREFIX/bin/conda" --version &>/dev/null; then conda_exists="T"; fi
+
+    # verify if conda is installed by the main app, if not then raise error
+    if [ "$conda_exists" == "F" ]; then
+        # test the conda binary
+        print_highlight "conda is not installed, seems like the app wasn't installed correctly."
+        exit
+    fi
+}
+
+function create_conda_environment() {
+    # create the environment if needed
+    if [ ! -d "${INSTALL_ENV_DIR}" ]; then
+        echo "Create conda environment"
+        "${CONDA_ROOT_PREFIX}/bin/conda" create -y -k --prefix "$INSTALL_ENV_DIR" python="$PYTHON_VERSION" || (echo && print_highlight "Conda environment creation failed." && exit 1)
+    fi
+
+    # check if conda environment was actually created
+    if [ ! -f "$INSTALL_ENV_DIR/bin/python" ]; then
+        print_highlight "Conda environment was not correctly created."
+        exit 1
+    fi
+}
+
+function isolate_environment() {
+    echo "Isolate environment"
+    export PYTHONNOUSERSITE=1
+    unset PYTHONPATH
+    unset PYTHONHOME
+}
+
+function activate_environment() {
+    echo "Activate conda environment"
+    source "$CONDA_ROOT_PREFIX/etc/profile.d/conda.sh" # otherwise conda complains about 'shell not initialized' (needed when running in a script)
+    conda activate "$INSTALL_ENV_DIR"
+}
+
+# main code execution
+
+cd "$(
+    cd -- "$(dirname "$0")" >/dev/null 2>&1
+    pwd -P
+)" && cd ..
+echo "Changed the current directory to: $(pwd)"
+
+path_sanity_check
+deactivate_environment
+
+# config
+ENV_NAME="llama-cpp-python-server"
+PYTHON_VERSION="3.10"
+CONDA_ROOT_PREFIX="$(pwd)/install_dir/conda"
+INSTALL_ENV_DIR="$(pwd)/install_dir/server_envs/${ENV_NAME}"
+
+check_conda_existence
+create_conda_environment
+isolate_environment
+activate_environment
+
+# install dependencies
+# ver 0.2.56 produces a segmentation fault for /embeddings on macOS
+python -m pip install llama-cpp-python[server]!=0.2.56
+
+# start the server with passed params
+python -m llama_cpp.server $@
+
+conda deactivate
scripts/server_llamacpp_windows.bat (new file, 115 lines)
@@ -0,0 +1,115 @@
+@echo off
+
+@rem main code execution
+
+call :print_highlight "Starting inference server for llama-cpp"
+
+cd /D "%~dp0\.."
+echo "Change the current directory to: %cd%"
+
+call :path_sanity_check
+call :deactivate_environment
+
+@rem config
+set ENV_NAME=llama-cpp-python-server
+set PYTHON_VERSION=3.10
+set CONDA_ROOT_PREFIX=%cd%\install_dir\conda
+set INSTALL_ENV_DIR=%cd%\install_dir\server_envs\%ENV_NAME%
+
+echo "Python version: %PYTHON_VERSION%"
+echo "Conda prefix: %CONDA_ROOT_PREFIX%"
+echo "Environment path: %INSTALL_ENV_DIR%"
+
+@rem handle conda environment
+call :check_conda_existence
+call :create_conda_environment
+call :isolate_environment
+call :activate_environment
+
+@rem install dependencies
+@rem ver 0.2.56 produces a segmentation fault for /embeddings on macOS
+call python -m pip install llama-cpp-python[server]!=0.2.56
+
+@rem start the server with passed params
+call python -m llama_cpp.server %*
+call conda deactivate
+
+goto :end
+@rem the end of main code execution
+
+
+@rem below are the functions used in the above execution
+
+
+:print_highlight
+echo.
+echo ******************************************************
+echo %~1
+echo ******************************************************
+echo.
+goto :eof
+
+
+:path_sanity_check
+echo "Path sanity checking"
+echo "%cd%"| findstr /C:" " >nul ^
+&& (call :print_highlight "This script relies on Miniconda which can not be silently installed under a path with spaces." ^
+&& goto :end)
+goto :eof
+
+
+:deactivate_environment
+echo "Deactivate existing environment(s)"
+(call conda deactivate && call conda deactivate && call conda deactivate) 2>nul
+goto :eof
+
+
+:check_conda_existence
+echo "Check for conda existence"
+set conda_exists=F
+
+@rem figure out whether conda exists
+call "%CONDA_ROOT_PREFIX%\_conda.exe" --version >nul 2>&1
+if "%ERRORLEVEL%" EQU "0" set conda_exists=T
+
+@rem verify if conda is installed by the main app, if not then raise error
+if "%conda_exists%" == "F" (
+    call :print_highlight "conda is not installed, seems like the app wasn't installed correctly."
+    goto :end
+)
+goto :eof
+
+
+:create_conda_environment
+@rem create the environment if needed
+if not exist "%INSTALL_ENV_DIR%" (
+    echo "Create conda environment"
+    call "%CONDA_ROOT_PREFIX%\_conda.exe" create ^
+    --no-shortcuts -y -k --prefix "%INSTALL_ENV_DIR%" python="%PYTHON_VERSION%" || ^
+    ( echo. && call :print_highlight "Conda environment creation failed." && goto :end )
+)
+
+@rem check if conda environment was actually created
+if not exist "%INSTALL_ENV_DIR%\python.exe" (
+    call :print_highlight "Conda environment was not correctly created."
+    goto :end
+)
+goto :eof
+
+
+:isolate_environment
+echo "Isolate environment"
+set PYTHONNOUSERSITE=1
+set PYTHONPATH=
+set PYTHONHOME=
+goto :eof
+
+
+:activate_environment
+echo "Activate conda environment"
+call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ^
+( echo. && call :print_highlight "Miniconda hook not found." && goto :end )
+goto :eof
+
+
+:end