LightRAG/lightrag/llm/ollama.py

"""
Ollama LLM Interface Module
==========================

This module provides interfaces for interacting with Ollama's language models,
including text generation and embedding capabilities.

Author: Lightrag team
Created: 2024-01-24
License: MIT License

Copyright (c) 2024 Lightrag

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

Version: 1.0.0

Change Log:
- 1.0.0 (2024-01-24): Initial release
    * Added async chat completion support
    * Added embedding generation
    * Added stream response capability

Dependencies:
    - ollama
    - numpy
    - pipmaster
    - Python >= 3.10

Usage:
    from llm_interfaces.ollama_interface import ollama_model_complete, ollama_embed
"""

__version__ = "1.0.0"
__author__ = "lightrag Team"
__status__ = "Production"

import sys

if sys.version_info < (3, 9):
    from typing import AsyncIterator
else:
    from collections.abc import AsyncIterator
import pipmaster as pm  # Pipmaster for dynamic library install

# install specific modules
if not pm.is_installed("ollama"):
    pm.install("ollama")
if not pm.is_installed("tenacity"):
    pm.install("tenacity")

import ollama
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)
from lightrag.exceptions import (
    APIConnectionError,
    RateLimitError,
    APITimeoutError,
)
from lightrag.api import __api_version__
from lightrag.utils import extract_reasoning
import numpy as np
from typing import Union


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type(
        (RateLimitError, APIConnectionError, APITimeoutError)
    ),
)
async def ollama_model_if_cache(
    model,
    prompt,
    system_prompt=None,
    history_messages=[],
    **kwargs,
) -> Union[str, AsyncIterator[str]]:
    stream = True if kwargs.get("stream") else False
    reasoning_tag = kwargs.pop("reasoning_tag", None)
    kwargs.pop("max_tokens", None)
    # kwargs.pop("response_format", None) # allow json
    host = kwargs.pop("host", None)
    timeout = kwargs.pop("timeout", None)
    kwargs.pop("hashing_kv", None)
    api_key = kwargs.pop("api_key", None)
    headers = {
        "Content-Type": "application/json",
        "User-Agent": f"LightRAG/{__api_version__}",
    }
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    ollama_client = ollama.AsyncClient(host=host, timeout=timeout, headers=headers)
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.extend(history_messages)
    messages.append({"role": "user", "content": prompt})

    response = await ollama_client.chat(model=model, messages=messages, **kwargs)
    if stream:
        """cannot cache stream response and process reasoning"""

        async def inner():
            async for chunk in response:
                yield chunk["message"]["content"]

        return inner()
    else:
        model_response = response["message"]["content"]

        """
        If the model also wraps its thoughts in a specific tag,
        this information is not needed for the final
        response and can simply be trimmed.
        """

        return (
            model_response
            if reasoning_tag is None
            else extract_reasoning(model_response, reasoning_tag).response_content
        )


async def ollama_model_complete(
    prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
) -> Union[str, AsyncIterator[str]]:
    keyword_extraction = kwargs.pop("keyword_extraction", None)
    if keyword_extraction:
        kwargs["format"] = "json"
    model_name = kwargs["hashing_kv"].global_config["llm_model_name"]
    return await ollama_model_if_cache(
        model_name,
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        **kwargs,
    )


async def ollama_embedding(texts: list[str], embed_model, **kwargs) -> np.ndarray:
    """
    Deprecated in favor of `embed`.
    """
    embed_text = []
    ollama_client = ollama.Client(**kwargs)
    for text in texts:
        data = ollama_client.embeddings(model=embed_model, prompt=text)
        embed_text.append(data["embedding"])

    return embed_text


async def ollama_embed(texts: list[str], embed_model, **kwargs) -> np.ndarray:
    api_key = kwargs.pop("api_key", None)
    headers = {
        "Content-Type": "application/json",
        "User-Agent": f"LightRAG/{__api_version__}",
    }
    if api_key:
        headers["Authorization"] = api_key
    kwargs["headers"] = headers
    ollama_client = ollama.Client(**kwargs)
    data = ollama_client.embed(model=embed_model, input=texts)
    return data["embeddings"]
Separated llms from the main llm.py file and fixed some deprication bugs 2025-01-25 00:11:00 +01:00			`"""`
			`Ollama LLM Interface Module`
			`==========================`

			`This module provides interfaces for interacting with Ollama's language models,`
			`including text generation and embedding capabilities.`

			`Author: Lightrag team`
			`Created: 2024-01-24`
			`License: MIT License`

			`Copyright (c) 2024 Lightrag`

			`Permission is hereby granted, free of charge, to any person obtaining a copy`
			`of this software and associated documentation files (the "Software"), to deal`
			`in the Software without restriction, including without limitation the rights`
			`to use, copy, modify, merge, publish, distribute, sublicense, and/or sell`
			`copies of the Software, and to permit persons to whom the Software is`
			`furnished to do so, subject to the following conditions:`

			`Version: 1.0.0`

			`Change Log:`
			`- 1.0.0 (2024-01-24): Initial release`
			`* Added async chat completion support`
			`* Added embedding generation`
			`* Added stream response capability`

			`Dependencies:`
			`- ollama`
			`- numpy`
			`- pipmaster`
			`- Python >= 3.10`

			`Usage:`
			`from llm_interfaces.ollama_interface import ollama_model_complete, ollama_embed`
			`"""`

			`__version__ = "1.0.0"`
			`__author__ = "lightrag Team"`
			`__status__ = "Production"`

			`import sys`
Fixed missing imports bug and fixed linting 2025-01-25 00:55:07 +01:00
Separated llms from the main llm.py file and fixed some deprication bugs 2025-01-25 00:11:00 +01:00			`if sys.version_info < (3, 9):`
			`from typing import AsyncIterator`
			`else:`
			`from collections.abc import AsyncIterator`
Fixed missing imports bug and fixed linting 2025-01-25 00:55:07 +01:00			`import pipmaster as pm # Pipmaster for dynamic library install`
Separated llms from the main llm.py file and fixed some deprication bugs 2025-01-25 00:11:00 +01:00
			`# install specific modules`
			`if not pm.is_installed("ollama"):`
			`pm.install("ollama")`
			`if not pm.is_installed("tenacity"):`
			`pm.install("tenacity")`

			`import ollama`
			`from tenacity import (`
			`retry,`
			`stop_after_attempt,`
			`wait_exponential,`
			`retry_if_exception_type,`
			`)`
			`from lightrag.exceptions import (`
			`APIConnectionError,`
			`RateLimitError,`
			`APITimeoutError,`
			`)`
Add LightRAG version to User-Agent header for better request tracking • Add User-Agent header with version info • Update header creation in Ollama client • Update header creation in OpenAI client • Ensure consistent header format • Include Mozilla UA string for OpenAI 2025-02-06 22:55:22 +08:00			`from lightrag.api import __api_version__`
feat: trimming the model’s reasoning 2025-02-06 22:56:17 +03:00			`from lightrag.utils import extract_reasoning`
Separated llms from the main llm.py file and fixed some deprication bugs 2025-01-25 00:11:00 +01:00			`import numpy as np`
			`from typing import Union`


			`@retry(`
			`stop=stop_after_attempt(3),`
			`wait=wait_exponential(multiplier=1, min=4, max=10),`
			`retry=retry_if_exception_type(`
			`(RateLimitError, APIConnectionError, APITimeoutError)`
			`),`
			`)`
			`async def ollama_model_if_cache(`
			`model,`
			`prompt,`
			`system_prompt=None,`
			`history_messages=[],`
			`**kwargs,`
			`) -> Union[str, AsyncIterator[str]]:`
			`stream = True if kwargs.get("stream") else False`
feat: trimming the model’s reasoning 2025-02-06 22:56:17 +03:00			`reasoning_tag = kwargs.pop("reasoning_tag", None)`
Separated llms from the main llm.py file and fixed some deprication bugs 2025-01-25 00:11:00 +01:00			`kwargs.pop("max_tokens", None)`
			`# kwargs.pop("response_format", None) # allow json`
			`host = kwargs.pop("host", None)`
			`timeout = kwargs.pop("timeout", None)`
			`kwargs.pop("hashing_kv", None)`
			`api_key = kwargs.pop("api_key", None)`
Add LightRAG version to User-Agent header for better request tracking • Add User-Agent header with version info • Update header creation in Ollama client • Update header creation in OpenAI client • Ensure consistent header format • Include Mozilla UA string for OpenAI 2025-02-06 22:55:22 +08:00			`headers = {`
			`"Content-Type": "application/json",`
			`"User-Agent": f"LightRAG/{__api_version__}",`
			`}`
			`if api_key:`
			`headers["Authorization"] = f"Bearer {api_key}"`
Separated llms from the main llm.py file and fixed some deprication bugs 2025-01-25 00:11:00 +01:00			`ollama_client = ollama.AsyncClient(host=host, timeout=timeout, headers=headers)`
			`messages = []`
			`if system_prompt:`
			`messages.append({"role": "system", "content": system_prompt})`
			`messages.extend(history_messages)`
			`messages.append({"role": "user", "content": prompt})`

			`response = await ollama_client.chat(model=model, messages=messages, **kwargs)`
			`if stream:`
feat: trimming the model’s reasoning 2025-02-06 22:56:17 +03:00			`"""cannot cache stream response and process reasoning"""`
Separated llms from the main llm.py file and fixed some deprication bugs 2025-01-25 00:11:00 +01:00
			`async def inner():`
			`async for chunk in response:`
			`yield chunk["message"]["content"]`

			`return inner()`
			`else:`
feat: trimming the model’s reasoning 2025-02-06 22:56:17 +03:00			`model_response = response["message"]["content"]`

			`"""`
			`If the model also wraps its thoughts in a specific tag,`
			`this information is not needed for the final`
			`response and can simply be trimmed.`
			`"""`

			`return (`
			`model_response`
			`if reasoning_tag is None`
			`else extract_reasoning(model_response, reasoning_tag).response_content`
			`)`
Separated llms from the main llm.py file and fixed some deprication bugs 2025-01-25 00:11:00 +01:00
Fixed missing imports bug and fixed linting 2025-01-25 00:55:07 +01:00
Separated llms from the main llm.py file and fixed some deprication bugs 2025-01-25 00:11:00 +01:00			`async def ollama_model_complete(`
			`prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs`
			`) -> Union[str, AsyncIterator[str]]:`
			`keyword_extraction = kwargs.pop("keyword_extraction", None)`
			`if keyword_extraction:`
			`kwargs["format"] = "json"`
			`model_name = kwargs["hashing_kv"].global_config["llm_model_name"]`
			`return await ollama_model_if_cache(`
			`model_name,`
			`prompt,`
			`system_prompt=system_prompt,`
			`history_messages=history_messages,`
			`**kwargs,`
			`)`

Fixed missing imports bug and fixed linting 2025-01-25 00:55:07 +01:00
Separated llms from the main llm.py file and fixed some deprication bugs 2025-01-25 00:11:00 +01:00			`async def ollama_embedding(texts: list[str], embed_model, **kwargs) -> np.ndarray:`
			`"""`
			Deprecated in favor of `embed`.
			`"""`
			`embed_text = []`
			`ollama_client = ollama.Client(**kwargs)`
			`for text in texts:`
			`data = ollama_client.embeddings(model=embed_model, prompt=text)`
			`embed_text.append(data["embedding"])`

			`return embed_text`


			`async def ollama_embed(texts: list[str], embed_model, **kwargs) -> np.ndarray:`
			`api_key = kwargs.pop("api_key", None)`
Add LightRAG version to User-Agent header for better request tracking • Add User-Agent header with version info • Update header creation in Ollama client • Update header creation in OpenAI client • Ensure consistent header format • Include Mozilla UA string for OpenAI 2025-02-06 22:55:22 +08:00			`headers = {`
			`"Content-Type": "application/json",`
			`"User-Agent": f"LightRAG/{__api_version__}",`
			`}`
			`if api_key:`
			`headers["Authorization"] = api_key`
Separated llms from the main llm.py file and fixed some deprication bugs 2025-01-25 00:11:00 +01:00			`kwargs["headers"] = headers`
			`ollama_client = ollama.Client(**kwargs)`
			`data = ollama_client.embed(model=embed_model, input=texts)`
Fixed missing imports bug and fixed linting 2025-01-25 00:55:07 +01:00			`return data["embeddings"]`