Merge branch 'clear-text-before-insert' into simplify-cmdline-arguments

yangdx 2025-02-22 10:07:46 +08:00
commit 2fd997ca4f
17 changed files with 1276 additions and 1162 deletions

View File

@ -3,19 +3,6 @@ FROM python:3.11-slim as builder
WORKDIR /app
# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
curl \
pkg-config \
libssl-dev \
&& curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
&& . "$HOME/.cargo/env" \
&& rustup default stable \
&& rm -rf /var/lib/apt/lists/*
ENV PATH="/root/.cargo/bin:${PATH}"
# Copy only requirements files first to leverage Docker cache
COPY requirements.txt .
COPY lightrag/api/requirements.txt ./lightrag/api/

View File

@ -545,6 +545,20 @@ The `insert_batch_size` parameter in `addon_params` controls how many documents
</details>
<details>
<summary> <b> Insert with ID </b></summary>
If you want to provide your own IDs for your documents, the number of documents and the number of IDs must be the same.
```python
# Insert single text, and provide ID for it
rag.insert("TEXT1", ids=["ID_FOR_TEXT1"])
# Insert multiple texts, and provide IDs for them
rag.insert(["TEXT1", "TEXT2",...], ids=["ID_FOR_TEXT1", "ID_FOR_TEXT2"])
```
</details>
<details>
<summary><b>Incremental Insert</b></summary>

View File

@ -6,16 +6,8 @@ services:
volumes:
- ./data/rag_storage:/app/data/rag_storage
- ./data/inputs:/app/data/inputs
- .env:/app/.env
- ./config.ini:/app/config.ini
- ./.env:/app/.env
env_file:
- .env
environment:
- TZ=UTC
restart: unless-stopped
networks:
- lightrag_net
extra_hosts:
- "host.docker.internal:host-gateway"
networks:
lightrag_net:
driver: bridge

View File

@ -37,20 +37,22 @@ async def main():
llm_model_max_token_size=32768,
enable_llm_cache_for_entity_extract=True,
embedding_func=EmbeddingFunc(
embedding_dim=768,
embedding_dim=1024,
max_token_size=8192,
func=lambda texts: ollama_embedding(
texts, embed_model="nomic-embed-text", host="http://localhost:11434"
texts, embed_model="bge-m3", host="http://localhost:11434"
),
),
kv_storage="PGKVStorage",
doc_status_storage="PGDocStatusStorage",
graph_storage="PGGraphStorage",
vector_storage="PGVectorStorage",
auto_manage_storages_states=False,
)
# Add embedding_func for the graph database; it was removed in commit 5661d76860436f7bf5aef2e50d9ee4a59660146c
rag.chunk_entity_relation_graph.embedding_func = rag.embedding_func
await rag.initialize_storages()
with open(f"{ROOT_DIR}/book.txt", "r", encoding="utf-8") as f:
await rag.ainsert(f.read())
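
With `auto_manage_storages_states=False`, the example has to drive the storage lifecycle itself, which is why `await rag.initialize_storages()` is now called explicitly before inserting. A minimal sketch of that manual lifecycle (the `finalize_storages()` counterpart is assumed here):

```python
# Minimal sketch of the manual storage lifecycle used with
# auto_manage_storages_states=False; finalize_storages() is assumed to exist.
async def run(rag, path: str) -> None:
    await rag.initialize_storages()        # open connections / prepare tables
    try:
        with open(path, "r", encoding="utf-8") as f:
            await rag.ainsert(f.read())
    finally:
        await rag.finalize_storages()      # release connections explicitly

# usage inside main(): await run(rag, f"{ROOT_DIR}/book.txt")
```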

View File

@ -1,5 +1,5 @@
from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam
__version__ = "1.1.11"
__version__ = "1.2.1"
__author__ = "Zirui Guo"
__url__ = "https://github.com/HKUDS/LightRAG"

View File

@ -57,10 +57,9 @@ ALTER USER your_new_role WITH PASSWORD 'your_secure_password';
\q
```
### 3. Install PGVector Extension
Install necessary dependencies and compile the extension:
### 3. Install PGVector and AGE Extensions
Install PGVector:
```bash
sudo apt install postgresql-server-dev-all
cd /tmp
@ -69,6 +68,15 @@ cd pgvector
make
sudo make install
```
Install AGE:
```bash
sudo apt-get install build-essential libpq-dev
cd /tmp
git clone https://github.com/apache/age.git
cd age
make
sudo make install
```
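Compiling the extensions only installs them on the server; they still need to be enabled in the database LightRAG will use (created in the next step). A minimal sketch using asyncpg with placeholder connection details:

```python
# Hypothetical sketch: enable both extensions in the target database.
# Connection parameters are placeholders; run as a superuser role.
import asyncio
import asyncpg

async def enable_extensions() -> None:
    conn = await asyncpg.connect(
        host="localhost", user="postgres", password="password", database="lightrag"
    )
    try:
        await conn.execute("CREATE EXTENSION IF NOT EXISTS vector")  # pgvector
        await conn.execute("CREATE EXTENSION IF NOT EXISTS age")     # Apache AGE
    finally:
        await conn.close()

asyncio.run(enable_extensions())
```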
### 4. Create a Database for LightRAG

View File

@ -117,6 +117,37 @@ class DocumentManager:
".docx",
".pptx",
".xlsx",
".rtf", # Rich Text Format
".odt", # OpenDocument Text
".tex", # LaTeX
".epub", # Electronic Publication
".html", # HyperText Markup Language
".htm", # HyperText Markup Language
".csv", # Comma-Separated Values
".json", # JavaScript Object Notation
".xml", # eXtensible Markup Language
".yaml", # YAML Ain't Markup Language
".yml", # YAML
".log", # Log files
".conf", # Configuration files
".ini", # Initialization files
".properties", # Java properties files
".sql", # SQL scripts
".bat", # Batch files
".sh", # Shell scripts
".c", # C source code
".cpp", # C++ source code
".py", # Python source code
".java", # Java source code
".js", # JavaScript source code
".ts", # TypeScript source code
".swift", # Swift source code
".go", # Go source code
".rb", # Ruby source code
".php", # PHP source code
".css", # Cascading Style Sheets
".scss", # Sassy CSS
".less", # LESS CSS
),
):
self.input_dir = Path(input_dir)
@ -170,7 +201,41 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
# Process based on file type
match ext:
case ".txt" | ".md":
case (
".txt"
| ".md"
| ".html"
| ".htm"
| ".tex"
| ".json"
| ".xml"
| ".yaml"
| ".yml"
| ".rtf"
| ".odt"
| ".epub"
| ".csv"
| ".log"
| ".conf"
| ".ini"
| ".properties"
| ".sql"
| ".bat"
| ".sh"
| ".c"
| ".cpp"
| ".py"
| ".java"
| ".js"
| ".ts"
| ".swift"
| ".go"
| ".rb"
| ".php"
| ".css"
| ".scss"
| ".less"
):
content = file.decode("utf-8")
case ".pdf":
if not pm.is_installed("pypdf2"):

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -5,8 +5,8 @@
<link rel="icon" type="image/svg+xml" href="./vite.svg" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Lightrag</title>
<script type="module" crossorigin src="./assets/index-gr1CNi7P.js"></script>
<link rel="stylesheet" crossorigin href="./assets/index-Cq9iD15S.css">
<script type="module" crossorigin src="./assets/index-BDX8o1Ld.js"></script>
<link rel="stylesheet" crossorigin href="./assets/index-CLsJV-0i.css">
</head>
<body>
<div id="root"></div>

View File

@ -254,6 +254,8 @@ class PGKVStorage(BaseKVStorage):
db: PostgreSQLDB = field(default=None)
def __post_init__(self):
namespace_prefix = self.global_config.get("namespace_prefix")
self.base_namespace = self.namespace.replace(namespace_prefix, "")
self._max_batch_size = self.global_config["embedding_batch_num"]
async def initialize(self):
@ -269,7 +271,7 @@ class PGKVStorage(BaseKVStorage):
async def get_by_id(self, id: str) -> dict[str, Any] | None:
"""Get doc_full data by id."""
sql = SQL_TEMPLATES["get_by_id_" + self.namespace]
sql = SQL_TEMPLATES["get_by_id_" + self.base_namespace]
params = {"workspace": self.db.workspace, "id": id}
if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE):
array_res = await self.db.query(sql, params, multirows=True)
@ -283,7 +285,7 @@ class PGKVStorage(BaseKVStorage):
async def get_by_mode_and_id(self, mode: str, id: str) -> Union[dict, None]:
"""Specifically for llm_response_cache."""
sql = SQL_TEMPLATES["get_by_mode_id_" + self.namespace]
sql = SQL_TEMPLATES["get_by_mode_id_" + self.base_namespace]
params = {"workspace": self.db.workspace, mode: mode, "id": id}
if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE):
array_res = await self.db.query(sql, params, multirows=True)
@ -297,7 +299,7 @@ class PGKVStorage(BaseKVStorage):
# Query by id
async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
"""Get doc_chunks data by id"""
sql = SQL_TEMPLATES["get_by_ids_" + self.namespace].format(
sql = SQL_TEMPLATES["get_by_ids_" + self.base_namespace].format(
ids=",".join([f"'{id}'" for id in ids])
)
params = {"workspace": self.db.workspace}
@ -318,7 +320,7 @@ class PGKVStorage(BaseKVStorage):
async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]:
"""Specifically for llm_response_cache."""
SQL = SQL_TEMPLATES["get_by_status_" + self.namespace]
SQL = SQL_TEMPLATES["get_by_status_" + self.base_namespace]
params = {"workspace": self.db.workspace, "status": status}
return await self.db.query(SQL, params, multirows=True)
@ -391,6 +393,8 @@ class PGVectorStorage(BaseVectorStorage):
def __post_init__(self):
self._max_batch_size = self.global_config["embedding_batch_num"]
namespace_prefix = self.global_config.get("namespace_prefix")
self.base_namespace = self.namespace.replace(namespace_prefix, "")
config = self.global_config.get("vector_db_storage_cls_kwargs", {})
cosine_threshold = config.get("cosine_better_than_threshold")
if cosine_threshold is None:
@ -493,7 +497,9 @@ class PGVectorStorage(BaseVectorStorage):
embedding = embeddings[0]
embedding_string = ",".join(map(str, embedding))
sql = SQL_TEMPLATES[self.namespace].format(embedding_string=embedding_string)
sql = SQL_TEMPLATES[self.base_namespace].format(
embedding_string=embedding_string
)
params = {
"workspace": self.db.workspace,
"better_than_threshold": self.cosine_better_than_threshold,

View File

@ -1,8 +1,8 @@
from __future__ import annotations
import asyncio
import os
import configparser
import os
from dataclasses import asdict, dataclass, field
from datetime import datetime
from functools import partial
@ -41,11 +41,11 @@ from .utils import (
always_get_an_event_loop,
compute_mdhash_id,
convert_response_to_json,
encode_string_by_tiktoken,
lazy_external_import,
limit_async_func_call,
logger,
set_logger,
encode_string_by_tiktoken,
)
from .types import KnowledgeGraph
@ -483,6 +483,7 @@ class LightRAG:
input: str | list[str],
split_by_character: str | None = None,
split_by_character_only: bool = False,
ids: list[str] | None = None,
) -> None:
"""Sync Insert documents with checkpoint support
@ -491,10 +492,11 @@ class LightRAG:
split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
split_by_character_only: if split_by_character_only is True, split the string by character only, when
split_by_character is None, this parameter is ignored.
ids: list of unique document IDs, if not provided, MD5 hash IDs will be generated
"""
loop = always_get_an_event_loop()
loop.run_until_complete(
self.ainsert(input, split_by_character, split_by_character_only)
self.ainsert(input, split_by_character, split_by_character_only, ids)
)
async def ainsert(
@ -502,6 +504,7 @@ class LightRAG:
input: str | list[str],
split_by_character: str | None = None,
split_by_character_only: bool = False,
ids: list[str] | None = None,
) -> None:
"""Async Insert documents with checkpoint support
@ -510,8 +513,9 @@ class LightRAG:
split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
split_by_character_only: if split_by_character_only is True, split the string by character only, when
split_by_character is None, this parameter is ignored.
ids: list of unique document IDs, if not provided, MD5 hash IDs will be generated
"""
await self.apipeline_enqueue_documents(input)
await self.apipeline_enqueue_documents(input, ids)
await self.apipeline_process_enqueue_documents(
split_by_character, split_by_character_only
)
@ -533,7 +537,7 @@ class LightRAG:
doc_key = compute_mdhash_id(full_text, prefix="doc-")
new_docs = {doc_key: {"content": full_text}}
_add_doc_keys = await self.full_docs.filter_keys(set(doc_key))
_add_doc_keys = await self.full_docs.filter_keys({doc_key})
new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
if not len(new_docs):
logger.warning("This document is already in the storage.")
@ -572,24 +576,54 @@ class LightRAG:
if update_storage:
await self._insert_done()
async def apipeline_enqueue_documents(self, input: str | list[str]) -> None:
async def apipeline_enqueue_documents(
self, input: str | list[str], ids: list[str] | None
) -> None:
"""
Pipeline for Processing Documents
1. Remove duplicate contents from the list
2. Generate document IDs and initial status
3. Filter out already processed documents
4. Enqueue document in status
1. Validate ids if provided or generate MD5 hash IDs
2. Remove duplicate contents
3. Generate document initial status
4. Filter out already processed documents
5. Enqueue document in status
"""
if isinstance(input, str):
input = [input]
# Clean input text and remove duplicates
unique_contents = list(set(self.clean_text(doc) for doc in input))
# Clean input text and remove duplicates
input = list(set(self.clean_text(doc) for doc in input))
# 2. Generate document IDs and initial status
# 1. Validate ids if provided or generate MD5 hash IDs
if ids is not None:
# Check if the number of IDs matches the number of documents
if len(ids) != len(input):
raise ValueError("Number of IDs must match the number of documents")
# Check if IDs are unique
if len(ids) != len(set(ids)):
raise ValueError("IDs must be unique")
# Generate contents dict of IDs provided by user and documents
contents = {id_: doc for id_, doc in zip(ids, input)}
else:
# Generate contents dict of MD5 hash IDs and documents
contents = {
compute_mdhash_id(doc, prefix="doc-"): doc
for doc in input
}
# 2. Remove duplicate contents
unique_contents = {
id_: content
for content, id_ in {
content: id_ for id_, content in contents.items()
}.items()
}
# 3. Generate document initial status
new_docs: dict[str, Any] = {
compute_mdhash_id(content, prefix="doc-"): {
id_: {
"content": content,
"content_summary": self._get_content_summary(content),
"content_length": len(content),
@ -597,10 +631,10 @@ class LightRAG:
"created_at": datetime.now().isoformat(),
"updated_at": datetime.now().isoformat(),
}
for content in unique_contents
for id_, content in unique_contents.items()
}
# 3. Filter out already processed documents
# 4. Filter out already processed documents
# Get docs ids
all_new_doc_ids = set(new_docs.keys())
# Exclude IDs of documents that are already in progress
@ -612,7 +646,7 @@ class LightRAG:
logger.info("No new unique documents were found.")
return
# 4. Store status document
# 5. Store status document
await self.doc_status.upsert(new_docs)
logger.info(f"Stored {len(new_docs)} new unique documents")
@ -669,8 +703,6 @@ class LightRAG:
# 4. iterate over batch
for doc_id_processing_status in docs_batch:
doc_id, status_doc = doc_id_processing_status
# Update status in processing
doc_status_id = compute_mdhash_id(status_doc.content, prefix="doc-")
# Generate chunks from document
chunks: dict[str, Any] = {
compute_mdhash_id(dp["content"], prefix="chunk-"): {
@ -690,7 +722,7 @@ class LightRAG:
tasks = [
self.doc_status.upsert(
{
doc_status_id: {
doc_id: {
"status": DocStatus.PROCESSING,
"updated_at": datetime.now().isoformat(),
"content": status_doc.content,
@ -711,7 +743,7 @@ class LightRAG:
await asyncio.gather(*tasks)
await self.doc_status.upsert(
{
doc_status_id: {
doc_id: {
"status": DocStatus.PROCESSED,
"chunks_count": len(chunks),
"content": status_doc.content,
@ -726,7 +758,7 @@ class LightRAG:
logger.error(f"Failed to process document {doc_id}: {str(e)}")
await self.doc_status.upsert(
{
doc_status_id: {
doc_id: {
"status": DocStatus.FAILED,
"error": str(e),
"content": status_doc.content,

View File

@ -7,7 +7,10 @@
"dev": "bunx --bun vite",
"build": "bunx --bun vite build",
"lint": "eslint .",
"preview": "bunx --bun vite preview"
"preview": "bunx --bun vite preview",
"dev-no-bun": "vite",
"build-no-bun": "vite build --emptyOutDir",
"preview-no-bun": "vite preview"
},
"dependencies": {
"@faker-js/faker": "^9.5.0",

View File

@ -80,7 +80,7 @@ export default function UploadDocumentsDialog() {
<FileUploader
maxFileCount={Infinity}
maxSize={200 * 1024 * 1024}
description="supported types: TXT, MD, DOC, PDF, PPTX"
description="supported types: TXT, MD, DOCX, PDF, PPTX, RTF, ODT, EPUB, HTML, HTM, TEX, JSON, XML, YAML, YML, CSV, LOG, CONF, INI, PROPERTIES, SQL, BAT, SH, C, CPP, PY, JAVA, JS, TS, SWIFT, GO, RB, PHP, CSS, SCSS, LESS"
onUpload={handleDocumentsUpload}
progresses={progresses}
disabled={isUploading}

View File

@ -247,7 +247,7 @@ function FileUploader(props: FileUploaderProps) {
? ` ${maxFileCount === Infinity ? 'multiple' : maxFileCount}
files (up to ${formatBytes(maxSize)} each)`
: ` a file with ${formatBytes(maxSize)}`}
Supported formats: TXT, MD, DOC, PDF, PPTX
Supported formats: TXT, MD, DOCX, PDF, PPTX, RTF, ODT, EPUB, HTML, HTM, TEX, JSON, XML, YAML, YML, CSV, LOG, CONF, INI, PROPERTIES, SQL, BAT, SH, C, CPP, PY, JAVA, JS, TS, SWIFT, GO, RB, PHP, CSS, SCSS, LESS
</p>
)}
</div>

View File

@ -26,7 +26,12 @@ export const defaultQueryLabel = '*'
// reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/MIME_types/Common_types
export const supportedFileTypes = {
'text/plain': ['.txt', '.md'],
'text/plain': [
'.txt', '.md', '.html', '.htm', '.tex', '.json', '.xml', '.yaml', '.yml',
'.rtf', '.odt', '.epub', '.csv', '.log', '.conf', '.ini', '.properties',
'.sql', '.bat', '.sh', '.c', '.cpp', '.py', '.java', '.js', '.ts',
'.swift', '.go', '.rb', '.php', '.css', '.scss', '.less'
],
'application/pdf': ['.pdf'],
'application/msword': ['.doc'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],