Merge branch 'clear-text-before-insert' into simplify-cmdline-arguments

yangdx 2025-02-22 10:07:46 +08:00
commit 2fd997ca4f
17 changed files with 1276 additions and 1162 deletions

View File

@ -3,19 +3,6 @@ FROM python:3.11-slim as builder
WORKDIR /app
# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
curl \
pkg-config \
libssl-dev \
&& curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
&& . "$HOME/.cargo/env" \
&& rustup default stable \
&& rm -rf /var/lib/apt/lists/*
ENV PATH="/root/.cargo/bin:${PATH}"
# Copy only requirements files first to leverage Docker cache
COPY requirements.txt .
COPY lightrag/api/requirements.txt ./lightrag/api/

View File

@ -545,6 +545,20 @@ The `insert_batch_size` parameter in `addon_params` controls how many documents
</details>
<details>
<summary> <b> Insert with ID </b></summary>
If you want to provide your own IDs for your documents, the number of documents and the number of IDs must be the same.
```python
# Insert single text, and provide ID for it
rag.insert("TEXT1", ids=["ID_FOR_TEXT1"])
# Insert multiple texts, and provide IDs for them
rag.insert(["TEXT1", "TEXT2",...], ids=["ID_FOR_TEXT1", "ID_FOR_TEXT2"])
```
</details>
<details>
<summary><b>Incremental Insert</b></summary>

View File

@ -6,16 +6,8 @@ services:
volumes:
- ./data/rag_storage:/app/data/rag_storage
- ./data/inputs:/app/data/inputs
- .env:/app/.env
- ./config.ini:/app/config.ini
- ./.env:/app/.env
env_file:
- .env
environment:
- TZ=UTC
restart: unless-stopped
networks:
- lightrag_net
extra_hosts:
- "host.docker.internal:host-gateway"
networks:
lightrag_net:
driver: bridge

View File

@ -37,20 +37,22 @@ async def main():
llm_model_max_token_size=32768,
enable_llm_cache_for_entity_extract=True,
embedding_func=EmbeddingFunc(
embedding_dim=768,
embedding_dim=1024,
max_token_size=8192,
func=lambda texts: ollama_embedding(
texts, embed_model="nomic-embed-text", host="http://localhost:11434"
texts, embed_model="bge-m3", host="http://localhost:11434"
),
),
kv_storage="PGKVStorage",
doc_status_storage="PGDocStatusStorage",
graph_storage="PGGraphStorage",
vector_storage="PGVectorStorage",
auto_manage_storages_states=False,
)
# Add embedding_func for the graph database; it was removed in commit 5661d76860436f7bf5aef2e50d9ee4a59660146c
rag.chunk_entity_relation_graph.embedding_func = rag.embedding_func
await rag.initialize_storages()
with open(f"{ROOT_DIR}/book.txt", "r", encoding="utf-8") as f:
await rag.ainsert(f.read())
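
With `auto_manage_storages_states=False`, the example has to drive the storage lifecycle itself, which is why `await rag.initialize_storages()` is now called explicitly before inserting. A minimal sketch of that manual lifecycle (the `finalize_storages()` counterpart is assumed here):

```python
# Minimal sketch of the manual storage lifecycle used with
# auto_manage_storages_states=False; finalize_storages() is assumed to exist.
async def run(rag, path: str) -> None:
    await rag.initialize_storages()        # open connections / prepare tables
    try:
        with open(path, "r", encoding="utf-8") as f:
            await rag.ainsert(f.read())
    finally:
        await rag.finalize_storages()      # release connections explicitly

# usage inside main(): await run(rag, f"{ROOT_DIR}/book.txt")
```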

View File

@ -1,5 +1,5 @@
from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam
__version__ = "1.1.11"
__version__ = "1.2.1"
__author__ = "Zirui Guo"
__url__ = "https://github.com/HKUDS/LightRAG"

View File

@ -57,10 +57,9 @@ ALTER USER your_new_role WITH PASSWORD 'your_secure_password';
\q
```
### 3. Install PGVector Extension
Install necessary dependencies and compile the extension:
### 3. Install PGVector and AGE Extensions
Install PGVector:
```bash
sudo apt install postgresql-server-dev-all
cd /tmp
@ -69,6 +68,15 @@ cd pgvector
make
sudo make install
```
Install AGE:
```bash
sudo apt-get install build-essential libpq-dev
cd /tmp
git clone https://github.com/apache/age.git
cd age
make
sudo make install
```
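Compiling the extensions only installs them on the server; they still need to be enabled in the database LightRAG will use (created in the next step). A minimal sketch using asyncpg with placeholder connection details:

```python
# Hypothetical sketch: enable both extensions in the target database.
# Connection parameters are placeholders; run as a superuser role.
import asyncio
import asyncpg

async def enable_extensions() -> None:
    conn = await asyncpg.connect(
        host="localhost", user="postgres", password="password", database="lightrag"
    )
    try:
        await conn.execute("CREATE EXTENSION IF NOT EXISTS vector")  # pgvector
        await conn.execute("CREATE EXTENSION IF NOT EXISTS age")     # Apache AGE
    finally:
        await conn.close()

asyncio.run(enable_extensions())
```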
### 4. Create a Database for LightRAG

View File

@ -117,6 +117,37 @@ class DocumentManager:
".docx",
".pptx",
".xlsx",
".rtf", # Rich Text Format
".odt", # OpenDocument Text
".tex", # LaTeX
".epub", # Electronic Publication
".html", # HyperText Markup Language
".htm", # HyperText Markup Language
".csv", # Comma-Separated Values
".json", # JavaScript Object Notation
".xml", # eXtensible Markup Language
".yaml", # YAML Ain't Markup Language
".yml", # YAML
".log", # Log files
".conf", # Configuration files
".ini", # Initialization files
".properties", # Java properties files
".sql", # SQL scripts
".bat", # Batch files
".sh", # Shell scripts
".c", # C source code
".cpp", # C++ source code
".py", # Python source code
".java", # Java source code
".js", # JavaScript source code
".ts", # TypeScript source code
".swift", # Swift source code
".go", # Go source code
".rb", # Ruby source code
".php", # PHP source code
".css", # Cascading Style Sheets
".scss", # Sassy CSS
".less", # LESS CSS
),
):
self.input_dir = Path(input_dir)
@ -170,7 +201,41 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
# Process based on file type
match ext:
case ".txt" | ".md":
case (
".txt"
| ".md"
| ".html"
| ".htm"
| ".tex"
| ".json"
| ".xml"
| ".yaml"
| ".yml"
| ".rtf"
| ".odt"
| ".epub"
| ".csv"
| ".log"
| ".conf"
| ".ini"
| ".properties"
| ".sql"
| ".bat"
| ".sh"
| ".c"
| ".cpp"
| ".py"
| ".java"
| ".js"
| ".ts"
| ".swift"
| ".go"
| ".rb"
| ".php"
| ".css"
| ".scss"
| ".less"
):
content = file.decode("utf-8")
case ".pdf":
if not pm.is_installed("pypdf2"):

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -5,8 +5,8 @@
<link rel="icon" type="image/svg+xml" href="./vite.svg" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Lightrag</title>
<script type="module" crossorigin src="./assets/index-gr1CNi7P.js"></script>
<link rel="stylesheet" crossorigin href="./assets/index-Cq9iD15S.css">
<script type="module" crossorigin src="./assets/index-BDX8o1Ld.js"></script>
<link rel="stylesheet" crossorigin href="./assets/index-CLsJV-0i.css">
</head>
<body>
<div id="root"></div>

View File

@ -254,6 +254,8 @@ class PGKVStorage(BaseKVStorage):
db: PostgreSQLDB = field(default=None)
def __post_init__(self):
namespace_prefix = self.global_config.get("namespace_prefix")
self.base_namespace = self.namespace.replace(namespace_prefix, "")
self._max_batch_size = self.global_config["embedding_batch_num"]
async def initialize(self):
@ -269,7 +271,7 @@ class PGKVStorage(BaseKVStorage):
async def get_by_id(self, id: str) -> dict[str, Any] | None:
"""Get doc_full data by id."""
sql = SQL_TEMPLATES["get_by_id_" + self.namespace]
sql = SQL_TEMPLATES["get_by_id_" + self.base_namespace]
params = {"workspace": self.db.workspace, "id": id}
if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE):
array_res = await self.db.query(sql, params, multirows=True)
@ -283,7 +285,7 @@ class PGKVStorage(BaseKVStorage):
async def get_by_mode_and_id(self, mode: str, id: str) -> Union[dict, None]:
"""Specifically for llm_response_cache."""
sql = SQL_TEMPLATES["get_by_mode_id_" + self.namespace]
sql = SQL_TEMPLATES["get_by_mode_id_" + self.base_namespace]
params = {"workspace": self.db.workspace, mode: mode, "id": id}
if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE):
array_res = await self.db.query(sql, params, multirows=True)
@ -297,7 +299,7 @@ class PGKVStorage(BaseKVStorage):
# Query by id
async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
"""Get doc_chunks data by id"""
sql = SQL_TEMPLATES["get_by_ids_" + self.namespace].format(
sql = SQL_TEMPLATES["get_by_ids_" + self.base_namespace].format(
ids=",".join([f"'{id}'" for id in ids])
)
params = {"workspace": self.db.workspace}
@ -318,7 +320,7 @@ class PGKVStorage(BaseKVStorage):
async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]:
"""Specifically for llm_response_cache."""
SQL = SQL_TEMPLATES["get_by_status_" + self.namespace]
SQL = SQL_TEMPLATES["get_by_status_" + self.base_namespace]
params = {"workspace": self.db.workspace, "status": status}
return await self.db.query(SQL, params, multirows=True)
@ -391,6 +393,8 @@ class PGVectorStorage(BaseVectorStorage):
def __post_init__(self):
self._max_batch_size = self.global_config["embedding_batch_num"]
namespace_prefix = self.global_config.get("namespace_prefix")
self.base_namespace = self.namespace.replace(namespace_prefix, "")
config = self.global_config.get("vector_db_storage_cls_kwargs", {})
cosine_threshold = config.get("cosine_better_than_threshold")
if cosine_threshold is None:
@ -493,7 +497,9 @@ class PGVectorStorage(BaseVectorStorage):
embedding = embeddings[0]
embedding_string = ",".join(map(str, embedding))
sql = SQL_TEMPLATES[self.namespace].format(embedding_string=embedding_string)
sql = SQL_TEMPLATES[self.base_namespace].format(
embedding_string=embedding_string
)
params = {
"workspace": self.db.workspace,
"better_than_threshold": self.cosine_better_than_threshold,

View File

@ -1,8 +1,8 @@
from __future__ import annotations
import asyncio
import os
import configparser
import os
from dataclasses import asdict, dataclass, field
from datetime import datetime
from functools import partial
@ -41,11 +41,11 @@ from .utils import (
always_get_an_event_loop,
compute_mdhash_id,
convert_response_to_json,
encode_string_by_tiktoken,
lazy_external_import,
limit_async_func_call,
logger,
set_logger,
encode_string_by_tiktoken,
)
from .types import KnowledgeGraph
@ -483,6 +483,7 @@ class LightRAG:
input: str | list[str],
split_by_character: str | None = None,
split_by_character_only: bool = False,
ids: list[str] | None = None,
) -> None:
"""Sync Insert documents with checkpoint support
@ -491,10 +492,11 @@ class LightRAG:
split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
split_by_character_only: if split_by_character_only is True, split the string by character only, when
split_by_character is None, this parameter is ignored.
ids: list of unique document IDs, if not provided, MD5 hash IDs will be generated
"""
loop = always_get_an_event_loop()
loop.run_until_complete(
self.ainsert(input, split_by_character, split_by_character_only)
self.ainsert(input, split_by_character, split_by_character_only, ids)
)
async def ainsert(
@ -502,6 +504,7 @@ class LightRAG:
input: str | list[str],
split_by_character: str | None = None,
split_by_character_only: bool = False,
ids: list[str] | None = None,
) -> None:
"""Async Insert documents with checkpoint support
@ -510,8 +513,9 @@ class LightRAG:
split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
split_by_character_only: if split_by_character_only is True, split the string by character only, when
split_by_character is None, this parameter is ignored.
ids: list of unique document IDs, if not provided, MD5 hash IDs will be generated
"""
await self.apipeline_enqueue_documents(input)
await self.apipeline_enqueue_documents(input, ids)
await self.apipeline_process_enqueue_documents(
split_by_character, split_by_character_only
)
@ -533,7 +537,7 @@ class LightRAG:
doc_key = compute_mdhash_id(full_text, prefix="doc-")
new_docs = {doc_key: {"content": full_text}}
_add_doc_keys = await self.full_docs.filter_keys(set(doc_key))
_add_doc_keys = await self.full_docs.filter_keys({doc_key})
new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
if not len(new_docs):
logger.warning("This document is already in the storage.")
@ -572,24 +576,54 @@ class LightRAG:
if update_storage:
await self._insert_done()
async def apipeline_enqueue_documents(self, input: str | list[str]) -> None:
async def apipeline_enqueue_documents(
self, input: str | list[str], ids: list[str] | None
) -> None:
"""
Pipeline for Processing Documents
1. Remove duplicate contents from the list
2. Generate document IDs and initial status
3. Filter out already processed documents
4. Enqueue document in status
1. Validate ids if provided or generate MD5 hash IDs
2. Remove duplicate contents
3. Generate document initial status
4. Filter out already processed documents
5. Enqueue document in status
"""
if isinstance(input, str):
input = [input]
# Clean input text and remove duplicates
unique_contents = list(set(self.clean_text(doc) for doc in input))
# Clean input text and remove duplicates
input = list(set(self.clean_text(doc) for doc in input))
# 2. Generate document IDs and initial status
# 1. Validate ids if provided or generate MD5 hash IDs
if ids is not None:
# Check if the number of IDs matches the number of documents
if len(ids) != len(input):
raise ValueError("Number of IDs must match the number of documents")
# Check if IDs are unique
if len(ids) != len(set(ids)):
raise ValueError("IDs must be unique")
# Generate contents dict of IDs provided by user and documents
contents = {id_: doc for id_, doc in zip(ids, input)}
else:
# Generate contents dict of MD5 hash IDs and documents
contents = {
compute_mdhash_id(doc, prefix="doc-"): doc
for doc in input
}
# 2. Remove duplicate contents
unique_contents = {
id_: content
for content, id_ in {
content: id_ for id_, content in contents.items()
}.items()
}
# 3. Generate document initial status
new_docs: dict[str, Any] = {
compute_mdhash_id(content, prefix="doc-"): {
id_: {
"content": content,
"content_summary": self._get_content_summary(content),
"content_length": len(content),
@ -597,10 +631,10 @@ class LightRAG:
"created_at": datetime.now().isoformat(),
"updated_at": datetime.now().isoformat(),
}
for content in unique_contents
for id_, content in unique_contents.items()
}
# 3. Filter out already processed documents
# 4. Filter out already processed documents
# Get docs ids
all_new_doc_ids = set(new_docs.keys())
# Exclude IDs of documents that are already in progress
@ -612,7 +646,7 @@ class LightRAG:
logger.info("No new unique documents were found.")
return
# 4. Store status document
# 5. Store status document
await self.doc_status.upsert(new_docs)
logger.info(f"Stored {len(new_docs)} new unique documents")
@ -669,8 +703,6 @@ class LightRAG:
# 4. iterate over batch
for doc_id_processing_status in docs_batch:
doc_id, status_doc = doc_id_processing_status
# Update status in processing
doc_status_id = compute_mdhash_id(status_doc.content, prefix="doc-")
# Generate chunks from document
chunks: dict[str, Any] = {
compute_mdhash_id(dp["content"], prefix="chunk-"): {
@ -690,7 +722,7 @@ class LightRAG:
tasks = [
self.doc_status.upsert(
{
doc_status_id: {
doc_id: {
"status": DocStatus.PROCESSING,
"updated_at": datetime.now().isoformat(),
"content": status_doc.content,
@ -711,7 +743,7 @@ class LightRAG:
await asyncio.gather(*tasks)
await self.doc_status.upsert(
{
doc_status_id: {
doc_id: {
"status": DocStatus.PROCESSED,
"chunks_count": len(chunks),
"content": status_doc.content,
@ -726,7 +758,7 @@ class LightRAG:
logger.error(f"Failed to process document {doc_id}: {str(e)}")
await self.doc_status.upsert(
{
doc_status_id: {
doc_id: {
"status": DocStatus.FAILED,
"error": str(e),
"content": status_doc.content,

View File

@ -7,7 +7,10 @@
"dev": "bunx --bun vite",
"build": "bunx --bun vite build",
"lint": "eslint .",
"preview": "bunx --bun vite preview"
"preview": "bunx --bun vite preview",
"dev-no-bun": "vite",
"build-no-bun": "vite build --emptyOutDir",
"preview-no-bun": "vite preview"
},
"dependencies": {
"@faker-js/faker": "^9.5.0",

View File

@ -80,7 +80,7 @@ export default function UploadDocumentsDialog() {
<FileUploader
maxFileCount={Infinity}
maxSize={200 * 1024 * 1024}
description="supported types: TXT, MD, DOC, PDF, PPTX"
description="supported types: TXT, MD, DOCX, PDF, PPTX, RTF, ODT, EPUB, HTML, HTM, TEX, JSON, XML, YAML, YML, CSV, LOG, CONF, INI, PROPERTIES, SQL, BAT, SH, C, CPP, PY, JAVA, JS, TS, SWIFT, GO, RB, PHP, CSS, SCSS, LESS"
onUpload={handleDocumentsUpload}
progresses={progresses}
disabled={isUploading}

View File

@ -247,7 +247,7 @@ function FileUploader(props: FileUploaderProps) {
? ` ${maxFileCount === Infinity ? 'multiple' : maxFileCount}
files (up to ${formatBytes(maxSize)} each)`
: ` a file with ${formatBytes(maxSize)}`}
Supported formats: TXT, MD, DOC, PDF, PPTX
Supported formats: TXT, MD, DOCX, PDF, PPTX, RTF, ODT, EPUB, HTML, HTM, TEX, JSON, XML, YAML, YML, CSV, LOG, CONF, INI, PROPERTIES, SQL, BAT, SH, C, CPP, PY, JAVA, JS, TS, SWIFT, GO, RB, PHP, CSS, SCSS, LESS
</p>
)}
</div>

View File

@ -26,7 +26,12 @@ export const defaultQueryLabel = '*'
// reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/MIME_types/Common_types
export const supportedFileTypes = {
'text/plain': ['.txt', '.md'],
'text/plain': [
'.txt', '.md', '.html', '.htm', '.tex', '.json', '.xml', '.yaml', '.yml',
'.rtf', '.odt', '.epub', '.csv', '.log', '.conf', '.ini', '.properties',
'.sql', '.bat', '.sh', '.c', '.cpp', '.py', '.java', '.js', '.ts',
'.swift', '.go', '.rb', '.php', '.css', '.scss', '.less'
],
'application/pdf': ['.pdf'],
'application/msword': ['.doc'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],