Merge branch 'clear-text-before-insert' into simplify-cmdline-arguments

yangdx 2025-02-22 10:07:46 +08:00
commit 2fd997ca4f
17 changed files with 1276 additions and 1162 deletions


@@ -3,19 +3,6 @@ FROM python:3.11-slim as builder
 WORKDIR /app
-# Install build dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential \
-    curl \
-    pkg-config \
-    libssl-dev \
-    && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
-    && . "$HOME/.cargo/env" \
-    && rustup default stable \
-    && rm -rf /var/lib/apt/lists/*
-ENV PATH="/root/.cargo/bin:${PATH}"
 # Copy only requirements files first to leverage Docker cache
 COPY requirements.txt .
 COPY lightrag/api/requirements.txt ./lightrag/api/


@@ -545,6 +545,20 @@ The `insert_batch_size` parameter in `addon_params` controls how many documents
 </details>
+
+<details>
+  <summary> <b> Insert with ID </b></summary>
+
+If you want to provide your own IDs for your documents, number of documents and number of IDs must be the same.
+
+```python
+# Insert single text, and provide ID for it
+rag.insert("TEXT1", ids=["ID_FOR_TEXT1"])
+
+# Insert multiple texts, and provide IDs for them
+rag.insert(["TEXT1", "TEXT2",...], ids=["ID_FOR_TEXT1", "ID_FOR_TEXT2"])
+```
+
+</details>

 <details>
   <summary><b>Incremental Insert</b></summary>
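The README section added above documents the `ids` parameter that this commit introduces on `insert`/`ainsert`. A minimal sketch of how the validation behaves, assuming `rag` is an already-initialized LightRAG instance and the texts and IDs are placeholders:

```python
# Sketch only: assumes `rag` is an initialized LightRAG instance.
docs = ["TEXT1", "TEXT2"]

# Matching counts: each document is stored under the caller-supplied ID.
rag.insert(docs, ids=["ID_FOR_TEXT1", "ID_FOR_TEXT2"])

# Mismatched counts: the enqueue pipeline raises
# ValueError("Number of IDs must match the number of documents").
try:
    rag.insert(docs, ids=["ONLY_ONE_ID"])
except ValueError as e:
    print(e)
```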


@@ -6,16 +6,8 @@ services:
     volumes:
       - ./data/rag_storage:/app/data/rag_storage
       - ./data/inputs:/app/data/inputs
-      - .env:/app/.env
+      - ./config.ini:/app/config.ini
+      - ./.env:/app/.env
     env_file:
       - .env
-    environment:
-      - TZ=UTC
     restart: unless-stopped
-    networks:
-      - lightrag_net
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
-networks:
-  lightrag_net:
-    driver: bridge


@@ -37,20 +37,22 @@ async def main():
         llm_model_max_token_size=32768,
         enable_llm_cache_for_entity_extract=True,
         embedding_func=EmbeddingFunc(
-            embedding_dim=768,
+            embedding_dim=1024,
             max_token_size=8192,
             func=lambda texts: ollama_embedding(
-                texts, embed_model="nomic-embed-text", host="http://localhost:11434"
+                texts, embed_model="bge-m3", host="http://localhost:11434"
             ),
         ),
         kv_storage="PGKVStorage",
         doc_status_storage="PGDocStatusStorage",
         graph_storage="PGGraphStorage",
         vector_storage="PGVectorStorage",
+        auto_manage_storages_states=False,
     )
     # add embedding_func for graph database, it's deleted in commit 5661d76860436f7bf5aef2e50d9ee4a59660146c
     rag.chunk_entity_relation_graph.embedding_func = rag.embedding_func
+    await rag.initialize_storages()

     with open(f"{ROOT_DIR}/book.txt", "r", encoding="utf-8") as f:
         await rag.ainsert(f.read())
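The demo change above pairs `auto_manage_storages_states=False` with an explicit `await rag.initialize_storages()` before the first insert. A minimal sketch of that manual-initialization pattern in isolation; LLM, embedding, and storage configuration are omitted and would be required in a real setup, and the working directory is a placeholder:

```python
# Sketch only: trimmed-down shape of the demo above; real LLM/embedding/storage
# configuration is omitted for brevity.
import asyncio

from lightrag import LightRAG


async def main() -> None:
    rag = LightRAG(
        working_dir="./rag_storage",         # placeholder path
        auto_manage_storages_states=False,   # caller owns the storage lifecycle
    )
    # With automatic management disabled, storages must be initialized
    # explicitly before any insert or query.
    await rag.initialize_storages()
    await rag.ainsert("some text")


asyncio.run(main())
```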


@@ -1,5 +1,5 @@
 from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam

-__version__ = "1.1.11"
+__version__ = "1.2.1"
 __author__ = "Zirui Guo"
 __url__ = "https://github.com/HKUDS/LightRAG"


@@ -57,10 +57,9 @@ ALTER USER your_new_role WITH PASSWORD 'your_secure_password';
 \q
 ```
-### 3. Install PGVector Extension
-Install necessary dependencies and compile the extension:
+### 3. Install PGVector and Age Extensions
+
+Install PGVector:
 ```bash
 sudo apt install postgresql-server-dev-all
 cd /tmp
@@ -69,6 +68,15 @@ cd pgvector
 make
 sudo make install
 ```
+
+Install age:
+```bash
+sudo apt-get install build-essential libpq-dev
+cd /tmp
+git clone https://github.com/apache/age.git
+cd age
+make
+sudo make install
+```

 ### 4. Create a Database for LightRAG


@@ -117,6 +117,37 @@ class DocumentManager:
             ".docx",
             ".pptx",
             ".xlsx",
+            ".rtf",  # Rich Text Format
+            ".odt",  # OpenDocument Text
+            ".tex",  # LaTeX
+            ".epub",  # Electronic Publication
+            ".html",  # HyperText Markup Language
+            ".htm",  # HyperText Markup Language
+            ".csv",  # Comma-Separated Values
+            ".json",  # JavaScript Object Notation
+            ".xml",  # eXtensible Markup Language
+            ".yaml",  # YAML Ain't Markup Language
+            ".yml",  # YAML
+            ".log",  # Log files
+            ".conf",  # Configuration files
+            ".ini",  # Initialization files
+            ".properties",  # Java properties files
+            ".sql",  # SQL scripts
+            ".bat",  # Batch files
+            ".sh",  # Shell scripts
+            ".c",  # C source code
+            ".cpp",  # C++ source code
+            ".py",  # Python source code
+            ".java",  # Java source code
+            ".js",  # JavaScript source code
+            ".ts",  # TypeScript source code
+            ".swift",  # Swift source code
+            ".go",  # Go source code
+            ".rb",  # Ruby source code
+            ".php",  # PHP source code
+            ".css",  # Cascading Style Sheets
+            ".scss",  # Sassy CSS
+            ".less",  # LESS CSS
         ),
     ):
         self.input_dir = Path(input_dir)
@@ -170,7 +201,41 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
         # Process based on file type
         match ext:
-            case ".txt" | ".md":
+            case (
+                ".txt"
+                | ".md"
+                | ".html"
+                | ".htm"
+                | ".tex"
+                | ".json"
+                | ".xml"
+                | ".yaml"
+                | ".yml"
+                | ".rtf"
+                | ".odt"
+                | ".epub"
+                | ".csv"
+                | ".log"
+                | ".conf"
+                | ".ini"
+                | ".properties"
+                | ".sql"
+                | ".bat"
+                | ".sh"
+                | ".c"
+                | ".cpp"
+                | ".py"
+                | ".java"
+                | ".js"
+                | ".ts"
+                | ".swift"
+                | ".go"
+                | ".rb"
+                | ".php"
+                | ".css"
+                | ".scss"
+                | ".less"
+            ):
                 content = file.decode("utf-8")
             case ".pdf":
                 if not pm.is_installed("pypdf2"):
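The widened `case (...)` arm above relies on Python structural pattern matching, where string literals joined with `|` form a single or-pattern, so one branch covers every plain-text extension while binary formats keep dedicated branches. A small self-contained sketch of the same dispatch idea; the extension list here is abbreviated, the real one is the tuple added above:

```python
# Sketch only: abbreviated extension list to illustrate the or-pattern dispatch.
def decode_for_ext(ext: str, raw: bytes) -> str | None:
    match ext:
        case ".txt" | ".md" | ".py" | ".json" | ".csv":
            # All text-like formats are decoded directly as UTF-8.
            return raw.decode("utf-8")
        case ".pdf":
            # Binary formats need a dedicated parser (pypdf2 in the server code).
            return None
        case _:
            return None


print(decode_for_ext(".md", b"# hello"))    # -> "# hello"
print(decode_for_ext(".pdf", b"%PDF-1.7"))  # -> None
```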

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -5,8 +5,8 @@
     <link rel="icon" type="image/svg+xml" href="./vite.svg" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>Lightrag</title>
-    <script type="module" crossorigin src="./assets/index-gr1CNi7P.js"></script>
-    <link rel="stylesheet" crossorigin href="./assets/index-Cq9iD15S.css">
+    <script type="module" crossorigin src="./assets/index-BDX8o1Ld.js"></script>
+    <link rel="stylesheet" crossorigin href="./assets/index-CLsJV-0i.css">
   </head>
   <body>
     <div id="root"></div>


@@ -254,6 +254,8 @@ class PGKVStorage(BaseKVStorage):
     db: PostgreSQLDB = field(default=None)

     def __post_init__(self):
+        namespace_prefix = self.global_config.get("namespace_prefix")
+        self.base_namespace = self.namespace.replace(namespace_prefix, "")
         self._max_batch_size = self.global_config["embedding_batch_num"]

     async def initialize(self):
@@ -269,7 +271,7 @@ class PGKVStorage(BaseKVStorage):
     async def get_by_id(self, id: str) -> dict[str, Any] | None:
         """Get doc_full data by id."""
-        sql = SQL_TEMPLATES["get_by_id_" + self.namespace]
+        sql = SQL_TEMPLATES["get_by_id_" + self.base_namespace]
         params = {"workspace": self.db.workspace, "id": id}
         if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE):
             array_res = await self.db.query(sql, params, multirows=True)
@@ -283,7 +285,7 @@ class PGKVStorage(BaseKVStorage):
     async def get_by_mode_and_id(self, mode: str, id: str) -> Union[dict, None]:
         """Specifically for llm_response_cache."""
-        sql = SQL_TEMPLATES["get_by_mode_id_" + self.namespace]
+        sql = SQL_TEMPLATES["get_by_mode_id_" + self.base_namespace]
         params = {"workspace": self.db.workspace, mode: mode, "id": id}
         if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE):
             array_res = await self.db.query(sql, params, multirows=True)
@@ -297,7 +299,7 @@ class PGKVStorage(BaseKVStorage):
     # Query by id
     async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
         """Get doc_chunks data by id"""
-        sql = SQL_TEMPLATES["get_by_ids_" + self.namespace].format(
+        sql = SQL_TEMPLATES["get_by_ids_" + self.base_namespace].format(
             ids=",".join([f"'{id}'" for id in ids])
         )
         params = {"workspace": self.db.workspace}
@@ -318,7 +320,7 @@ class PGKVStorage(BaseKVStorage):
     async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]:
         """Specifically for llm_response_cache."""
-        SQL = SQL_TEMPLATES["get_by_status_" + self.namespace]
+        SQL = SQL_TEMPLATES["get_by_status_" + self.base_namespace]
         params = {"workspace": self.db.workspace, "status": status}
         return await self.db.query(SQL, params, multirows=True)
@@ -391,6 +393,8 @@ class PGVectorStorage(BaseVectorStorage):
     def __post_init__(self):
         self._max_batch_size = self.global_config["embedding_batch_num"]
+        namespace_prefix = self.global_config.get("namespace_prefix")
+        self.base_namespace = self.namespace.replace(namespace_prefix, "")
         config = self.global_config.get("vector_db_storage_cls_kwargs", {})
         cosine_threshold = config.get("cosine_better_than_threshold")
         if cosine_threshold is None:
@@ -493,7 +497,9 @@ class PGVectorStorage(BaseVectorStorage):
         embedding = embeddings[0]
         embedding_string = ",".join(map(str, embedding))
-        sql = SQL_TEMPLATES[self.namespace].format(embedding_string=embedding_string)
+        sql = SQL_TEMPLATES[self.base_namespace].format(
+            embedding_string=embedding_string
+        )
         params = {
             "workspace": self.db.workspace,
             "better_than_threshold": self.cosine_better_than_threshold,


@@ -1,8 +1,8 @@
 from __future__ import annotations

 import asyncio
-import os
 import configparser
+import os
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from functools import partial
@@ -41,11 +41,11 @@ from .utils import (
     always_get_an_event_loop,
     compute_mdhash_id,
     convert_response_to_json,
+    encode_string_by_tiktoken,
     lazy_external_import,
     limit_async_func_call,
     logger,
     set_logger,
-    encode_string_by_tiktoken,
 )

 from .types import KnowledgeGraph
@@ -483,6 +483,7 @@ class LightRAG:
         input: str | list[str],
         split_by_character: str | None = None,
         split_by_character_only: bool = False,
+        ids: list[str] | None = None,
     ) -> None:
         """Sync Insert documents with checkpoint support
@@ -491,10 +492,11 @@ class LightRAG:
             split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
             split_by_character_only: if split_by_character_only is True, split the string by character only, when
                 split_by_character is None, this parameter is ignored.
+            ids: list of unique document IDs, if not provided, MD5 hash IDs will be generated
         """
         loop = always_get_an_event_loop()
         loop.run_until_complete(
-            self.ainsert(input, split_by_character, split_by_character_only)
+            self.ainsert(input, split_by_character, split_by_character_only, ids)
         )

     async def ainsert(
@@ -502,6 +504,7 @@ class LightRAG:
         input: str | list[str],
         split_by_character: str | None = None,
         split_by_character_only: bool = False,
+        ids: list[str] | None = None,
     ) -> None:
         """Async Insert documents with checkpoint support
@@ -510,8 +513,9 @@ class LightRAG:
             split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
             split_by_character_only: if split_by_character_only is True, split the string by character only, when
                 split_by_character is None, this parameter is ignored.
+            ids: list of unique document IDs, if not provided, MD5 hash IDs will be generated
         """
-        await self.apipeline_enqueue_documents(input)
+        await self.apipeline_enqueue_documents(input, ids)
         await self.apipeline_process_enqueue_documents(
             split_by_character, split_by_character_only
         )
@@ -533,7 +537,7 @@ class LightRAG:
         doc_key = compute_mdhash_id(full_text, prefix="doc-")
         new_docs = {doc_key: {"content": full_text}}

-        _add_doc_keys = await self.full_docs.filter_keys(set(doc_key))
+        _add_doc_keys = await self.full_docs.filter_keys({doc_key})
         new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
         if not len(new_docs):
             logger.warning("This document is already in the storage.")
@@ -572,24 +576,54 @@ class LightRAG:
         if update_storage:
             await self._insert_done()

-    async def apipeline_enqueue_documents(self, input: str | list[str]) -> None:
+    async def apipeline_enqueue_documents(
+        self, input: str | list[str], ids: list[str] | None
+    ) -> None:
         """
         Pipeline for Processing Documents
-        1. Remove duplicate contents from the list
-        2. Generate document IDs and initial status
-        3. Filter out already processed documents
-        4. Enqueue document in status
+        1. Validate ids if provided or generate MD5 hash IDs
+        2. Remove duplicate contents
+        3. Generate document initial status
+        4. Filter out already processed documents
+        5. Enqueue document in status
         """
         if isinstance(input, str):
             input = [input]

         # Clean input text and remove duplicates
-        unique_contents = list(set(self.clean_text(doc) for doc in input))
+        input = list(set(self.clean_text(doc) for doc in input))

-        # 2. Generate document IDs and initial status
+        # 1. Validate ids if provided or generate MD5 hash IDs
+        if ids is not None:
+            # Check if the number of IDs matches the number of documents
+            if len(ids) != len(input):
+                raise ValueError("Number of IDs must match the number of documents")
+
+            # Check if IDs are unique
+            if len(ids) != len(set(ids)):
+                raise ValueError("IDs must be unique")
+
+            # Generate contents dict of IDs provided by user and documents
+            contents = {id_: doc for id_, doc in zip(ids, input)}
+        else:
+            # Generate contents dict of MD5 hash IDs and documents
+            contents = {
+                compute_mdhash_id(doc, prefix="doc-"): doc
+                for doc in input
+            }
+
+        # 2. Remove duplicate contents
+        unique_contents = {
+            id_: content
+            for content, id_ in {
+                content: id_ for id_, content in contents.items()
+            }.items()
+        }
+
+        # 3. Generate document initial status
         new_docs: dict[str, Any] = {
-            compute_mdhash_id(content, prefix="doc-"): {
+            id_: {
                 "content": content,
                 "content_summary": self._get_content_summary(content),
                 "content_length": len(content),
@@ -597,10 +631,10 @@ class LightRAG:
                 "created_at": datetime.now().isoformat(),
                 "updated_at": datetime.now().isoformat(),
             }
-            for content in unique_contents
+            for id_, content in unique_contents.items()
         }

-        # 3. Filter out already processed documents
+        # 4. Filter out already processed documents
         # Get docs ids
         all_new_doc_ids = set(new_docs.keys())
         # Exclude IDs of documents that are already in progress
@@ -612,7 +646,7 @@ class LightRAG:
             logger.info("No new unique documents were found.")
             return

-        # 4. Store status document
+        # 5. Store status document
         await self.doc_status.upsert(new_docs)
         logger.info(f"Stored {len(new_docs)} new unique documents")
@@ -669,8 +703,6 @@ class LightRAG:
             # 4. iterate over batch
             for doc_id_processing_status in docs_batch:
                 doc_id, status_doc = doc_id_processing_status
-                # Update status in processing
-                doc_status_id = compute_mdhash_id(status_doc.content, prefix="doc-")
                 # Generate chunks from document
                 chunks: dict[str, Any] = {
                     compute_mdhash_id(dp["content"], prefix="chunk-"): {
@@ -690,7 +722,7 @@ class LightRAG:
                     tasks = [
                         self.doc_status.upsert(
                             {
-                                doc_status_id: {
+                                doc_id: {
                                     "status": DocStatus.PROCESSING,
                                     "updated_at": datetime.now().isoformat(),
                                     "content": status_doc.content,
@@ -711,7 +743,7 @@ class LightRAG:
                     await asyncio.gather(*tasks)
                     await self.doc_status.upsert(
                         {
-                            doc_status_id: {
+                            doc_id: {
                                 "status": DocStatus.PROCESSED,
                                 "chunks_count": len(chunks),
                                 "content": status_doc.content,
@@ -726,7 +758,7 @@ class LightRAG:
                     logger.error(f"Failed to process document {doc_id}: {str(e)}")
                     await self.doc_status.upsert(
                         {
-                            doc_status_id: {
+                            doc_id: {
                                 "status": DocStatus.FAILED,
                                 "error": str(e),
                                 "content": status_doc.content,


@@ -7,7 +7,10 @@
     "dev": "bunx --bun vite",
     "build": "bunx --bun vite build",
     "lint": "eslint .",
-    "preview": "bunx --bun vite preview"
+    "preview": "bunx --bun vite preview",
+    "dev-no-bun": "vite",
+    "build-no-bun": "vite build --emptyOutDir",
+    "preview-no-bun": "vite preview"
   },
   "dependencies": {
     "@faker-js/faker": "^9.5.0",


@@ -80,7 +80,7 @@ export default function UploadDocumentsDialog() {
           <FileUploader
             maxFileCount={Infinity}
             maxSize={200 * 1024 * 1024}
-            description="supported types: TXT, MD, DOC, PDF, PPTX"
+            description="supported types: TXT, MD, DOCX, PDF, PPTX, RTF, ODT, EPUB, HTML, HTM, TEX, JSON, XML, YAML, YML, CSV, LOG, CONF, INI, PROPERTIES, SQL, BAT, SH, C, CPP, PY, JAVA, JS, TS, SWIFT, GO, RB, PHP, CSS, SCSS, LESS"
             onUpload={handleDocumentsUpload}
             progresses={progresses}
             disabled={isUploading}


@@ -247,7 +247,7 @@ function FileUploader(props: FileUploaderProps) {
                   ? ` ${maxFileCount === Infinity ? 'multiple' : maxFileCount}
                      files (up to ${formatBytes(maxSize)} each)`
                   : ` a file with ${formatBytes(maxSize)}`}
-                Supported formats: TXT, MD, DOC, PDF, PPTX
+                Supported formats: TXT, MD, DOCX, PDF, PPTX, RTF, ODT, EPUB, HTML, HTM, TEX, JSON, XML, YAML, YML, CSV, LOG, CONF, INI, PROPERTIES, SQL, BAT, SH, C, CPP, PY, JAVA, JS, TS, SWIFT, GO, RB, PHP, CSS, SCSS, LESS
               </p>
             )}
           </div>


@@ -26,7 +26,12 @@ export const defaultQueryLabel = '*'
 // reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/MIME_types/Common_types
 export const supportedFileTypes = {
-  'text/plain': ['.txt', '.md'],
+  'text/plain': [
+    '.txt', '.md', '.html', '.htm', '.tex', '.json', '.xml', '.yaml', '.yml',
+    '.rtf', '.odt', '.epub', '.csv', '.log', '.conf', '.ini', '.properties',
+    '.sql', '.bat', '.sh', '.c', '.cpp', '.py', '.java', '.js', '.ts',
+    '.swift', '.go', '.rb', '.php', '.css', '.scss', '.less'
+  ],
   'application/pdf': ['.pdf'],
   'application/msword': ['.doc'],
   'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],