diff --git a/.gitignore b/.gitignore
index b445d56a8..ef393f31e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,5 @@ Cargo.lock
 
 # Exclude the log folder
 docker/ragflow-logs/
+/flask_session
+/logs
diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index 3c7b156c8..1ae6fef87 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -14,7 +14,6 @@
 #  limitations under the License
 #
 
-import base64
 import os
 import pathlib
 import re
@@ -24,8 +23,10 @@ from elasticsearch_dsl import Q
 from flask import request
 from flask_login import login_required, current_user
 
+from api.db.db_models import Task
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
+from api.db.services.task_service import TaskService, queue_tasks
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from api.db.services import duplicate_name
@@ -37,7 +38,9 @@ from api.db.services.document_service import DocumentService
 from api.settings import RetCode
 from api.utils.api_utils import get_json_result
 from rag.utils.minio_conn import MINIO
+from rag.utils.redis_conn import REDIS_CONN
 from api.utils.file_utils import filename_type, thumbnail
+from rag.settings import SVR_QUEUE_NAME
 
 
 @manager.route('/upload', methods=['POST'])
@@ -277,6 +280,14 @@ def run():
                 return get_data_error_result(retmsg="Tenant not found!")
             ELASTICSEARCH.deleteByQuery(
                 Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
+
+            if str(req["run"]) == TaskStatus.RUNNING.value:
+                TaskService.filter_delete([Task.doc_id == id])
+                e, doc = DocumentService.get_by_id(id)
+                doc = doc.to_dict()
+                doc["tenant_id"] = tenant_id
+                bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"])
+                queue_tasks(doc, bucket, name)
 
         return get_json_result(data=True)
     except Exception as e:
diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py
index d3db21208..0e7a6e38c 100644
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@@ -13,17 +13,18 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-from peewee import Expression
-
+import random
+from datetime import datetime
 from elasticsearch_dsl import Q
 
-from api.utils import current_timestamp
+from api.settings import stat_logger
+from api.utils import current_timestamp, get_format_time
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
 from rag.nlp import search
 
 from api.db import FileType, TaskStatus
-from api.db.db_models import DB, Knowledgebase, Tenant
+from api.db.db_models import DB, Knowledgebase, Tenant, Task
 from api.db.db_models import Document
 from api.db.services.common_service import CommonService
 from api.db.services.knowledgebase_service import KnowledgebaseService
@@ -92,7 +93,7 @@ class DocumentService(CommonService):
 
     @classmethod
     @DB.connection_context()
-    def get_newly_uploaded(cls, tm):
+    def get_newly_uploaded(cls):
         fields = [
             cls.model.id,
             cls.model.kb_id,
@@ -196,3 +197,55 @@ class DocumentService(CommonService):
                       on=(Knowledgebase.id == cls.model.kb_id)).where(
             Knowledgebase.tenant_id == tenant_id)
         return len(docs)
+
+    @classmethod
+    @DB.connection_context()
+    def begin2parse(cls, docid):
+        cls.update_by_id(
+            docid, {"progress": random.random() * 1 / 100.,
+                    "progress_msg": "Task dispatched...",
+                    "process_begin_at": get_format_time()
+                    })
+
+    @classmethod
+    @DB.connection_context()
+    def update_progress(cls):
+        docs = cls.get_unfinished_docs()
+        for d in docs:
+            try:
+                tsks = Task.query(doc_id=d["id"], order_by=Task.create_time)
+                if not tsks:
+                    continue
+                msg = []
+                prg = 0
+                finished = True
+                bad = 0
+                status = TaskStatus.RUNNING.value
+                for t in tsks:
+                    if 0 <= t.progress < 1:
+                        finished = False
+                    prg += t.progress if t.progress >= 0 else 0
+                    msg.append(t.progress_msg)
+                    if t.progress == -1:
+                        bad += 1
+                prg /= len(tsks)
+                if finished and bad:
+                    prg = -1
+                    status = TaskStatus.FAIL.value
+                elif finished:
+                    status = TaskStatus.DONE.value
+
+                msg = "\n".join(msg)
+                info = {
+                    "process_duation": datetime.timestamp(
+                        datetime.now()) -
+                    d["process_begin_at"].timestamp(),
+                    "run": status}
+                if prg != 0:
+                    info["progress"] = prg
+                if msg:
+                    info["progress_msg"] = msg
+                cls.update_by_id(d["id"], info)
+            except Exception as e:
+                stat_logger.error("fetch task exception:" + str(e))
+
diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py
index 083847509..6d109e2dc 100644
--- a/api/db/services/task_service.py
+++ b/api/db/services/task_service.py
@@ -15,21 +15,24 @@
 #
 import random
 
-from peewee import Expression, JOIN
+from api.db.db_utils import bulk_insert_into_db
+from deepdoc.parser import PdfParser
+from peewee import JOIN
 from api.db.db_models import DB, File2Document, File
 from api.db import StatusEnum, FileType, TaskStatus
 from api.db.db_models import Task, Document, Knowledgebase, Tenant
 from api.db.services.common_service import CommonService
 from api.db.services.document_service import DocumentService
-from api.utils import current_timestamp
+from api.utils import current_timestamp, get_uuid
+from deepdoc.parser.excel_parser import RAGFlowExcelParser
+from rag.settings import MINIO, SVR_QUEUE_NAME
+from rag.utils.redis_conn import REDIS_CONN
 
 
 class TaskService(CommonService):
     model = Task
 
-    @classmethod
-    @DB.connection_context()
-    def get_tasks(cls, tm, mod=0, comm=1, items_per_page=1, takeit=True):
+    def get_tasks(cls, task_id):
         fields = [
             cls.model.id,
             cls.model.doc_id,
             cls.model.from_page,
             cls.model.to_page,
             Document.kb_id,
             Document.parser_id,
             Document.parser_config,
             Document.name,
             Document.type,
             Document.location,
             Document.size,
             Knowledgebase.tenant_id,
             Knowledgebase.language,
             Knowledgebase.embd_id,
             Tenant.img2txt_id,
             Tenant.asr_id,
             cls.model.update_time]
-        with DB.lock("get_task", -1):
-            docs = cls.model.select(*fields) \
-                .join(Document, on=(cls.model.doc_id == Document.id)) \
-                .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
-                .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
-                .where(
-                    Document.status == StatusEnum.VALID.value,
-                    Document.run == TaskStatus.RUNNING.value,
-                    ~(Document.type == FileType.VIRTUAL.value),
-                    cls.model.progress == 0,
-                    #cls.model.update_time >= tm,
-                    #(Expression(cls.model.create_time, "%%", comm) == mod)
-                )\
-                .order_by(cls.model.update_time.asc())\
-                .paginate(0, items_per_page)
-            docs = list(docs.dicts())
-            if not docs: return []
-            if not takeit: return docs
+        docs = cls.model.select(*fields) \
+            .join(Document, on=(cls.model.doc_id == Document.id)) \
+            .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
+            .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id)) \
+            .where(cls.model.id == task_id)
+        docs = list(docs.dicts())
+        if not docs: return []
 
-            cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.", progress=random.random()/10.).where(
-                cls.model.id == docs[0]["id"]).execute()
-            return docs
+        cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.",
+                         progress=random.random() / 10.).where(
+            cls.model.id == docs[0]["id"]).execute()
+        return docs
 
     @classmethod
     @DB.connection_context()
@@ -112,3 +105,55 @@ class TaskService(CommonService):
         if "progress" in info:
             cls.model.update(progress=info["progress"]).where(
                 cls.model.id == id).execute()
+
+
+def queue_tasks(doc, bucket, name):
+    def new_task():
+        nonlocal doc
+        return {
+            "id": get_uuid(),
+            "doc_id": doc["id"]
+        }
+    tsks = []
+
+    if doc["type"] == FileType.PDF.value:
+        file_bin = MINIO.get(bucket, name)
+        do_layout = doc["parser_config"].get("layout_recognize", True)
+        pages = PdfParser.total_page_number(doc["name"], file_bin)
+        page_size = doc["parser_config"].get("task_page_size", 12)
+        if doc["parser_id"] == "paper":
+            page_size = doc["parser_config"].get("task_page_size", 22)
+        if doc["parser_id"] == "one":
+            page_size = 1000000000
+        if not do_layout:
+            page_size = 1000000000
+        page_ranges = doc["parser_config"].get("pages")
+        if not page_ranges:
+            page_ranges = [(1, 100000)]
+        for s, e in page_ranges:
+            s -= 1
+            s = max(0, s)
+            e = min(e - 1, pages)
+            for p in range(s, e, page_size):
+                task = new_task()
+                task["from_page"] = p
+                task["to_page"] = min(p + page_size, e)
+                tsks.append(task)
+
+    elif doc["parser_id"] == "table":
+        file_bin = MINIO.get(bucket, name)
+        rn = RAGFlowExcelParser.row_number(
+            doc["name"], file_bin)
+        for i in range(0, rn, 3000):
+            task = new_task()
+            task["from_page"] = i
+            task["to_page"] = min(i + 3000, rn)
+            tsks.append(task)
+    else:
+        tsks.append(new_task())
+
+    for t in tsks:
+        REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=t)
+
+    bulk_insert_into_db(Task, tsks, True)
+    DocumentService.begin2parse(doc["id"])
diff --git a/api/ragflow_server.py b/api/ragflow_server.py
index 981cc1bb9..8878817a3 100644
--- a/api/ragflow_server.py
+++ b/api/ragflow_server.py
@@ -18,10 +18,14 @@ import logging
 import os
 import signal
 import sys
+import time
 import traceback
+from concurrent.futures import ThreadPoolExecutor
+
 from werkzeug.serving import run_simple
 from api.apps import app
 from api.db.runtime_config import RuntimeConfig
+from api.db.services.document_service import DocumentService
 from api.settings import (
     HOST, HTTP_PORT, access_logger, database_logger, stat_logger,
 )
@@ -31,6 +35,16 @@ from api.db.db_models import init_database_tables as init_web_db
 from api.db.init_data import init_web_data
 from api.versions import get_versions
 
+
+def update_progress():
+    while True:
+        time.sleep(1)
+        try:
+            DocumentService.update_progress()
+        except Exception as e:
+            stat_logger.error("update_progress exception:" + str(e))
+
+
 if __name__ == '__main__':
     print("""
     ____ ______ __
@@ -71,6 +85,9 @@ if __name__ == '__main__':
     peewee_logger.addHandler(database_logger.handlers[0])
     peewee_logger.setLevel(database_logger.level)
 
+    thr = ThreadPoolExecutor(max_workers=1)
+    thr.submit(update_progress)
+
     # start http server
     try:
         stat_logger.info("RAG Flow http server start...")
diff --git a/conf/service_conf.yaml b/conf/service_conf.yaml
index a277c7245..71139a3a3 100644
--- a/conf/service_conf.yaml
+++ b/conf/service_conf.yaml
@@ -15,6 +15,10 @@ minio:
   host: 'minio:9000'
 es:
   hosts: 'http://es01:9200'
+redis:
+  db: 1
+  password: 'infini_rag_flow'
+  host: 'redis:6379'
 user_default_llm:
   factory: 'Tongyi-Qianwen'
   api_key: 'sk-xxxxxxxxxxxxx'
diff --git a/docker/.env b/docker/.env
index 0fb4ed010..34d4d7096 100644
--- a/docker/.env
+++ b/docker/.env
@@ -25,6 +25,8 @@ MINIO_PORT=9000
 MINIO_USER=rag_flow
 MINIO_PASSWORD=infini_rag_flow
 
+REDIS_PASSWORD=infini_rag_flow
+
 SVR_HTTP_PORT=9380
 
 RAGFLOW_VERSION=latest
diff --git a/docker/README.md b/docker/README.md
index 63d435cb6..9f3ee5d74 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -50,7 +50,7 @@ The serving port of mysql inside the container. The modification should be synch
 The max database connection.
 
 ### stale_timeout
-The timeout duation in seconds.
+The timeout duration in seconds.
 
 ## minio
diff --git a/docker/docker-compose-base.yml b/docker/docker-compose-base.yml
index 841c4c777..175740739 100644
--- a/docker/docker-compose-base.yml
+++ b/docker/docker-compose-base.yml
@@ -29,24 +29,6 @@ services:
       - ragflow
     restart: always
 
-  #kibana:
-  #  depends_on:
-  #    es01:
-  #      condition: service_healthy
-  #  image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
-  #  container_name: ragflow-kibana
-  #  volumes:
-  #    - kibanadata:/usr/share/kibana/data
-  #  ports:
-  #    - ${KIBANA_PORT}:5601
-  #  environment:
-  #    - SERVERNAME=kibana
-  #    - ELASTICSEARCH_HOSTS=http://es01:9200
-  #    - TZ=${TIMEZONE}
-  #  mem_limit: ${MEM_LIMIT}
-  #  networks:
-  #    - ragflow
-
   mysql:
     image: mysql:5.7.18
     container_name: ragflow-mysql
@@ -74,7 +56,6 @@ services:
       retries: 3
     restart: always
 
-
   minio:
     image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
     container_name: ragflow-minio
@@ -92,16 +73,27 @@ services:
      - ragflow
     restart: always
 
+  redis:
+    image: redis:7.2.4
+    container_name: ragflow-redis
+    command: redis-server --requirepass ${REDIS_PASSWORD} --maxmemory 128mb --maxmemory-policy allkeys-lru
+    volumes:
+      - redis_data:/data
+    networks:
+      - ragflow
+    restart: always
+
+
 
 volumes:
   esdata01:
     driver: local
-#  kibanadata:
-#    driver: local
   mysql_data:
     driver: local
   minio_data:
     driver: local
+  redis_data:
+    driver: local
 
 networks:
   ragflow:
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
index b957a51d3..c7a4dcce1 100644
--- a/docker/entrypoint.sh
+++ b/docker/entrypoint.sh
@@ -12,29 +12,14 @@ function task_exe(){
     done
 }
 
-function watch_broker(){
-    while [ 1 -eq 1 ];do
-        C=`ps aux|grep "task_broker.py"|grep -v grep|wc -l`;
-        if [ $C -lt 1 ];then
-            $PY rag/svr/task_broker.py &
-        fi
-        sleep 5;
-    done
-}
-
-function task_bro(){
-    watch_broker;
-}
-
-task_bro &
-
 WS=1
 for ((i=0;i= 0 else 0
-                msg.append(t.progress_msg)
-                if t.progress == -1:
-                    bad += 1
-            prg /= len(tsks)
-            if finished and bad:
-                prg = -1
-                status = TaskStatus.FAIL.value
-            elif finished:
-                status = TaskStatus.DONE.value
-
-            msg = "\n".join(msg)
-            info = {
-                "process_duation": datetime.timestamp(
-                    datetime.now()) -
-                d["process_begin_at"].timestamp(),
-                "run": status}
-            if prg != 0:
-                info["progress"] = prg
-            if msg:
-                info["progress_msg"] = msg
-            DocumentService.update_by_id(d["id"], info)
-        except Exception as e:
-            cron_logger.error("fetch task exception:" + str(e))
-
-
-if __name__ == "__main__":
-    peewee_logger = logging.getLogger('peewee')
-    peewee_logger.propagate = False
-    peewee_logger.addHandler(database_logger.handlers[0])
-    peewee_logger.setLevel(database_logger.level)
-    # init db
-    init_web_db()
-    init_web_data()
-
-    while True:
-        dispatch()
-        time.sleep(1)
-        update_progress()
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index 981752973..a1db21067 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -28,7 +28,7 @@ from functools import partial
 from api.db.services.file2document_service import File2DocumentService
 from rag.utils.minio_conn import MINIO
 from api.db.db_models import close_connection
-from rag.settings import database_logger
+from rag.settings import database_logger, SVR_QUEUE_NAME
 from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
 from multiprocessing import Pool
 import numpy as np
@@ -93,20 +93,29 @@ def set_progress(task_id, from_page=0, to_page=-1,
         sys.exit()
 
 
-def collect(comm, mod, tm):
-    tasks = TaskService.get_tasks(tm, mod, comm)
-    #print(tasks)
-    if len(tasks) == 0:
-        time.sleep(1)
+def collect():
+    try:
+        payload = REDIS_CONN.queue_consumer(SVR_QUEUE_NAME, "rag_flow_svr_task_broker", "rag_flow_svr_task_consumer")
+        if not payload:
+            time.sleep(1)
+            return pd.DataFrame()
+    except Exception as e:
+        cron_logger.error("Get task event from queue exception:" + str(e))
         return pd.DataFrame()
+
+    msg = payload.get_message()
+    payload.ack()
+    if not msg: return pd.DataFrame()
+
+    if TaskService.do_cancel(msg["id"]):
+        return pd.DataFrame()
+    tasks = TaskService.get_tasks(msg["id"])
+    assert tasks, "{} empty task!".format(msg["id"])
     tasks = pd.DataFrame(tasks)
-    mtm = tasks["update_time"].max()
-    cron_logger.info("TOTAL:{}, To:{}".format(len(tasks), mtm))
     return tasks
 
 
 def get_minio_binary(bucket, name):
-    global MINIO
     return MINIO.get(bucket, name)
 
 
@@ -122,13 +131,10 @@ def build(row):
             row["from_page"], row["to_page"])
     chunker = FACTORY[row["parser_id"].lower()]
-    pool = Pool(processes=1)
     try:
         st = timer()
         bucket, name = File2DocumentService.get_minio_address(doc_id=row["doc_id"])
-        thr = pool.apply_async(get_minio_binary, args=(bucket, name))
-        binary = thr.get(timeout=90)
-        pool.terminate()
+        binary = get_minio_binary(bucket, name)
         cron_logger.info(
             "From minio({}) {}/{}".format(timer()-st, row["location"], row["name"]))
         cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
@@ -147,7 +153,6 @@ def build(row):
         else:
             callback(-1, f"Internal server error: %s" %
                      str(e).replace("'", ""))
-        pool.terminate()
         traceback.print_exc()
 
         cron_logger.error(
@@ -238,20 +243,13 @@ def embedding(docs, mdl, parser_config={}, callback=None):
     return tk_count
 
 
-def main(comm, mod):
-    tm_fnm = os.path.join(
-        get_project_base_directory(),
-        "rag/res",
-        f"{comm}-{mod}.tm")
-    tm = findMaxTm(tm_fnm)
-    rows = collect(comm, mod, tm)
+def main():
+    rows = collect()
     if len(rows) == 0:
         return
 
-    tmf = open(tm_fnm, "a+")
     for _, r in rows.iterrows():
         callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
-        #callback(random.random()/10., "Task has been received.")
         try:
             embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
         except Exception as e:
@@ -265,7 +263,6 @@
             if cks is None:
                 continue
             if not cks:
-                tmf.write(str(r["update_time"]) + "\n")
                 callback(1., "No chunk! Done!")
                 continue
             # TODO: exception handler
@@ -305,8 +302,6 @@
             "Chunk doc({}), token({}), chunks({}), elapsed:{}".format(
                 r["id"], tk_count, len(cks), timer()-st))
 
-        tmf.write(str(r["update_time"]) + "\n")
-    tmf.close()
 
 
 if __name__ == "__main__":
@@ -315,8 +310,6 @@
     peewee_logger.addHandler(database_logger.handlers[0])
     peewee_logger.setLevel(database_logger.level)
 
-    #from mpi4py import MPI
-    #comm = MPI.COMM_WORLD
     while True:
-        main(int(sys.argv[2]), int(sys.argv[1]))
+        main()
         close_connection()
diff --git a/rag/utils/redis_conn.py b/rag/utils/redis_conn.py
index 984ae1629..2f06ae92a 100644
--- a/rag/utils/redis_conn.py
+++ b/rag/utils/redis_conn.py
@@ -5,6 +5,27 @@ import logging
 from rag import settings
 from rag.utils import singleton
 
+
+class Payload:
+    def __init__(self, consumer, queue_name, group_name, msg_id, message):
+        self.__consumer = consumer
+        self.__queue_name = queue_name
+        self.__group_name = group_name
+        self.__msg_id = msg_id
+        self.__message = json.loads(message['message'])
+
+    def ack(self):
+        try:
+            self.__consumer.xack(self.__queue_name, self.__group_name, self.__msg_id)
+            return True
+        except Exception as e:
+            logging.warning("[EXCEPTION]ack" + str(self.__queue_name) + "||" + str(e))
+        return False
+
+    def get_message(self):
+        return self.__message
+
+
 @singleton
 class RedisDB:
     def __init__(self):
@@ -17,7 +38,8 @@
             self.REDIS = redis.StrictRedis(host=self.config["host"].split(":")[0],
                                            port=int(self.config.get("host", ":6379").split(":")[1]),
                                            db=int(self.config.get("db", 1)),
-                                           password=self.config["password"])
+                                           password=self.config.get("password"),
+                                           decode_responses=True)
         except Exception as e:
             logging.warning("Redis can't be connected.")
         return self.REDIS
@@ -70,5 +92,48 @@
             self.__open__()
         return False
 
+    def queue_product(self, queue, message, exp=settings.SVR_QUEUE_RETENTION) -> bool:
+        try:
+            payload = {"message": json.dumps(message)}
+            pipeline = self.REDIS.pipeline()
+            pipeline.xadd(queue, payload)
+            pipeline.expire(queue, exp)
+            pipeline.execute()
+            return True
+        except Exception as e:
+            logging.warning("[EXCEPTION]producer" + str(queue) + "||" + str(e))
+        return False
 
-REDIS_CONN = RedisDB()
\ No newline at end of file
+    def queue_consumer(self, queue_name, group_name, consumer_name, msg_id=b">") -> Payload:
+        try:
+            group_info = self.REDIS.xinfo_groups(queue_name)
+            if not any(e["name"] == group_name for e in group_info):
+                self.REDIS.xgroup_create(
+                    queue_name,
+                    group_name,
+                    id="$",
+                    mkstream=True
+                )
+            args = {
+                "groupname": group_name,
+                "consumername": consumer_name,
+                "count": 1,
+                "block": 10000,
+                "streams": {queue_name: msg_id},
+            }
+            messages = self.REDIS.xreadgroup(**args)
+            if not messages:
+                return None
+            stream, element_list = messages[0]
+            msg_id, payload = element_list[0]
+            res = Payload(self.REDIS, queue_name, group_name, msg_id, payload)
+            return res
+        except Exception as e:
+            if 'key' in str(e):
+                pass
+            else:
+                logging.warning("[EXCEPTION]consumer" + str(queue_name) + "||" + str(e))
+        return None
+
+
+REDIS_CONN = RedisDB()
diff --git a/requirements.txt b/requirements.txt
index bbd677ac0..14914f382 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -134,4 +134,4 @@ xxhash==3.4.1
 yarl==1.9.4
 zhipuai==2.0.1
 BCEmbedding
-loguru==0.7.2
\ No newline at end of file
+loguru==0.7.2
diff --git a/requirements_dev.txt b/requirements_dev.txt
new file mode 100644
index 000000000..9e7a4c885
--- /dev/null
+++ b/requirements_dev.txt
@@ -0,0 +1,126 @@
+accelerate==0.27.2
+aiohttp==3.9.3
+aiosignal==1.3.1
+annotated-types==0.6.0
+anyio==4.3.0
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+Aspose.Slides==24.2.0
+attrs==23.2.0
+blinker==1.7.0
+cachelib==0.12.0
+cachetools==5.3.3
+certifi==2024.2.2
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+coloredlogs==15.0.1
+cryptography==42.0.5
+dashscope==1.14.1
+datasets==2.17.1
+datrie==0.8.2
+demjson3==3.0.6
+dill==0.3.8
+distro==1.9.0
+elastic-transport==8.12.0
+elasticsearch==8.12.1
+elasticsearch-dsl==8.12.0
+et-xmlfile==1.1.0
+filelock==3.13.1
+fastembed==0.2.6
+FlagEmbedding==1.2.5
+Flask==3.0.2
+Flask-Cors==4.0.0
+Flask-Login==0.6.3
+Flask-Session==0.6.0
+flatbuffers==23.5.26
+frozenlist==1.4.1
+fsspec==2023.10.0
+h11==0.14.0
+hanziconv==0.3.2
+httpcore==1.0.4
+httpx==0.27.0
+huggingface-hub==0.20.3
+humanfriendly==10.0
+idna==3.6
+install==1.3.5
+itsdangerous==2.1.2
+Jinja2==3.1.3
+joblib==1.3.2
+lxml==5.1.0
+MarkupSafe==2.1.5
+minio==7.2.4
+mpi4py==3.1.5
+mpmath==1.3.0
+multidict==6.0.5
+multiprocess==0.70.16
+networkx==3.2.1
+nltk==3.8.1
+numpy==1.26.4
+openai==1.12.0
+opencv-python==4.9.0.80
+openpyxl==3.1.2
+packaging==23.2
+pandas==2.2.1
+pdfminer.six==20221105
+pdfplumber==0.10.4
+peewee==3.17.1
+pillow==10.2.0
+protobuf==4.25.3
+psutil==5.9.8
+pyarrow==15.0.0
+pyarrow-hotfix==0.6
+pyclipper==1.3.0.post5
+pycparser==2.21
+pycryptodome==3.20.0
+pycryptodome-test-vectors==1.0.14
+pycryptodomex==3.20.0
+pydantic==2.6.2
+pydantic_core==2.16.3
+PyJWT==2.8.0
+PyMuPDF==1.23.25
+PyMuPDFb==1.23.22
+PyMySQL==1.1.0
+PyPDF2==3.0.1
+pypdfium2==4.27.0
+python-dateutil==2.8.2
+python-docx==1.1.0
+python-dotenv==1.0.1
+python-pptx==0.6.23
+pytz==2024.1
+PyYAML==6.0.1
+regex==2023.12.25
+requests==2.31.0
+ruamel.yaml==0.18.6
+ruamel.yaml.clib==0.2.8
+safetensors==0.4.2
+scikit-learn==1.4.1.post1
+scipy==1.12.0
+sentence-transformers==2.4.0
+shapely==2.0.3
+six==1.16.0
+sniffio==1.3.1
+StrEnum==0.4.15
+sympy==1.12
+threadpoolctl==3.3.0
+tika==2.6.0
+tiktoken==0.6.0
+tokenizers==0.15.2
+torch==2.2.1
+tqdm==4.66.2
+transformers==4.38.1
+triton==2.2.0
+typing_extensions==4.10.0
+tzdata==2024.1
+urllib3==2.2.1
+Werkzeug==3.0.1
+xgboost==2.0.3
+XlsxWriter==3.2.0
+xpinyin==0.7.6
+xxhash==3.4.1
+yarl==1.9.4
+zhipuai==2.0.1
+BCEmbedding
+loguru==0.7.2
+ollama==0.1.8
+redis==5.0.4