mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-07-03 23:19:27 +00:00
fix docker compose issue (#238)
### What problem does this PR solve? Fixes the Docker Compose issue reported in the linked issue. Issue link: [#226](https://github.com/infiniflow/ragflow/issues/226) ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
parent
b4abbe5d93
commit
23b448cf96
@ -65,6 +65,11 @@ def upload():
|
|||||||
DocumentService.query,
|
DocumentService.query,
|
||||||
name=file.filename,
|
name=file.filename,
|
||||||
kb_id=kb.id)
|
kb_id=kb.id)
|
||||||
|
filetype = filename_type(filename)
|
||||||
|
if not filetype:
|
||||||
|
return get_data_error_result(
|
||||||
|
retmsg="This type of file has not been supported yet!")
|
||||||
|
|
||||||
location = filename
|
location = filename
|
||||||
while MINIO.obj_exist(kb_id, location):
|
while MINIO.obj_exist(kb_id, location):
|
||||||
location += "_"
|
location += "_"
|
||||||
|
@ -25,7 +25,7 @@ from api.utils.api_utils import server_error_response, validate_request
|
|||||||
from api.utils import get_uuid, get_format_time, decrypt, download_img
|
from api.utils import get_uuid, get_format_time, decrypt, download_img
|
||||||
from api.db import UserTenantRole, LLMType
|
from api.db import UserTenantRole, LLMType
|
||||||
from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, API_KEY, \
|
from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, API_KEY, \
|
||||||
LLM_FACTORY
|
LLM_FACTORY, LLM_BASE_URL
|
||||||
from api.db.services.user_service import UserService, TenantService, UserTenantService
|
from api.db.services.user_service import UserService, TenantService, UserTenantService
|
||||||
from api.settings import stat_logger
|
from api.settings import stat_logger
|
||||||
from api.utils.api_utils import get_json_result, cors_reponse
|
from api.utils.api_utils import get_json_result, cors_reponse
|
||||||
@ -220,7 +220,9 @@ def user_register(user_id, user):
|
|||||||
"llm_factory": LLM_FACTORY,
|
"llm_factory": LLM_FACTORY,
|
||||||
"llm_name": llm.llm_name,
|
"llm_name": llm.llm_name,
|
||||||
"model_type": llm.model_type,
|
"model_type": llm.model_type,
|
||||||
"api_key": API_KEY})
|
"api_key": API_KEY,
|
||||||
|
"base_url": LLM_BASE_URL
|
||||||
|
})
|
||||||
|
|
||||||
if not UserService.save(**user):
|
if not UserService.save(**user):
|
||||||
return
|
return
|
||||||
|
@ -13,6 +13,7 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
|
import os
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
@ -21,7 +22,7 @@ from api.db.db_models import init_database_tables as init_web_db
|
|||||||
from api.db.services import UserService
|
from api.db.services import UserService
|
||||||
from api.db.services.llm_service import LLMFactoriesService, LLMService, TenantLLMService, LLMBundle
|
from api.db.services.llm_service import LLMFactoriesService, LLMService, TenantLLMService, LLMBundle
|
||||||
from api.db.services.user_service import TenantService, UserTenantService
|
from api.db.services.user_service import TenantService, UserTenantService
|
||||||
from api.settings import CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, LLM_FACTORY, API_KEY
|
from api.settings import CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, LLM_FACTORY, API_KEY, LLM_BASE_URL
|
||||||
|
|
||||||
|
|
||||||
def init_superuser():
|
def init_superuser():
|
||||||
@ -53,7 +54,7 @@ def init_superuser():
|
|||||||
for llm in LLMService.query(fid=LLM_FACTORY):
|
for llm in LLMService.query(fid=LLM_FACTORY):
|
||||||
tenant_llm.append(
|
tenant_llm.append(
|
||||||
{"tenant_id": user_info["id"], "llm_factory": LLM_FACTORY, "llm_name": llm.llm_name, "model_type": llm.model_type,
|
{"tenant_id": user_info["id"], "llm_factory": LLM_FACTORY, "llm_name": llm.llm_name, "model_type": llm.model_type,
|
||||||
"api_key": API_KEY})
|
"api_key": API_KEY, "base_url": LLM_BASE_URL})
|
||||||
|
|
||||||
if not UserService.save(**user_info):
|
if not UserService.save(**user_info):
|
||||||
print("\033[93m【ERROR】\033[0mcan't init admin.")
|
print("\033[93m【ERROR】\033[0mcan't init admin.")
|
||||||
@ -282,11 +283,8 @@ def init_llm_factory():
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
"""
|
"""
|
||||||
modify service_config
|
|
||||||
drop table llm;
|
drop table llm;
|
||||||
drop table llm_factories;
|
drop table llm_factories;
|
||||||
update tenant_llm set llm_factory='Tongyi-Qianwen' where llm_factory='通义千问';
|
|
||||||
update tenant_llm set llm_factory='ZHIPU-AI' where llm_factory='智谱AI';
|
|
||||||
update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
|
update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
|
||||||
alter table knowledgebase modify avatar longtext;
|
alter table knowledgebase modify avatar longtext;
|
||||||
alter table user modify avatar longtext;
|
alter table user modify avatar longtext;
|
||||||
|
@ -91,6 +91,8 @@ default_llm = {
|
|||||||
}
|
}
|
||||||
LLM = get_base_config("user_default_llm", {})
|
LLM = get_base_config("user_default_llm", {})
|
||||||
LLM_FACTORY = LLM.get("factory", "Tongyi-Qianwen")
|
LLM_FACTORY = LLM.get("factory", "Tongyi-Qianwen")
|
||||||
|
LLM_BASE_URL = LLM.get("base_url")
|
||||||
|
|
||||||
if LLM_FACTORY not in default_llm:
|
if LLM_FACTORY not in default_llm:
|
||||||
print(
|
print(
|
||||||
"\33[91m【ERROR】\33[0m:",
|
"\33[91m【ERROR】\33[0m:",
|
||||||
|
@ -1,99 +1,12 @@
|
|||||||
version: '2.2'
|
version: '2.2'
|
||||||
|
|
||||||
|
|
||||||
|
include:
|
||||||
|
- path: ./docker-compose-base.yml
|
||||||
|
env_file: ./.env
|
||||||
|
|
||||||
services:
|
services:
|
||||||
es01:
|
ragflow:
|
||||||
container_name: ragflow-es-01
|
|
||||||
image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
|
|
||||||
volumes:
|
|
||||||
- esdata01:/usr/share/elasticsearch/data
|
|
||||||
ports:
|
|
||||||
- ${ES_PORT}:9200
|
|
||||||
environment:
|
|
||||||
- node.name=es01
|
|
||||||
- cluster.name=${CLUSTER_NAME}
|
|
||||||
- cluster.initial_master_nodes=es01
|
|
||||||
- ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
|
|
||||||
- bootstrap.memory_lock=false
|
|
||||||
- xpack.security.enabled=false
|
|
||||||
- TZ=${TIMEZONE}
|
|
||||||
mem_limit: ${MEM_LIMIT}
|
|
||||||
ulimits:
|
|
||||||
memlock:
|
|
||||||
soft: -1
|
|
||||||
hard: -1
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD-SHELL", "curl http://localhost:9200"]
|
|
||||||
interval: 10s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 120
|
|
||||||
networks:
|
|
||||||
- ragflow
|
|
||||||
restart: always
|
|
||||||
|
|
||||||
kibana:
|
|
||||||
depends_on:
|
|
||||||
es01:
|
|
||||||
condition: service_healthy
|
|
||||||
image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
|
|
||||||
container_name: ragflow-kibana
|
|
||||||
volumes:
|
|
||||||
- kibanadata:/usr/share/kibana/data
|
|
||||||
ports:
|
|
||||||
- ${KIBANA_PORT}:5601
|
|
||||||
environment:
|
|
||||||
- SERVERNAME=kibana
|
|
||||||
- ELASTICSEARCH_HOSTS=http://es01:9200
|
|
||||||
- TZ=${TIMEZONE}
|
|
||||||
mem_limit: ${MEM_LIMIT}
|
|
||||||
networks:
|
|
||||||
- ragflow
|
|
||||||
|
|
||||||
mysql:
|
|
||||||
image: mysql:5.7.18
|
|
||||||
container_name: ragflow-mysql
|
|
||||||
environment:
|
|
||||||
- MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
|
|
||||||
- TZ=${TIMEZONE}
|
|
||||||
command:
|
|
||||||
--max_connections=1000
|
|
||||||
--character-set-server=utf8mb4
|
|
||||||
--collation-server=utf8mb4_general_ci
|
|
||||||
--default-authentication-plugin=mysql_native_password
|
|
||||||
--tls_version="TLSv1.2,TLSv1.3"
|
|
||||||
--init-file /data/application/init.sql
|
|
||||||
ports:
|
|
||||||
- ${MYSQL_PORT}:3306
|
|
||||||
volumes:
|
|
||||||
- mysql_data:/var/lib/mysql
|
|
||||||
- ./init.sql:/data/application/init.sql
|
|
||||||
networks:
|
|
||||||
- ragflow
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "mysqladmin" ,"ping", "-uroot", "-p${MYSQL_PASSWORD}"]
|
|
||||||
interval: 10s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 3
|
|
||||||
restart: always
|
|
||||||
|
|
||||||
|
|
||||||
minio:
|
|
||||||
image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
|
|
||||||
container_name: ragflow-minio
|
|
||||||
command: server --console-address ":9001" /data
|
|
||||||
ports:
|
|
||||||
- 9000:9000
|
|
||||||
- 9001:9001
|
|
||||||
environment:
|
|
||||||
- MINIO_ROOT_USER=${MINIO_USER}
|
|
||||||
- MINIO_ROOT_PASSWORD=${MINIO_PASSWORD}
|
|
||||||
- TZ=${TIMEZONE}
|
|
||||||
volumes:
|
|
||||||
- minio_data:/data
|
|
||||||
networks:
|
|
||||||
- ragflow
|
|
||||||
restart: always
|
|
||||||
|
|
||||||
|
|
||||||
ragflow:
|
|
||||||
depends_on:
|
depends_on:
|
||||||
mysql:
|
mysql:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
@ -116,18 +29,3 @@ services:
|
|||||||
networks:
|
networks:
|
||||||
- ragflow
|
- ragflow
|
||||||
restart: always
|
restart: always
|
||||||
|
|
||||||
|
|
||||||
volumes:
|
|
||||||
esdata01:
|
|
||||||
driver: local
|
|
||||||
kibanadata:
|
|
||||||
driver: local
|
|
||||||
mysql_data:
|
|
||||||
driver: local
|
|
||||||
minio_data:
|
|
||||||
driver: local
|
|
||||||
|
|
||||||
networks:
|
|
||||||
ragflow:
|
|
||||||
driver: bridge
|
|
||||||
|
110
docker/docker-compose-base.yml
Normal file
110
docker/docker-compose-base.yml
Normal file
@ -0,0 +1,110 @@
|
|||||||
|
version: '2.2'
|
||||||
|
|
||||||
|
services:
|
||||||
|
es01:
|
||||||
|
container_name: ragflow-es-01
|
||||||
|
image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
|
||||||
|
volumes:
|
||||||
|
- esdata01:/usr/share/elasticsearch/data
|
||||||
|
ports:
|
||||||
|
- ${ES_PORT}:9200
|
||||||
|
environment:
|
||||||
|
- node.name=es01
|
||||||
|
- cluster.name=${CLUSTER_NAME}
|
||||||
|
- cluster.initial_master_nodes=es01
|
||||||
|
- ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
|
||||||
|
- bootstrap.memory_lock=false
|
||||||
|
- xpack.security.enabled=false
|
||||||
|
- cluster.max_shards_per_node=4096
|
||||||
|
- TZ=${TIMEZONE}
|
||||||
|
mem_limit: ${MEM_LIMIT}
|
||||||
|
ulimits:
|
||||||
|
memlock:
|
||||||
|
soft: -1
|
||||||
|
hard: -1
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "curl http://localhost:9200"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 120
|
||||||
|
networks:
|
||||||
|
- ragflow
|
||||||
|
restart: always
|
||||||
|
|
||||||
|
kibana:
|
||||||
|
depends_on:
|
||||||
|
es01:
|
||||||
|
condition: service_healthy
|
||||||
|
image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
|
||||||
|
container_name: ragflow-kibana
|
||||||
|
volumes:
|
||||||
|
- kibanadata:/usr/share/kibana/data
|
||||||
|
ports:
|
||||||
|
- ${KIBANA_PORT}:5601
|
||||||
|
environment:
|
||||||
|
- SERVERNAME=kibana
|
||||||
|
- ELASTICSEARCH_HOSTS=http://es01:9200
|
||||||
|
- TZ=${TIMEZONE}
|
||||||
|
mem_limit: ${MEM_LIMIT}
|
||||||
|
networks:
|
||||||
|
- ragflow
|
||||||
|
|
||||||
|
mysql:
|
||||||
|
image: mysql:5.7.18
|
||||||
|
container_name: ragflow-mysql
|
||||||
|
environment:
|
||||||
|
- MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
|
||||||
|
- TZ=${TIMEZONE}
|
||||||
|
command:
|
||||||
|
--max_connections=1000
|
||||||
|
--character-set-server=utf8mb4
|
||||||
|
--collation-server=utf8mb4_general_ci
|
||||||
|
--default-authentication-plugin=mysql_native_password
|
||||||
|
--tls_version="TLSv1.2,TLSv1.3"
|
||||||
|
--init-file /data/application/init.sql
|
||||||
|
ports:
|
||||||
|
- ${MYSQL_PORT}:3306
|
||||||
|
volumes:
|
||||||
|
- mysql_data:/var/lib/mysql
|
||||||
|
- ./init.sql:/data/application/init.sql
|
||||||
|
networks:
|
||||||
|
- ragflow
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "mysqladmin" ,"ping", "-uroot", "-p${MYSQL_PASSWORD}"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
restart: always
|
||||||
|
|
||||||
|
|
||||||
|
minio:
|
||||||
|
image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
|
||||||
|
container_name: ragflow-minio
|
||||||
|
command: server --console-address ":9001" /data
|
||||||
|
ports:
|
||||||
|
- 9000:9000
|
||||||
|
- 9001:9001
|
||||||
|
environment:
|
||||||
|
- MINIO_ROOT_USER=${MINIO_USER}
|
||||||
|
- MINIO_ROOT_PASSWORD=${MINIO_PASSWORD}
|
||||||
|
- TZ=${TIMEZONE}
|
||||||
|
volumes:
|
||||||
|
- minio_data:/data
|
||||||
|
networks:
|
||||||
|
- ragflow
|
||||||
|
restart: always
|
||||||
|
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
esdata01:
|
||||||
|
driver: local
|
||||||
|
kibanadata:
|
||||||
|
driver: local
|
||||||
|
mysql_data:
|
||||||
|
driver: local
|
||||||
|
minio_data:
|
||||||
|
driver: local
|
||||||
|
|
||||||
|
networks:
|
||||||
|
ragflow:
|
||||||
|
driver: bridge
|
@ -1,98 +1,10 @@
|
|||||||
version: '2.2'
|
version: '2.2'
|
||||||
|
|
||||||
|
include:
|
||||||
|
- path: ./docker-compose-base.yml
|
||||||
|
env_file: ./.env
|
||||||
|
|
||||||
services:
|
services:
|
||||||
es01:
|
|
||||||
container_name: ragflow-es-01
|
|
||||||
image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
|
|
||||||
volumes:
|
|
||||||
- esdata01:/usr/share/elasticsearch/data
|
|
||||||
ports:
|
|
||||||
- ${ES_PORT}:9200
|
|
||||||
environment:
|
|
||||||
- node.name=es01
|
|
||||||
- cluster.name=${CLUSTER_NAME}
|
|
||||||
- cluster.initial_master_nodes=es01
|
|
||||||
- ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
|
|
||||||
- bootstrap.memory_lock=false
|
|
||||||
- xpack.security.enabled=false
|
|
||||||
- TZ=${TIMEZONE}
|
|
||||||
mem_limit: ${MEM_LIMIT}
|
|
||||||
ulimits:
|
|
||||||
memlock:
|
|
||||||
soft: -1
|
|
||||||
hard: -1
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD-SHELL", "curl http://localhost:9200"]
|
|
||||||
interval: 10s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 120
|
|
||||||
networks:
|
|
||||||
- ragflow
|
|
||||||
restart: always
|
|
||||||
|
|
||||||
kibana:
|
|
||||||
depends_on:
|
|
||||||
es01:
|
|
||||||
condition: service_healthy
|
|
||||||
image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
|
|
||||||
container_name: ragflow-kibana
|
|
||||||
volumes:
|
|
||||||
- kibanadata:/usr/share/kibana/data
|
|
||||||
ports:
|
|
||||||
- ${KIBANA_PORT}:5601
|
|
||||||
environment:
|
|
||||||
- SERVERNAME=kibana
|
|
||||||
- ELASTICSEARCH_HOSTS=http://es01:9200
|
|
||||||
- TZ=${TIMEZONE}
|
|
||||||
mem_limit: ${MEM_LIMIT}
|
|
||||||
networks:
|
|
||||||
- ragflow
|
|
||||||
|
|
||||||
mysql:
|
|
||||||
image: mysql:5.7.18
|
|
||||||
container_name: ragflow-mysql
|
|
||||||
environment:
|
|
||||||
- MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
|
|
||||||
- TZ=${TIMEZONE}
|
|
||||||
command:
|
|
||||||
--max_connections=1000
|
|
||||||
--character-set-server=utf8mb4
|
|
||||||
--collation-server=utf8mb4_general_ci
|
|
||||||
--default-authentication-plugin=mysql_native_password
|
|
||||||
--tls_version="TLSv1.2,TLSv1.3"
|
|
||||||
--init-file /data/application/init.sql
|
|
||||||
ports:
|
|
||||||
- ${MYSQL_PORT}:3306
|
|
||||||
volumes:
|
|
||||||
- mysql_data:/var/lib/mysql
|
|
||||||
- ./init.sql:/data/application/init.sql
|
|
||||||
networks:
|
|
||||||
- ragflow
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "mysqladmin" ,"ping", "-uroot", "-p${MYSQL_PASSWORD}"]
|
|
||||||
interval: 10s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 3
|
|
||||||
restart: always
|
|
||||||
|
|
||||||
|
|
||||||
minio:
|
|
||||||
image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
|
|
||||||
container_name: ragflow-minio
|
|
||||||
command: server --console-address ":9001" /data
|
|
||||||
ports:
|
|
||||||
- 9000:9000
|
|
||||||
- 9001:9001
|
|
||||||
environment:
|
|
||||||
- MINIO_ROOT_USER=${MINIO_USER}
|
|
||||||
- MINIO_ROOT_PASSWORD=${MINIO_PASSWORD}
|
|
||||||
- TZ=${TIMEZONE}
|
|
||||||
volumes:
|
|
||||||
- minio_data:/data
|
|
||||||
networks:
|
|
||||||
- ragflow
|
|
||||||
restart: always
|
|
||||||
|
|
||||||
|
|
||||||
ragflow:
|
ragflow:
|
||||||
depends_on:
|
depends_on:
|
||||||
mysql:
|
mysql:
|
||||||
@ -107,6 +19,7 @@ services:
|
|||||||
- 443:443
|
- 443:443
|
||||||
volumes:
|
volumes:
|
||||||
- ./service_conf.yaml:/ragflow/conf/service_conf.yaml
|
- ./service_conf.yaml:/ragflow/conf/service_conf.yaml
|
||||||
|
- ./entrypoint.sh:/ragflow/entrypoint.sh
|
||||||
- ./ragflow-logs:/ragflow/logs
|
- ./ragflow-logs:/ragflow/logs
|
||||||
- ./nginx/ragflow.conf:/etc/nginx/conf.d/ragflow.conf
|
- ./nginx/ragflow.conf:/etc/nginx/conf.d/ragflow.conf
|
||||||
- ./nginx/proxy.conf:/etc/nginx/proxy.conf
|
- ./nginx/proxy.conf:/etc/nginx/proxy.conf
|
||||||
@ -116,18 +29,3 @@ services:
|
|||||||
networks:
|
networks:
|
||||||
- ragflow
|
- ragflow
|
||||||
restart: always
|
restart: always
|
||||||
|
|
||||||
|
|
||||||
volumes:
|
|
||||||
esdata01:
|
|
||||||
driver: local
|
|
||||||
kibanadata:
|
|
||||||
driver: local
|
|
||||||
mysql_data:
|
|
||||||
driver: local
|
|
||||||
minio_data:
|
|
||||||
driver: local
|
|
||||||
|
|
||||||
networks:
|
|
||||||
ragflow:
|
|
||||||
driver: bridge
|
|
||||||
|
@ -23,7 +23,7 @@ function watch_broker(){
|
|||||||
}
|
}
|
||||||
|
|
||||||
function task_bro(){
|
function task_bro(){
|
||||||
sleep 60;
|
sleep 160;
|
||||||
watch_broker;
|
watch_broker;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18,6 +18,7 @@ es:
|
|||||||
user_default_llm:
|
user_default_llm:
|
||||||
factory: 'Tongyi-Qianwen'
|
factory: 'Tongyi-Qianwen'
|
||||||
api_key: 'sk-xxxxxxxxxxxxx'
|
api_key: 'sk-xxxxxxxxxxxxx'
|
||||||
|
base_url: ''
|
||||||
oauth:
|
oauth:
|
||||||
github:
|
github:
|
||||||
client_id: xxxxxxxxxxxxxxxxxxxxxxxxx
|
client_id: xxxxxxxxxxxxxxxxxxxxxxxxx
|
||||||
|
@ -10,14 +10,59 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
import copy
|
from io import BytesIO
|
||||||
|
from docx import Document
|
||||||
import re
|
import re
|
||||||
from deepdoc.parser.pdf_parser import PlainParser
|
from deepdoc.parser.pdf_parser import PlainParser
|
||||||
from rag.app import laws
|
from rag.app import laws
|
||||||
from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
|
from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
|
||||||
from deepdoc.parser import PdfParser, ExcelParser
|
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
||||||
from rag.settings import cron_logger
|
from rag.settings import cron_logger
|
||||||
|
|
||||||
|
class Docx(DocxParser):
    """Extract paragraph text and HTML-rendered tables from a .docx file.

    Calling an instance returns ``(sections, tables)``:

    * ``sections`` is a list of ``(text, "")`` tuples, one per non-empty
      paragraph found on pages ``from_page`` (inclusive) to ``to_page``
      (exclusive).
    * ``tables`` is a list of ``((None, html), "")`` tuples, one per table
      in the document, where ``html`` is a minimal ``<table>`` rendering.
    """

    def __init__(self):
        # The parent initializer is not invoked here.
        # NOTE(review): confirm DocxParser needs no setup of its own.
        pass

    def __clean(self, line):
        """Replace ideographic spaces (U+3000) with ASCII spaces and strip."""
        line = re.sub(r"\u3000", " ", line).strip()
        return line

    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
        """Parse the document and return ``(sections, tables)``.

        Args:
            filename: Path handed to ``Document`` when ``binary`` is falsy.
            binary: Optional raw file content; when truthy it is wrapped in
                ``BytesIO`` and used instead of ``filename``.
            from_page: First page index (0-based) whose paragraphs are kept.
            to_page: Page index at which paragraph collection stops
                (exclusive).

        Returns:
            A tuple ``(sections, tables)`` as described on the class.
        """
        self.doc = Document(
            filename) if not binary else Document(BytesIO(binary))

        pn = 0  # current page index, advanced by page-break markers
        lines = []
        for p in self.doc.paragraphs:
            # Nothing at or beyond `to_page` is ever collected, so stop as
            # soon as that page is reached instead of scanning the rest.
            if pn >= to_page:
                break
            if pn >= from_page and p.text.strip():
                lines.append(self.__clean(p.text))
            for run in p.runs:
                xml = run._element.xml  # read once per run
                if 'lastRenderedPageBreak' in xml:
                    pn += 1
                    # Skip the explicit-break test so a run carrying both
                    # markers is counted only once.
                    continue
                if 'w:br' in xml and 'type="page"' in xml:
                    pn += 1

        tbls = []
        for tb in self.doc.tables:
            parts = ["<table>"]
            for r in tb.rows:
                cells = r.cells  # read once per row
                parts.append("<tr>")
                i = 0
                while i < len(cells):
                    text = cells[i].text
                    # Merge only *consecutive* cells with the same text into
                    # one colspan. Scanning the whole row (as before) merged
                    # non-adjacent duplicates and dropped the cells between.
                    span = 1
                    while i + span < len(cells) and cells[i + span].text == text:
                        span += 1
                    i += span
                    # NOTE(review): cell text is inserted unescaped; confirm
                    # downstream consumers tolerate '<' / '&' in cell content.
                    if span == 1:
                        parts.append(f"<td>{text}</td>")
                    else:
                        parts.append(f"<td colspan='{span}'>{text}</td>")
                parts.append("</tr>")
            parts.append("</table>")
            tbls.append(((None, "".join(parts)), ""))

        return [(l, "") for l in lines if l], tbls
|
||||||
|
|
||||||
|
|
||||||
class Pdf(PdfParser):
|
class Pdf(PdfParser):
|
||||||
def __call__(self, filename, binary=None, from_page=0,
|
def __call__(self, filename, binary=None, from_page=0,
|
||||||
@ -75,8 +120,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
sections = []
|
sections = []
|
||||||
if re.search(r"\.docx?$", filename, re.IGNORECASE):
|
if re.search(r"\.docx?$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
for txt in laws.Docx()(filename, binary):
|
sections, tbls = Docx()(filename, binary)
|
||||||
sections.append((txt, ""))
|
res = tokenize_table(tbls, doc, eng)
|
||||||
callback(0.8, "Finish parsing.")
|
callback(0.8, "Finish parsing.")
|
||||||
|
|
||||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||||
|
@ -223,8 +223,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
|
|||||||
continue
|
continue
|
||||||
if not str(row[clmns[j]]):
|
if not str(row[clmns[j]]):
|
||||||
continue
|
continue
|
||||||
if pd.isna(row[clmns[j]]):
|
#if pd.isna(row[clmns[j]]):
|
||||||
continue
|
# continue
|
||||||
fld = clmns_map[j][0]
|
fld = clmns_map[j][0]
|
||||||
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
|
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
|
||||||
row[clmns[j]])
|
row[clmns[j]])
|
||||||
|
@ -170,3 +170,4 @@ class LocalLLM(Base):
|
|||||||
return ans, num_tokens_from_string(ans)
|
return ans, num_tokens_from_string(ans)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return "**ERROR**: " + str(e), 0
|
return "**ERROR**: " + str(e), 0
|
||||||
|
|
||||||
|
@ -68,6 +68,7 @@ def bullets_category(sections):
|
|||||||
|
|
||||||
def is_english(texts):
|
def is_english(texts):
|
||||||
eng = 0
|
eng = 0
|
||||||
|
if not texts: return False
|
||||||
for t in texts:
|
for t in texts:
|
||||||
if re.match(r"[a-zA-Z]{2,}", t.strip()):
|
if re.match(r"[a-zA-Z]{2,}", t.strip()):
|
||||||
eng += 1
|
eng += 1
|
||||||
@ -112,8 +113,8 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
|
|||||||
d = copy.deepcopy(doc)
|
d = copy.deepcopy(doc)
|
||||||
tokenize(d, rows, eng)
|
tokenize(d, rows, eng)
|
||||||
d["content_with_weight"] = rows
|
d["content_with_weight"] = rows
|
||||||
d["image"] = img
|
if img: d["image"] = img
|
||||||
add_positions(d, poss)
|
if poss: add_positions(d, poss)
|
||||||
res.append(d)
|
res.append(d)
|
||||||
continue
|
continue
|
||||||
de = "; " if eng else "; "
|
de = "; " if eng else "; "
|
||||||
|
@ -46,7 +46,7 @@ class Dealer:
|
|||||||
"k": topk,
|
"k": topk,
|
||||||
"similarity": sim,
|
"similarity": sim,
|
||||||
"num_candidates": topk * 2,
|
"num_candidates": topk * 2,
|
||||||
"query_vector": qv
|
"query_vector": list(qv)
|
||||||
}
|
}
|
||||||
|
|
||||||
def search(self, req, idxnm, emb_mdl=None):
|
def search(self, req, idxnm, emb_mdl=None):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user