Feat: add data source to pipeline logs (#11075)

### What problem does this PR solve?

#10953

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Kevin Hu, 2025-11-07 11:43:59 +08:00 (committed by GitHub)
commit 34283d4db4, parent 5629fbd2ca
7 changed files with 45 additions and 45 deletions


@@ -19,7 +19,7 @@ from flask import request
 from flask_login import login_required, current_user
 from api.db import InputType
-from api.db.services.connector_service import ConnectorService, Connector2KbService, SyncLogsService
+from api.db.services.connector_service import ConnectorService, SyncLogsService
 from api.utils.api_utils import get_json_result, validate_request, get_data_error_result
 from common.misc_utils import get_uuid
 from common.constants import RetCode, TaskStatus
@@ -88,14 +88,14 @@ def resume(connector_id):
     return get_json_result(data=True)

-@manager.route("/<connector_id>/link", methods=["POST"])  # noqa: F821
-@validate_request("kb_ids")
+@manager.route("/<connector_id>/rebuild", methods=["PUT"])  # noqa: F821
 @login_required
-def link_kb(connector_id):
+@validate_request("kb_id")
+def rebuild(connector_id):
     req = request.json
-    errors = Connector2KbService.link_kb(connector_id, req["kb_ids"], current_user.id)
-    if errors:
-        return get_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR)
+    err = ConnectorService.rebuild(connector_id, req["kb_id"], current_user.id)
+    if err:
+        return get_json_result(data=False, message=err, code=RetCode.SERVER_ERROR)
     return get_json_result(data=True)
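The /link route is replaced by a PUT /rebuild route that takes a single kb_id. A minimal client sketch of the new call, assuming the blueprint is mounted under /v1/connector on the default host and that the caller already holds a valid auth token (the host, ids, and token below are placeholders, not taken from this diff):

```python
import requests

BASE = "http://127.0.0.1:9380/v1/connector"    # assumed mount point
CONNECTOR_ID = "<connector-id>"                # hypothetical id
HEADERS = {"Authorization": "Bearer <token>"}  # assumed auth scheme

# Rebuild clears this connector's sync logs for the kb and deletes the
# documents it previously imported, so the next sync starts clean.
resp = requests.put(
    f"{BASE}/{CONNECTOR_ID}/rebuild",
    json={"kb_id": "<kb-id>"},
    headers=HEADERS,
    timeout=30,
)
print(resp.json())  # get_json_result wraps the outcome, e.g. {"code": 0, "data": true}
```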


@@ -260,6 +260,8 @@ def list_docs():
         for doc_item in docs:
             if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
                 doc_item["thumbnail"] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"
+            if doc_item.get("source_type"):
+                doc_item["source_type"] = doc_item["source_type"].split("/")[0]
         return get_json_result(data={"total": tol, "docs": docs})
     except Exception as e:
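Connector-imported documents record their origin in source_type as "<source>/<connector_id>" (see the f"{conn.source}/{conn.id}" queries elsewhere in this commit); the endpoint now trims off the connector id so clients only see the source name. A quick illustration, with a made-up id:

```python
# source_type as persisted on a connector-imported document row
source_type = "notion/7f3a9c2e0b614d2c8a51d0e4f6b71234"  # id is made up

# what list_docs now returns to the frontend
print(source_type.split("/")[0])  # -> "notion"
```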


@@ -1064,6 +1064,7 @@ class Connector2Kb(DataBaseModel):
     id = CharField(max_length=32, primary_key=True)
     connector_id = CharField(max_length=32, null=False, index=True)
     kb_id = CharField(max_length=32, null=False, index=True)
+    auto_parse = CharField(max_length=1, null=False, default="1", index=False)

     class Meta:
         db_table = "connector2kb"
@@ -1282,4 +1283,8 @@ def migrate_db():
         migrate(migrator.add_column("tenant_llm", "status", CharField(max_length=1, null=False, help_text="is it validate(0: wasted, 1: validate)", default="1", index=True)))
     except Exception:
         pass
+    try:
+        migrate(migrator.add_column("connector2kb", "auto_parse", CharField(max_length=1, null=False, default="1", index=False)))
+    except Exception:
+        pass
     logging.disable(logging.NOTSET)
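Since auto_parse is a one-character CharField holding "0" or "1" rather than a boolean, plain truthiness checks are unsafe: bool("0") is True in Python. That is why duplicate_and_parse below tests both `not auto_parse` and `auto_parse == "0"`. A tiny sketch of the same rule (the helper is mine, not part of this commit):

```python
def auto_parse_enabled(flag) -> bool:
    # Accept the stored strings "0"/"1" as well as booleans/None.
    # A bare `if flag:` would treat the string "0" as enabled.
    return flag not in (None, False, "", "0")

assert auto_parse_enabled("1")
assert not auto_parse_enabled("0")
assert not auto_parse_enabled(None)
```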


@@ -54,7 +54,6 @@ class ConnectorService(CommonService):
             SyncLogsService.update_by_id(task["id"], task)
         ConnectorService.update_by_id(connector_id, {"status": status})

-
     @classmethod
     def list(cls, tenant_id):
         fields = [
@@ -67,6 +66,15 @@ class ConnectorService(CommonService):
             cls.model.tenant_id == tenant_id
         ).dicts())

+    @classmethod
+    def rebuild(cls, connector_id: str, kb_id: str, tenant_id: str):
+        e, conn = cls.get_by_id(connector_id)
+        if not e:
+            return
+        SyncLogsService.filter_delete([SyncLogs.connector_id == connector_id, SyncLogs.kb_id == kb_id])
+        docs = DocumentService.query(source_type=f"{conn.source}/{conn.id}")
+        return FileService.delete_docs([d.id for d in docs], tenant_id)
+

 class SyncLogsService(CommonService):
     model = SyncLogs
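Taken together, rebuild is the destructive path: it clears the connector's sync logs for the target knowledge base and deletes every document whose source_type points at this connector, so the next scheduled sync re-imports everything from scratch.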
@@ -91,6 +99,7 @@ class SyncLogsService(CommonService):
             Connector.timeout_secs,
             Knowledgebase.name.alias("kb_name"),
             Knowledgebase.avatar.alias("kb_avatar"),
+            Connector2Kb.auto_parse,
             cls.model.from_beginning.alias("reindex"),
             cls.model.status
         ]
@@ -179,7 +188,7 @@ class SyncLogsService(CommonService):
             .where(cls.model.id == id).execute()

     @classmethod
-    def duplicate_and_parse(cls, kb, docs, tenant_id, src):
+    def duplicate_and_parse(cls, kb, docs, tenant_id, src, auto_parse=True):
         if not docs:
             return None
@@ -191,14 +200,17 @@ class SyncLogsService(CommonService):
                 return self.blob

         errs = []
-        files = [FileObj(filename=d["semantic_identifier"]+f".{d['extension']}", blob=d["blob"]) for d in docs]
+        files = [FileObj(filename=d["semantic_identifier"]+(f"{d['extension']}" if d["semantic_identifier"][::-1].find(d['extension'][::-1])<0 else ""), blob=d["blob"]) for d in docs]
         doc_ids = []
         err, doc_blob_pairs = FileService.upload_document(kb, files, tenant_id, src)
         errs.extend(err)
         kb_table_num_map = {}
         for doc, _ in doc_blob_pairs:
-            DocumentService.run(tenant_id, doc, kb_table_num_map)
             doc_ids.append(doc["id"])
+            if not auto_parse or auto_parse == "0":
+                continue
+            DocumentService.run(tenant_id, doc, kb_table_num_map)
         return errs, doc_ids
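The new filename expression avoids doubling the extension when semantic_identifier already carries it, but the double-reversed find() is hard to read. An equivalent of the intended check, as I read it (assuming d['extension'] now includes its leading dot, which the dropped f".{...}" dot suggests):

```python
def build_filename(semantic_identifier: str, extension: str) -> str:
    # Append the extension only when the name does not already end with
    # it. Strictly, the committed reversed-find() skips the append when
    # the extension occurs anywhere in the name; ends-with is the usual
    # case it guards against.
    if semantic_identifier.endswith(extension):
        return semantic_identifier
    return semantic_identifier + extension

assert build_filename("minutes.docx", ".docx") == "minutes.docx"
assert build_filename("minutes", ".docx") == "minutes.docx"
```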
@@ -213,33 +225,6 @@
 class Connector2KbService(CommonService):
     model = Connector2Kb

-    @classmethod
-    def link_kb(cls, conn_id: str, kb_ids: list[str], tenant_id: str):
-        arr = cls.query(connector_id=conn_id)
-        old_kb_ids = [a.kb_id for a in arr]
-        for kb_id in kb_ids:
-            if kb_id in old_kb_ids:
-                continue
-            cls.save(**{
-                "id": get_uuid(),
-                "connector_id": conn_id,
-                "kb_id": kb_id
-            })
-            SyncLogsService.schedule(conn_id, kb_id, reindex=True)
-        errs = []
-        e, conn = ConnectorService.get_by_id(conn_id)
-        for kb_id in old_kb_ids:
-            if kb_id in kb_ids:
-                continue
-            cls.filter_delete([cls.model.kb_id==kb_id, cls.model.connector_id==conn_id])
-            SyncLogsService.filter_update([SyncLogs.connector_id==conn_id, SyncLogs.kb_id==kb_id, SyncLogs.status==TaskStatus.SCHEDULE], {"status": TaskStatus.CANCEL})
-            docs = DocumentService.query(source_type=f"{conn.source}/{conn.id}")
-            err = FileService.delete_docs([d.id for d in docs], tenant_id)
-            if err:
-                errs.append(err)
-        return "\n".join(errs)
-
     @classmethod
     def link_connectors(cls, kb_id: str, connector_ids: list[str], tenant_id: str):
         arr = cls.query(kb_id=kb_id)
@@ -260,11 +245,15 @@ class Connector2KbService(CommonService):
                 continue
             cls.filter_delete([cls.model.kb_id==kb_id, cls.model.connector_id==conn_id])
             e, conn = ConnectorService.get_by_id(conn_id)
-            SyncLogsService.filter_update([SyncLogs.connector_id==conn_id, SyncLogs.kb_id==kb_id, SyncLogs.status==TaskStatus.SCHEDULE], {"status": TaskStatus.CANCEL})
-            docs = DocumentService.query(source_type=f"{conn.source}/{conn.id}")
-            err = FileService.delete_docs([d.id for d in docs], tenant_id)
-            if err:
-                errs.append(err)
+            if not e:
+                continue
+            #SyncLogsService.filter_delete([SyncLogs.connector_id==conn_id, SyncLogs.kb_id==kb_id])
+            # Do not delete docs while unlinking.
+            SyncLogsService.filter_update([SyncLogs.connector_id==conn_id, SyncLogs.kb_id==kb_id, SyncLogs.status.in_([TaskStatus.SCHEDULE, TaskStatus.RUNNING])], {"status": TaskStatus.CANCEL})
+            #docs = DocumentService.query(source_type=f"{conn.source}/{conn.id}")
+            #err = FileService.delete_docs([d.id for d in docs], tenant_id)
+            #if err:
+            #    errs.append(err)
         return "\n".join(errs)

     @classmethod
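Note the semantic change when unlinking: scheduled and running sync tasks are cancelled, but documents already imported from the connector are kept (the deletion block is deliberately commented out). Removing them is now the job of the explicit rebuild endpoint above.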
@@ -282,3 +271,5 @@ class Connector2KbService(CommonService):
             ).dicts()
         )


@@ -159,7 +159,7 @@ class PipelineOperationLogService(CommonService):
                 document_name=document.name,
                 document_suffix=document.suffix,
                 document_type=document.type,
-                source_from="",  # TODO: add in the future
+                source_from=document.source_type.split("/")[0],
                 progress=document.progress,
                 progress_msg=document.progress_msg,
                 process_begin_at=document.process_begin_at,
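This is the change the PR title describes: pipeline operation logs now record where a document came from instead of the empty placeholder. The expression degrades gracefully for local uploads, assuming source_type defaults to an empty string rather than None:

```python
# Behavior of the new source_from expression:
assert "notion/abc123".split("/")[0] == "notion"  # connector-imported document
assert "".split("/")[0] == ""                     # local upload stays blank
# A None source_type would raise AttributeError, so a non-null default is assumed.
```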


@@ -253,6 +253,8 @@ class NotionConnector(LoadConnector, PollConnector):
         all_child_page_ids: list[str] = []
         for page in pages:
+            if isinstance(page, dict):
+                page = NotionPage(**page)
             if page.id in self.indexed_pages:
                 logging.debug(f"Already indexed page with ID '{page.id}'. Skipping.")
                 continue
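The added guard coerces raw dict entries into NotionPage objects before the page.id attribute access below; presumably some code paths now yield pages as plain deserialized JSON rather than NotionPage instances, which would otherwise raise AttributeError here.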


@@ -78,7 +78,7 @@ class SyncBase:
             } for doc in document_batch]

             e, kb = KnowledgebaseService.get_by_id(task["kb_id"])
-            err, dids = SyncLogsService.duplicate_and_parse(kb, docs, task["tenant_id"], f"{self.SOURCE_NAME}/{task['connector_id']}")
+            err, dids = SyncLogsService.duplicate_and_parse(kb, docs, task["tenant_id"], f"{self.SOURCE_NAME}/{task['connector_id']}", task["auto_parse"])
             SyncLogsService.increase_docs(task["id"], min_update, max_update, len(docs), "\n".join(err), len(err))
             doc_num += len(docs)
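End to end: Connector2Kb.auto_parse is selected into each sync task (via the fields list added above), handed to duplicate_and_parse by SyncBase, and consulted per document, so a link created with auto_parse="0" still uploads files from the data source but skips kicking off parsing.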