change callback strategy, add timezone to docker (#96)

KevinHuSh 2024-03-05 12:08:41 +08:00 committed by GitHub
parent 59d8442d0d
commit 8a57f2afd5
15 changed files with 101 additions and 53 deletions
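
The hunks below share one theme: the PDF parsers now receive the progress callback directly, OCR reports per-page progress itself, and the callback tolerates being called with a fraction, a message, or both. A minimal sketch of the contract these changes imply (the name and the printing are illustrative, not part of the commit):

    def progress_callback(prog=None, msg=""):
        # Called positionally, e.g. callback(0.8, "OCR finished"),
        # or by keyword, e.g. callback(msg="OCR is running...").
        # prog: optional fraction in [0, 1]; msg: optional status text.
        if prog is not None:
            print(f"[{prog:.0%}] {msg}")
        elif msg:
            print(msg)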

==== .gitignore ====

@@ -20,5 +20,4 @@ Cargo.lock
 *.trie
 .idea/
-.env
 .vscode/

==== next file ====

@@ -141,7 +141,7 @@ def list():
     try:
         docs, tol = DocumentService.get_by_kb_id(
             kb_id, page_number, items_per_page, orderby, desc, keywords)
-        return get_json_result(data={"total":tol, "docs": docs})
+        return get_json_result(data={"total": tol, "docs": docs})
     except Exception as e:
         return server_error_response(e)
@@ -217,7 +217,7 @@ def rm():
             return get_data_error_result(retmsg="Tenant not found!")
         ELASTICSEARCH.deleteByQuery(Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
-        DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num*-1, doc.chunk_num*-1, 0)
+        DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
         if not DocumentService.delete_by_id(req["doc_id"]):
             return get_data_error_result(
                 retmsg="Database error (Document removal)!")
@@ -241,7 +241,7 @@ def run():
                 info["chunk_num"] = 0
                 info["token_num"] = 0
             DocumentService.update_by_id(id, info)
-            #if str(req["run"]) == TaskStatus.CANCEL.value:
+            # if str(req["run"]) == TaskStatus.CANCEL.value:
             tenant_id = DocumentService.get_tenant_id(id)
             if not tenant_id:
                 return get_data_error_result(retmsg="Tenant not found!")
@@ -281,7 +281,7 @@ def rename():
 @manager.route('/get/<doc_id>', methods=['GET'])
-#@login_required
+# @login_required
 def get(doc_id):
     try:
         e, doc = DocumentService.get_by_id(doc_id)
@@ -292,8 +292,9 @@ def get(doc_id):
         ext = re.search(r"\.([^.]+)$", doc.name)
         if ext:
             if doc.type == FileType.VISUAL.value:
-                response.headers.set('Content-Type', 'image/%s'%ext.group(1))
-            else: response.headers.set('Content-Type', 'application/%s'%ext.group(1))
+                response.headers.set('Content-Type', 'image/%s' % ext.group(1))
+            else:
+                response.headers.set('Content-Type', 'application/%s' % ext.group(1))
         return response
     except Exception as e:
         return server_error_response(e)
@@ -314,11 +315,14 @@ def change_parser():
         if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
             return get_data_error_result(retmsg="Not supported yet!")

-        e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress":0, "progress_msg": "", "run": "0"})
+        e = DocumentService.update_by_id(doc.id,
+                                         {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0",
+                                          "token_num": 0, "chunk_num": 0, "process_duation": 0})
         if not e:
             return get_data_error_result(retmsg="Document not found!")
-        if doc.token_num>0:
-            e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num*-1, doc.chunk_num*-1, doc.process_duation*-1)
+        if doc.token_num > 0:
+            e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
+                                                    doc.process_duation * -1)
             if not e:
                 return get_data_error_result(retmsg="Document not found!")
         tenant_id = DocumentService.get_tenant_id(req["doc_id"])
@@ -332,7 +336,7 @@ def change_parser():
 @manager.route('/image/<image_id>', methods=['GET'])
-#@login_required
+# @login_required
 def get_image(image_id):
     try:
         bkt, nm = image_id.split("-")
@@ -341,4 +345,3 @@ def get_image(image_id):
         return response
     except Exception as e:
         return server_error_response(e)
-

==== next file ====

@@ -348,6 +348,9 @@ class HuParser:
             if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
                 bxs.pop(i)
                 continue
+            if not b["text"].strip():
+                bxs.pop(i)
+                continue
             concatting_feats = [
                 b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                 len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
@@ -856,7 +859,7 @@ class HuParser:
         pdf = fitz.open(fnm) if not binary else fitz.open(stream=fnm, filetype="pdf")
         return len(pdf)

-    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299):
+    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
         self.lefted_chars = []
         self.mean_height = []
         self.mean_width = []
@@ -917,6 +920,7 @@ class HuParser:
             # self.page_cum_height.append(
             #     np.max([c["bottom"] for c in chars]))
             self.__ocr(i + 1, img, chars, zoomin)
+            if callback: callback(prog=(i+1)*0.6/len(self.page_images), msg="")

         if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
             bxes = [b for bxs in self.boxes for b in bxs]
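
With the hunk above, __images__ reports OCR progress itself, scaling page completion into the first 60% of the progress bar. A toy illustration of that scaling (the page count is an example):

    page_images = list(range(10))                # pretend 10 rendered pages
    for i, img in enumerate(page_images):
        prog = (i + 1) * 0.6 / len(page_images)  # OCR spans 0.0 to 0.6
        # the later callbacks in this commit (layout ~0.63-0.67, tables
        # ~0.65-0.68, merging, extraction ~0.8) fill the remaining range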

==== next file ====

@@ -16,11 +16,13 @@ MEM_LIMIT=4073741824
 MYSQL_PASSWORD=infini_rag_flow
 MYSQL_PORT=5455

-MINIO_USER=rag_flow
+MINIO_USER=infiniflow
 MINIO_PASSWORD=infini_rag_flow

 SVR_HTTP_PORT=9380

+TIMEZONE='Asia/Shanghai'
+
 ######## OS setup for ES ###########
 # sysctl vm.max_map_count
 # sudo sysctl -w vm.max_map_count=262144

==== next file ====

@@ -14,6 +14,7 @@ services:
       - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
       - bootstrap.memory_lock=false
       - xpack.security.enabled=false
+      - TZ=${TIMEZONE}
     mem_limit: ${MEM_LIMIT}
     ulimits:
       memlock:
@@ -41,6 +42,7 @@ services:
     environment:
       - SERVERNAME=kibana
       - ELASTICSEARCH_HOSTS=http://es01:9200
+      - TZ=${TIMEZONE}
     mem_limit: ${MEM_LIMIT}
     networks:
       - ragflow
@@ -50,7 +52,7 @@ services:
     container_name: ragflow-mysql
     environment:
       - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
-      - TZ="Asia/Shanghai"
+      - TZ=${TIMEZONE}
     command:
       --max_connections=1000
       --character-set-server=utf8mb4
@@ -83,6 +85,7 @@ services:
     environment:
       - MINIO_ROOT_USER=${MINIO_USER}
       - MINIO_ROOT_PASSWORD=${MINIO_PASSWORD}
+      - TZ=${TIMEZONE}
     volumes:
       - minio_data:/data
     networks:
@@ -108,6 +111,8 @@ services:
       - ./nginx/ragflow.conf:/etc/nginx/conf.d/ragflow.conf
       - ./nginx/proxy.conf:/etc/nginx/proxy.conf
       - ./nginx/nginx.conf:/etc/nginx/nginx.conf
+    environment:
+      - TZ=${TIMEZONE}
     networks:
       - ragflow
     restart: always
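
Every service now inherits TZ from the single TIMEZONE entry in the .env file above, so database, search, storage, and server logs all use the same zone. A hedged way to confirm it from Python inside a container (POSIX only; the zone value shown is just the .env default):

    import os
    import time

    os.environ.setdefault("TZ", "Asia/Shanghai")  # normally injected by compose
    time.tzset()                                  # re-read TZ (Unix only)
    print(time.strftime("%Y-%m-%d %H:%M:%S %Z"))  # local, not UTC, timestamps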

==== next file ====

@@ -26,26 +26,27 @@ class Pdf(PdfParser):
                         filename if not binary else binary,
                         zoomin,
                         from_page,
-                        to_page)
-        callback(0.1, "OCR finished")
+                        to_page,
+                        callback)
+        callback("OCR finished")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.47, "Layout analysis finished")
+        callback(0.67, "Layout analysis finished")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()
-        self._concat_downward(concat_between_pages=False)
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+        self._naive_vertical_merge()
         self._filter_forpages()
         self._merge_with_same_bullet()
         callback(0.75, "Text merging finished.")
-        tbls = self._extract_table_figure(True, zoomin, True, True)

         callback(0.8, "Text extraction finished")

-        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls, tbl_poss
+        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls
@@ -92,7 +93,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     bull = bullets_category([t for t in random_choices([t for t,_ in sections], k=100)])
     if bull >= 0: cks = hierarchical_merge(bull, sections, 3)
     else:
-        sections = [s.split("@") for s in sections]
+        sections = [s.split("@") for s,_ in sections]
         sections = [(pr[0], "@"+pr[1]) for pr in sections if len(pr)==2]
         cks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?"))
@@ -116,6 +117,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
 if __name__ == "__main__":
     import sys

-    def dummy(a, b):
+    def dummy(prog=None, msg=""):
         pass
     chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)

==== next file ====

@@ -54,13 +54,15 @@ class Pdf(PdfParser):
                         filename if not binary else binary,
                         zoomin,
                         from_page,
-                        to_page)
-        callback(0.1, "OCR finished")
+                        to_page,
+                        callback
+        )
+        callback("OCR finished")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.77, "Layout analysis finished")
+        callback(0.67, "Layout analysis finished")
         cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
         self._naive_vertical_merge()

==== next file ====

@@ -19,20 +19,22 @@ class Pdf(PdfParser):
                         filename if not binary else binary,
                         zoomin,
                         from_page,
-                        to_page)
-        callback(0.2, "OCR finished.")
+                        to_page,
+                        callback
+        )
+        callback("OCR finished.")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.5, "Layout analysis finished.")
+        callback(0.65, "Layout analysis finished.")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
-        callback(0.7, "Table analysis finished.")
+        callback(0.67, "Table analysis finished.")
         self._text_merge()
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
-        callback(0.77, "Text merging finished")
+        callback(0.68, "Text merging finished")
         tbls = self._extract_table_figure(True, zoomin, True, True)

         # clean mess

==== next file ====

@@ -26,24 +26,24 @@ class Pdf(PdfParser):
                         filename if not binary else binary,
                         zoomin,
                         from_page,
-                        to_page)
-        callback(0.1, "OCR finished")
+                        to_page,
+                        callback
+        )
+        callback("OCR finished")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.5, "Layout analysis finished.")
+        callback(0.63, "Layout analysis finished.")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
-        callback(0.7, "Table analysis finished.")
+        callback(0.65, "Table analysis finished.")
         self._text_merge()
-        self._concat_downward(concat_between_pages=False)
-        self._filter_forpages()
-        callback(0.77, "Text merging finished")
+        callback(0.67, "Text merging finished")
         tbls = self._extract_table_figure(True, zoomin, True, True)
-        self._naive_vertical_merge()

         cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
+        #self._naive_vertical_merge()
         return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls

==== next file ====

@@ -33,13 +33,15 @@ class Pdf(PdfParser):
                         filename if not binary else binary,
                         zoomin,
                         from_page,
-                        to_page)
-        callback(0.2, "OCR finished.")
+                        to_page,
+                        callback
+        )
+        callback("OCR finished.")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.47, "Layout analysis finished")
+        callback(0.63, "Layout analysis finished")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")

==== next file ====

@@ -49,7 +49,7 @@ class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
         callback(msg="OCR is running...")
-        self.__images__(filename if not binary else binary, zoomin, from_page, to_page)
+        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
         callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
         assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
         res = []

==== next file ====

@@ -56,6 +56,7 @@ class HuEmbedding(Base):
     def encode(self, texts: list, batch_size=32):
+        texts = [t[:2000] for t in texts]
         token_count = 0
         for t in texts: token_count += num_tokens_from_string(t)
         res = []

==== next file ====

@@ -114,6 +114,7 @@ def add_positions(d, poss):
         d["page_num_int"].append(pn+1)
         d["top_int"].append(top)
         d["position_int"].append((pn+1, left, right, top, bottom))
+    d["top_int"] = d["top_int"][:1]


 def remove_contents_table(sections, eng=False):
@@ -172,7 +173,7 @@ def hierarchical_merge(bull, sections, depth):
     def not_title(txt):
         if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt): return False
-        if len(txt) >= 128: return True
+        if len(txt.split(" "))>12 or (txt.find(" ")<0 and len(txt)) >= 32: return True
         return re.search(r"[,;,。;!!]", txt)

     for i, (txt, layout) in enumerate(sections):
@@ -181,12 +182,12 @@ def hierarchical_merge(bull, sections, depth):
                 levels[j].append(i)
                 break
         else:
-            if re.search(r"(title|head)", layout):
+            if re.search(r"(title|head)", layout) and not not_title(txt):
                 levels[bullets_size].append(i)
             else:
                 levels[bullets_size + 1].append(i)

     sections = [t for t, _ in sections]
-    for s in sections: print("--", s)
+    #for s in sections: print("--", s)

     def binary_search(arr, target):
         if not arr: return -1
@@ -220,11 +221,29 @@ def hierarchical_merge(bull, sections, depth):
             if jj > cks[-1][-1]: cks[-1].pop(-1)
             cks[-1].append(levels[ii][jj])
         for ii in cks[-1]: readed[ii] = True

+    if not cks:return cks
+
     for i in range(len(cks)):
         cks[i] = [sections[j] for j in cks[i][::-1]]
         print("--------------\n", "\n* ".join(cks[i]))

-    return cks
+    res = [[]]
+    num = [0]
+    for ck in cks:
+        if len(ck) == 1:
+            n = num_tokens_from_string(re.sub(r"@@[0-9]+.*", "", ck[0]))
+            if n + num[-1] < 218:
+                res[-1].append(ck[0])
+                num[-1] += n
+                continue
+            res.append(ck)
+            num.append(n)
+            continue
+        res.append(ck)
+        num.append(218)
+
+    return res


 def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
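
The new tail of hierarchical_merge folds consecutive single-section chunks together until they approach a 218-token budget, instead of returning them as fragments. A self-contained toy run of that folding logic (num_tokens_from_string is stubbed with a word count for illustration):

    import re

    def num_tokens(s):                  # stand-in for num_tokens_from_string
        return len(s.split())

    cks = [["Chapter 1"], ["Scope"], ["a", "multi-section", "chunk"]]
    res, num = [[]], [0]
    for ck in cks:
        if len(ck) == 1:
            n = num_tokens(re.sub(r"@@[0-9]+.*", "", ck[0]))
            if n + num[-1] < 218:       # small enough: fold into current chunk
                res[-1].append(ck[0])
                num[-1] += n
                continue
            res.append(ck)              # single but large: keep as own chunk
            num.append(n)
            continue
        res.append(ck)                  # multi-section chunks pass through
        num.append(218)
    print(res)  # [['Chapter 1', 'Scope'], ['a', 'multi-section', 'chunk']]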

==== next file ====

@@ -46,7 +46,7 @@ def collect(tm):
 def set_dispatching(docid):
     try:
         DocumentService.update_by_id(
-            docid, {"progress": random.randint(0, 3) / 100.,
+            docid, {"progress": random.random()*1 / 100.,
                     "progress_msg": "Task dispatched...",
                     "process_begin_at": get_format_time()
                     })

==== next file ====

@@ -72,6 +72,7 @@ def set_progress(task_id, from_page=0, to_page=-1,
         prog = -1

     if to_page > 0:
-        msg = f"Page({from_page}~{to_page}): " + msg
+        if msg:
+            msg = f"Page({from_page}~{to_page}): " + msg
     d = {"progress_msg": msg}
     if prog is not None:
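
The added guard matters for the keep-alive calls introduced elsewhere in this commit, such as callback(msg=""): without it, empty messages would still get a page prefix and clutter the progress log. Equivalent standalone behavior, for reference:

    def prefix(msg, from_page, to_page):
        if to_page > 0 and msg:   # skip empty keep-alive messages
            msg = f"Page({from_page}~{to_page}): " + msg
        return msg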
@@ -168,7 +169,7 @@ def init_kb(row):
         open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))

-def embedding(docs, mdl, parser_config={}):
+def embedding(docs, mdl, parser_config={}, callback=None):
     tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
         d["content_with_weight"] for d in docs]
     tk_count = 0
@@ -176,8 +177,14 @@ def embedding(docs, mdl, parser_config={}, callback=None):
         tts, c = mdl.encode(tts)
         tk_count += c

-    cnts, c = mdl.encode(cnts)
-    tk_count += c
+    cnts_ = []
+    for i in range(0, len(cnts), 32):
+        vts, c = mdl.encode(cnts[i: i+32])
+        cnts_.extend(vts)
+        tk_count += c
+        callback(msg="")
+    cnts = cnts_
+
     title_w = float(parser_config.get("filename_embd_weight", 0.1))
     vects = (title_w * tts + (1 - title_w) *
              cnts) if len(tts) == len(cnts) else cnts
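
The loop above caps each embedding request at 32 chunks and pings the callback once per batch, so long documents both stay under model batch limits and keep reporting progress. A self-contained sketch of the same pattern (encode is stubbed; the real model call returns vectors plus a token count):

    def encode(batch):                     # stand-in for mdl.encode
        return [[0.0] * 4 for _ in batch], len(batch)

    cnts = [f"chunk {i}" for i in range(100)]
    cnts_, tk_count = [], 0
    for i in range(0, len(cnts), 32):
        vts, c = encode(cnts[i: i + 32])   # at most 32 texts per request
        cnts_.extend(vts)
        tk_count += c                      # a keep-alive ping would go here
    print(len(cnts_), tk_count)            # 100 100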
@@ -218,10 +225,11 @@ def main(comm, mod):
         # TODO: exception handler
         ## set_progress(r["did"], -1, "ERROR: ")
         try:
-            tk_count = embedding(cks, embd_mdl, r["parser_config"])
+            tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
         except Exception as e:
             callback(-1, "Embedding error:{}".format(str(e)))
             cron_logger.error(str(e))
+            tk_count = 0

         callback(msg="Finished embedding! Start to build index!")
         init_kb(r)