refine citation (#161)

This commit is contained in:
KevinHuSh 2024-03-28 11:45:50 +08:00 committed by GitHub
parent 37cc673098
commit f3477202fe
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 21 additions and 17 deletions

View File

@@ -194,7 +194,8 @@ def chat(dialog, messages, **kwargs):
# try to use sql if field mapping is good to go # try to use sql if field mapping is good to go
if field_map: if field_map:
chat_logger.info("Use SQL to retrieval:{}".format(questions[-1])) chat_logger.info("Use SQL to retrieval:{}".format(questions[-1]))
return use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl) ans = use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl)
if ans: return ans
prompt_config = dialog.prompt_config prompt_config = dialog.prompt_config
for p in prompt_config["parameters"]: for p in prompt_config["parameters"]:
@@ -305,7 +306,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
tbl, sql = get_table() tbl, sql = get_table()
if tbl is None: if tbl is None:
return None, None return None
if tbl.get("error") and tried_times <= 2: if tbl.get("error") and tried_times <= 2:
user_promt = """ user_promt = """
表名{} 表名{}
@@ -333,7 +334,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
chat_logger.info("GET table: {}".format(tbl)) chat_logger.info("GET table: {}".format(tbl))
print(tbl) print(tbl)
if tbl.get("error") or len(tbl["rows"]) == 0: if tbl.get("error") or len(tbl["rows"]) == 0:
return None, None return None
docid_idx = set([ii for ii, c in enumerate( docid_idx = set([ii for ii, c in enumerate(
tbl["columns"]) if c["name"] == "doc_id"]) tbl["columns"]) if c["name"] == "doc_id"])

View File

@@ -120,7 +120,7 @@ class Pdf(PdfParser):
print(tbls) print(tbls)
return { return {
"title": title if title else filename, "title": title,
"authors": " ".join(authors), "authors": " ".join(authors),
"abstract": abstr, "abstract": abstr,
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if

View File

@@ -246,19 +246,22 @@ class Dealer:
chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ") chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ")
for ck in chunks] for ck in chunks]
cites = {} cites = {}
for i, a in enumerate(pieces_): thr = 0.63
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i], while len(cites.keys()) == 0 and pieces_ and chunks_tks:
chunk_v, for i, a in enumerate(pieces_):
huqie.qie( sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
self.qryr.rmWWW(pieces_[i])).split(" "), chunk_v,
chunks_tks, huqie.qie(
tkweight, vtweight) self.qryr.rmWWW(pieces_[i])).split(" "),
mx = np.max(sim) * 0.99 chunks_tks,
es_logger.info("{} SIM: {}".format(pieces_[i], mx)) tkweight, vtweight)
if mx < 0.63: mx = np.max(sim) * 0.99
continue es_logger.info("{} SIM: {}".format(pieces_[i], mx))
cites[idx[i]] = list( if mx < thr:
set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4] continue
cites[idx[i]] = list(
set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
thr *= 0.8
res = "" res = ""
seted = set([]) seted = set([])