From d88f0d43eae94d413b30ec5985174a9611b7382c Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Fri, 8 Nov 2024 12:48:11 +0800 Subject: [PATCH] make language judgement more robust (#3287) ### What problem does this PR solve? The `isChinese` check is now evaluated on the text before `rmWWW` rewrites it; `rmWWW` is applied inside the non-Chinese branch and again just before the `self.tw.split` loop. ### Type of change - [x] Performance Improvement --- rag/nlp/query.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rag/nlp/query.py b/rag/nlp/query.py index 79f730113..30abbc8bd 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -63,9 +63,9 @@ class EsQueryer: rag_tokenizer.tradi2simp( rag_tokenizer.strQ2B( txt.lower()))).strip() - txt = EsQueryer.rmWWW(txt) if not self.isChinese(txt): + txt = EsQueryer.rmWWW(txt) tks = rag_tokenizer.tokenize(txt).split(" ") tks_w = self.tw.weights(tks) tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w] @@ -89,6 +89,7 @@ class EsQueryer: return False return True + txt = EsQueryer.rmWWW(txt) qs, keywords = [], [] for tt in self.tw.split(txt)[:256]: # .split(" "): if not tt: