ragflow/rag/nlp/search.py

#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import logging
import re
from dataclasses import dataclass

from rag.utils import rmSpace
from rag.nlp import rag_tokenizer, query
import numpy as np
from rag.utils.doc_store_conn import DocStoreConnection, MatchDenseExpr, FusionExpr, OrderByExpr


def index_name(uid):
    """Return the document-store index name for ``uid`` (``ragflow_`` + uid)."""
    name = f"ragflow_{uid}"
    return name


class Dealer:
    def __init__(self, dataStore: DocStoreConnection):
        """Bind this dealer to a document store and create its full-text query helper.

        Args:
            dataStore: Connection used for every search, aggregation and SQL
                call made by this class.
        """
        # The two attributes are independent of each other.
        self.dataStore = dataStore
        self.qryr = query.FulltextQueryer()

    @dataclass
    class SearchResult:
        # Value object returned by Dealer.search(); field comments describe how
        # search() in this file fills them.

        # Hit count as reported by dataStore.getTotal().
        total: int
        # Chunk ids as reported by dataStore.getChunkIds().
        ids: list[str]
        # Embedding of the question; search() passes [] when no dense match ran.
        query_vector: list[float] | None = None
        # Result of dataStore.getFields() for the requested source fields;
        # callers in this file index it by chunk id.
        field: dict | None = None
        # Result of dataStore.getHighlight() on "content_with_weight".
        highlight: dict | None = None
        # Result of dataStore.getAggregation() on "docnm_kwd".
        aggregation: list | dict | None = None
        # Query keywords plus their fine-grained sub-tokens (length >= 2).
        keywords: list[str] | None = None
        # Not populated by search() in this file.
        group_docs: list[list] | None = None

    def get_vector(self, txt, emb_mdl, topk=10, similarity=0.1):
        """Embed ``txt`` and wrap the embedding in a dense-match expression.

        Args:
            txt: Text passed to ``emb_mdl.encode_queries``.
            emb_mdl: Embedding model; only ``encode_queries`` is used and only
                the first element of its return value is read.
            topk: Forwarded to ``MatchDenseExpr``.
            similarity: Stored under the ``"similarity"`` key of the
                expression's extra options.

        Returns:
            A ``MatchDenseExpr`` on column ``q_<dim>_vec`` using ``'float'``
            elements and ``'cosine'`` distance.

        Raises:
            Exception: If the embedding has more than one dimension.
        """
        query_embedding, _ = emb_mdl.encode_queries(txt)

        # A single query must yield a flat vector, not a batch of vectors.
        shape = np.array(query_embedding).shape
        if len(shape) > 1:
            raise Exception(f"Dealer.get_vector returned array's shape {shape} doesn't match expectation(exact one dimension).")

        embedding_data = list(map(float, query_embedding))
        vector_column_name = f"q_{len(embedding_data)}_vec"
        extra_options = {"similarity": similarity}
        return MatchDenseExpr(vector_column_name, embedding_data, 'float', 'cosine', topk, extra_options)

    def get_filters(self, req):
        """Translate the filter-related keys of ``req`` into a store condition.

        ``kb_ids`` and ``doc_ids`` are renamed to ``kb_id`` and ``doc_id``;
        ``knowledge_graph_kwd`` and ``available_int`` keep their names. Keys
        that are absent or map to ``None`` are left out.

        Returns:
            A new dict holding only the filters that are present.
        """
        # Request key -> condition field, in the order the condition is built.
        # TODO(yzc): `available_int` is nullable however infinity doesn't support nullable columns.
        request_to_field = {
            "kb_ids": "kb_id",
            "doc_ids": "doc_id",
            "knowledge_graph_kwd": "knowledge_graph_kwd",
            "available_int": "available_int",
        }
        return {
            field: req[key]
            for key, field in request_to_field.items()
            if key in req and req[key] is not None
        }

    def search(self, req, idx_names: str | list[str], kb_ids: list[str], emb_mdl=None, highlight = False):
        """Query the document store with filters, full-text and optional dense matching.

        Three modes, chosen from ``req`` and ``emb_mdl``:

        * no ``question``: a plain filtered listing, optionally ordered by
          page number, top position and creation time when ``req["sort"]`` is set;
        * ``question`` without ``emb_mdl``: full-text match only;
        * ``question`` with ``emb_mdl``: full-text and dense-vector matches
          fused with a ``weighted_sum`` of weights ``0.05, 0.95``. If that
          finds nothing, the query is retried once with a lower ``min_match``.

        Args:
            req: Request mapping. Keys read here: ``page`` (1-based, default 1),
                ``topk`` (default 1024), ``size`` (defaults to ``topk``),
                ``fields``, ``question``, ``sort``, ``similarity`` (default
                0.1), plus the keys consumed by ``get_filters``.
            idx_names: Index name(s), forwarded to the document store.
            kb_ids: Knowledge-base ids, forwarded to the document store.
            emb_mdl: Optional embedding model enabling the dense match.
            highlight: When true, ``content_ltks`` and ``title_tks`` are
                requested as highlight fields.

        Returns:
            A ``Dealer.SearchResult``.
        """
        filters = self.get_filters(req)
        orderBy = OrderByExpr()

        pg = int(req.get("page", 1)) - 1
        topk = int(req.get("topk", 1024))
        ps = int(req.get("size", topk))
        offset, limit = pg * ps, ps

        # Copy the field list: the vector column may be appended below, and the
        # caller's ``req["fields"]`` must not grow as a side effect of searching.
        src = list(req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
                                      "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", "question_kwd", "question_tks",
                                      "available_int", "content_with_weight", "pagerank_fea"]))
        kwds = set()

        qst = req.get("question", "")
        q_vec = []
        if not qst:
            if req.get("sort"):
                orderBy.asc("page_num_int")
                orderBy.asc("top_int")
                orderBy.desc("create_timestamp_flt")
            res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
            total = self.dataStore.getTotal(res)
            logging.debug("Dealer.search TOTAL: {}".format(total))
        else:
            highlightFields = ["content_ltks", "title_tks"] if highlight else []
            matchText, keywords = self.qryr.question(qst, min_match=0.3)
            if emb_mdl is None:
                matchExprs = [matchText]
                res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
                total = self.dataStore.getTotal(res)
                logging.debug("Dealer.search TOTAL: {}".format(total))
            else:
                matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
                q_vec = matchDense.embedding_data
                # Also fetch the stored chunk vectors so callers can rerank.
                src.append(f"q_{len(q_vec)}_vec")

                fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05, 0.95"})
                matchExprs = [matchText, matchDense, fusionExpr]

                res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
                total = self.dataStore.getTotal(res)
                logging.debug("Dealer.search TOTAL: {}".format(total))

                # If result is empty, try again with lower min_match
                if total == 0:
                    matchText, _ = self.qryr.question(qst, min_match=0.1)
                    # NOTE(review): get_filters() stores the document filter under
                    # "doc_id", so popping "doc_ids" removes nothing. Confirm whether
                    # the retry is meant to drop the document filter before changing it.
                    filters.pop("doc_ids", None)
                    matchDense.extra_options["similarity"] = 0.17
                    res = self.dataStore.search(src, highlightFields, filters, [matchText, matchDense, fusionExpr], orderBy, offset, limit, idx_names, kb_ids)
                    total = self.dataStore.getTotal(res)
                    logging.debug("Dealer.search 2 TOTAL: {}".format(total))

            # Collect the query keywords plus their fine-grained sub-tokens;
            # sub-tokens shorter than two characters are ignored.
            for k in keywords:
                kwds.add(k)
                for kk in rag_tokenizer.fine_grained_tokenize(k).split():
                    if len(kk) >= 2:
                        kwds.add(kk)

        logging.debug(f"TOTAL: {total}")
        ids = self.dataStore.getChunkIds(res)
        keywords = list(kwds)
        # Kept under its own name so the boolean ``highlight`` parameter is not shadowed.
        highlights = self.dataStore.getHighlight(res, keywords, "content_with_weight")
        aggs = self.dataStore.getAggregation(res, "docnm_kwd")
        return self.SearchResult(
            total=total,
            ids=ids,
            query_vector=q_vec,
            aggregation=aggs,
            highlight=highlights,
            field=self.dataStore.getFields(res, src),
            keywords=keywords
        )

    @staticmethod
    def trans2floats(txt):
        return [float(t) for t in txt.split("\t")]

    def insert_citations(self, answer, chunks, chunk_v,
                         embd_mdl, tkweight=0.1, vtweight=0.9):
        """Append ``##<chunk index>$$`` citation markers to the sentences of ``answer``.

        The answer is cut into pieces, each sufficiently long piece is compared
        with every chunk through ``self.qryr.hybrid_similarity``, and the best
        matching chunk indices are appended after the piece.

        Args:
            answer: Text to annotate.
            chunks: Chunk texts; each is cleaned with ``self.qryr.rmWWW`` and tokenized.
            chunk_v: Chunk vectors; must have the same length as ``chunks``.
            embd_mdl: Embedding model; ``encode`` is called on the answer pieces.
            tkweight: Token-similarity weight forwarded to ``hybrid_similarity``.
            vtweight: Vector-similarity weight forwarded to ``hybrid_similarity``.

        Returns:
            ``(annotated_answer, cited)`` where ``cited`` is the set of chunk
            indices (as strings) that were inserted. When there are no chunks
            or no piece is long enough, ``answer`` is returned unchanged with
            an empty set.

        Raises:
            AssertionError: If ``chunks`` and ``chunk_v`` differ in length, or
                if the piece and chunk vectors differ in dimension.
        """
        assert len(chunks) == len(chunk_v)
        if not chunks:
            return answer, set([])
        # Split on code fences first; the capture group keeps the fences as pieces.
        pieces = re.split(r"(```)", answer)
        if len(pieces) >= 3:
            i = 0
            pieces_ = []
            while i < len(pieces):
                if pieces[i] == "```":
                    # Keep everything from an opening fence through its closing
                    # fence (or the end of the text) together as one piece.
                    st = i
                    i += 1
                    while i < len(pieces) and pieces[i] != "```":
                        i += 1
                    if i < len(pieces):
                        i += 1
                    pieces_.append("".join(pieces[st: i]) + "\n")
                else:
                    # Outside fences: split at sentence-ending punctuation. The
                    # capture group keeps each delimiter as its own piece.
                    pieces_.extend(
                        re.split(
                            r"([^\|][；。？!！\n]|[a-z][.?;!][ \n])",
                            pieces[i]))
                    i += 1
            pieces = pieces_
        else:
            pieces = re.split(r"([^\|][；。？!！\n]|[a-z][.?;!][ \n])", answer)
        # A delimiter piece starts with the last character of the preceding
        # sentence; move that character back onto the previous piece.
        for i in range(1, len(pieces)):
            if re.match(r"([^\|][；。？!！\n]|[a-z][.?;!][ \n])", pieces[i]):
                pieces[i - 1] += pieces[i][0]
                pieces[i] = pieces[i][1:]
        # Only pieces of at least 5 characters are citation candidates; idx
        # remembers their positions in ``pieces``.
        idx = []
        pieces_ = []
        for i, t in enumerate(pieces):
            if len(t) < 5:
                continue
            idx.append(i)
            pieces_.append(t)
        logging.debug("{} => {}".format(answer, pieces_))
        if not pieces_:
            return answer, set([])

        ans_v, _ = embd_mdl.encode(pieces_)
        assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
                len(ans_v[0]), len(chunk_v[0]))

        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split()
                      for ck in chunks]
        cites = {}
        # Start with a strict threshold and relax it by 20% per round until at
        # least one piece gets a citation or the threshold is no longer above 0.3.
        thr = 0.63
        while thr>0.3 and len(cites.keys()) == 0 and pieces_ and chunks_tks:
            for i, a in enumerate(pieces_):
                sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                                chunk_v,
                                                                rag_tokenizer.tokenize(
                                                                    self.qryr.rmWWW(pieces_[i])).split(),
                                                                chunks_tks,
                                                                tkweight, vtweight)
                # 99% of the best score; chunks scoring above it are cited.
                mx = np.max(sim) * 0.99
                logging.debug("{} SIM: {}".format(pieces_[i], mx))
                if mx < thr:
                    continue
                # Keyed by the piece's position in ``pieces``; at most 4 chunk indices.
                cites[idx[i]] = list(
                    set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
            thr *= 0.8

        # Rebuild the answer, appending each chunk's marker only the first time
        # that chunk is cited.
        res = ""
        seted = set([])
        for i, p in enumerate(pieces):
            res += p
            if i not in idx:
                continue
            if i not in cites:
                continue
            for c in cites[i]:
                assert int(c) < len(chunk_v)
            for c in cites[i]:
                if c in seted:
                    continue
                res += f" ##{c}$$"
                seted.add(c)

        return res, seted

    def rerank(self, sres, query, tkweight=0.3,
               vtweight=0.7, cfield="content_ltks"):
        _, keywords = self.qryr.question(query)
        vector_size = len(sres.query_vector)
        vector_column = f"q_{vector_size}_vec"
        zero_vector = [0.0] * vector_size
        ins_embd = []
        pageranks = []
        for chunk_id in sres.ids:
            vector = sres.field[chunk_id].get(vector_column, zero_vector)
            if isinstance(vector, str):
                vector = [float(v) for v in vector.split("\t")]
            ins_embd.append(vector)
            pageranks.append(sres.field[chunk_id].get("pagerank_fea", 0))
        if not ins_embd:
            return [], [], []

        for i in sres.ids:
            if isinstance(sres.field[i].get("important_kwd", []), str):
                sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
        ins_tw = []
        for i in sres.ids:
            content_ltks = sres.field[i][cfield].split()
            title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
            question_tks = [t for t in sres.field[i].get("question_tks", "").split() if t]
            important_kwd = sres.field[i].get("important_kwd", [])
            tks = content_ltks + title_tks*2 + important_kwd*5 + question_tks*6
            ins_tw.append(tks)

        sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
                                                        ins_embd,
                                                        keywords,
                                                        ins_tw, tkweight, vtweight)

        return sim+np.array(pageranks, dtype=float), tksim, vtsim

    def rerank_by_model(self, rerank_mdl, sres, query, tkweight=0.3,
               vtweight=0.7, cfield="content_ltks"):
        """Score the chunks of ``sres`` against ``query`` using a rerank model.

        Token similarity comes from ``self.qryr.token_similarity``. The second
        score comes from ``rerank_mdl.similarity`` on each chunk's joined
        tokens (content, title, important keywords).

        Args:
            rerank_mdl: Model exposing ``similarity(query, texts)``; only the
                first element of its return value is used.
            sres: Search result providing ``ids`` and ``field``.
            query: Question text.
            tkweight: Weight of the token similarity.
            vtweight: Weight of the model similarity.
            cfield: Field holding the whitespace-separated content tokens.

        Returns:
            ``(sim, tksim, vtsim)`` with ``sim = tkweight*tksim + vtweight*vtsim``.
        """
        _, keywords = self.qryr.question(query)

        # Normalize a lone keyword string into a one-element list (in place).
        for cid in sres.ids:
            chunk = sres.field[cid]
            important = chunk.get("important_kwd", [])
            if isinstance(important, str):
                chunk["important_kwd"] = [important]

        ins_tw = []
        for cid in sres.ids:
            chunk = sres.field[cid]
            content = chunk[cfield].split()
            # str.split() without arguments never yields empty tokens.
            title = chunk.get("title_tks", "").split()
            important = chunk.get("important_kwd", [])
            ins_tw.append(content + title + important)

        tksim = self.qryr.token_similarity(keywords, ins_tw)
        candidate_texts = [rmSpace(" ".join(tokens)) for tokens in ins_tw]
        vtsim, _ = rerank_mdl.similarity(query, candidate_texts)

        combined = tkweight*np.array(tksim) + vtweight*vtsim
        return combined, tksim, vtsim

    def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
        """Tokenize ``ans`` and ``inst`` and delegate to the queryer's hybrid similarity.

        Args:
            ans_embd: Embedding forwarded as the first argument.
            ins_embd: Embedding(s) forwarded as the second argument.
            ans: Text tokenized with ``rag_tokenizer.tokenize``.
            inst: Text tokenized with ``rag_tokenizer.tokenize``.

        Returns:
            Whatever ``self.qryr.hybrid_similarity`` returns, using its default
            weights.
        """
        ans_tokens = rag_tokenizer.tokenize(ans).split()
        inst_tokens = rag_tokenizer.tokenize(inst).split()
        return self.qryr.hybrid_similarity(ans_embd, ins_embd, ans_tokens, inst_tokens)

    def retrieval(self, question, embd_mdl, tenant_ids, kb_ids, page, page_size, similarity_threshold=0.2,
                  vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True, rerank_mdl=None, highlight=False):
        ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
        if not question:
            return ranks

        RERANK_PAGE_LIMIT = 3
        req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": max(page_size*RERANK_PAGE_LIMIT, 128),
               "question": question, "vector": True, "topk": top,
               "similarity": similarity_threshold,
               "available_int": 1}

        if page > RERANK_PAGE_LIMIT:
            req["page"] = page
            req["size"] = page_size

        if isinstance(tenant_ids, str):
            tenant_ids = tenant_ids.split(",")

        sres = self.search(req, [index_name(tid) for tid in tenant_ids], kb_ids, embd_mdl, highlight)
        ranks["total"] = sres.total

        if page <= RERANK_PAGE_LIMIT:
            if rerank_mdl and sres.total > 0:
                sim, tsim, vsim = self.rerank_by_model(rerank_mdl,
                    sres, question, 1 - vector_similarity_weight, vector_similarity_weight)
            else:
                sim, tsim, vsim = self.rerank(
                    sres, question, 1 - vector_similarity_weight, vector_similarity_weight)
            idx = np.argsort(sim * -1)[(page-1)*page_size:page*page_size]
        else:
            sim = tsim = vsim = [1]*len(sres.ids)
            idx = list(range(len(sres.ids)))

        def floor_sim(score):
            return (int(score * 100.)%100)/100.

        dim = len(sres.query_vector)
        vector_column = f"q_{dim}_vec"
        zero_vector = [0.0] * dim
        for i in idx:
            if floor_sim(sim[i]) < similarity_threshold:
                break
            if len(ranks["chunks"]) >= page_size:
                if aggs:
                    continue
                break
            id = sres.ids[i]
            chunk = sres.field[id]
            dnm = chunk["docnm_kwd"]
            did = chunk["doc_id"]
            position_int = chunk.get("position_int", [])
            d = {
                "chunk_id": id,
                "content_ltks": chunk["content_ltks"],
                "content_with_weight": chunk["content_with_weight"],
                "doc_id": chunk["doc_id"],
                "docnm_kwd": dnm,
                "kb_id": chunk["kb_id"],
                "important_kwd": chunk.get("important_kwd", []),
                "image_id": chunk.get("img_id", ""),
                "similarity": sim[i],
                "vector_similarity": vsim[i],
                "term_similarity": tsim[i],
                "vector": chunk.get(vector_column, zero_vector),
                "positions": position_int,
            }
            if highlight and sres.highlight:
                if id in sres.highlight:
                    d["highlight"] = rmSpace(sres.highlight[id])
                else:
                    d["highlight"] = d["content_with_weight"]
            ranks["chunks"].append(d)
            if dnm not in ranks["doc_aggs"]:
                ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
            ranks["doc_aggs"][dnm]["count"] += 1
        ranks["doc_aggs"] = [{"doc_name": k,
                              "doc_id": v["doc_id"],
                              "count": v["count"]} for k,
                             v in sorted(ranks["doc_aggs"].items(),
                                         key=lambda x:x[1]["count"] * -1)]
        ranks["chunks"] = ranks["chunks"][:page_size]

        return ranks

    def sql_retrieval(self, sql, fetch_size=128, format="json"):
        tbl = self.dataStore.sql(sql, fetch_size, format)
        return tbl

    def chunk_list(self, doc_id: str, tenant_id: str, kb_ids: list[str], max_count=1024, fields=["docnm_kwd", "content_with_weight", "img_id"]):
        """Fetch up to ``max_count`` chunks of one document, in batches of at most 128.

        Args:
            doc_id: Document whose chunks are listed.
            tenant_id: Tenant owning the index (see ``index_name``).
            kb_ids: Knowledge-base ids forwarded to the document store.
            max_count: Upper bound on the number of chunks returned.
            fields: Chunk fields to fetch.

        Returns:
            A list of the per-chunk values from ``dataStore.getFields``; never
            longer than ``max_count``.
        """
        # Work on a private copy so nothing downstream can alter the shared
        # default list (or the caller's list) across calls.
        fields = list(fields)
        condition = {"doc_id": doc_id}
        res = []
        bs = 128
        for p in range(0, max_count, bs):
            # Shrink the last batch so the total never exceeds max_count.
            batch_limit = min(bs, max_count - p)
            es_res = self.dataStore.search(fields, [], condition, [], OrderByExpr(), p, batch_limit, index_name(tenant_id), kb_ids)
            dict_chunks = self.dataStore.getFields(es_res, fields)
            if not dict_chunks:
                break
            res.extend(dict_chunks.values())
            # A short batch means the store has no more chunks for this document.
            if len(dict_chunks) < batch_limit:
                break
        return res
-												Update info (#1005)

### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-05-31 09:53:04 +08:00
+								#
 								#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
 								#
 								#  Licensed under the Apache License, Version 2.0 (the "License");
 								#  you may not use this file except in compliance with the License.
 								#  You may obtain a copy of the License at
 								#
 								#      http://www.apache.org/licenses/LICENSE-2.0
 								#
 								#  Unless required by applicable law or agreed to in writing, software
 								#  distributed under the License is distributed on an "AS IS" BASIS,
 								#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								#  See the License for the specific language governing permissions and
 								#  limitations under the License.
 								#
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								import logging
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
+								import re
 								from dataclasses import dataclass
-												change licence (#28)

* add front end code

* change licence
											
										
										
											2024-01-17 09:39:50 +08:00
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								from rag.utils import rmSpace
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								from rag.nlp import rag_tokenizer, query
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
+								import numpy as np
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								from rag.utils.doc_store_conn import DocStoreConnection, MatchDenseExpr, FusionExpr, OrderByExpr
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
-												add alot of api (#23)

* clean rust version project

* clean rust version project

* build python version rag-flow

* add alot of api
											
										
										
											2024-01-15 19:47:25 +08:00
+								def index_name(uid): return f"ragflow_{uid}"
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
-												build dialog server; add thumbnail to docinfo; (#17)


											
										
										
											2023-12-26 19:32:06 +08:00
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
+								class Dealer:
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								    def __init__(self, dataStore: DocStoreConnection):
 								        self.qryr = query.FulltextQueryer()
 								        self.dataStore = dataStore
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
 								    @dataclass
 								    class SearchResult:
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
+								        total: int
-												Introduced beartype (#3460)

### What problem does this PR solve?

Introduced [beartype](https://github.com/beartype/beartype) for runtime
type-checking.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-11-18 17:38:17 +08:00
+								        ids: list[str]
 								        query_vector: list[float] | None = None
 								        field: dict | None = None
 								        highlight: dict | None = None
 								        aggregation: list | dict | None = None
 								        keywords: list[str] | None = None
 								        group_docs: list[list] | None = None
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								    def get_vector(self, txt, emb_mdl, topk=10, similarity=0.1):
 								        qv, _ = emb_mdl.encode_queries(txt)
-												Detect shape error of embedding (#3710)

### What problem does this PR solve?

Detect shape error of embedding. Close #2997

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 14:10:22 +08:00
+								        shape = np.array(qv).shape
 								        if len(shape) > 1:
 								            raise Exception(f"Dealer.get_vector returned array's shape {shape} doesn't match expectation(exact one dimension).")
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        embedding_data = [float(v) for v in qv]
 								        vector_column_name = f"q_{len(embedding_data)}_vec"
 								        return MatchDenseExpr(vector_column_name, embedding_data, 'float', 'cosine', topk, {"similarity": similarity})
 								    def get_filters(self, req):
 								        condition = dict()
 								        for key, field in {"kb_ids": "kb_id", "doc_ids": "doc_id"}.items():
 								            if key in req and req[key] is not None:
 								                condition[field] = req[key]
 								        # TODO(yzc): `available_int` is nullable however infinity doesn't support nullable columns.
-												Fix chunk enable/disable issue (#3579)

### What problem does this PR solve?

#3576

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-22 12:25:42 +08:00
+								        for key in ["knowledge_graph_kwd", "available_int"]:
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								            if key in req and req[key] is not None:
 								                condition[key] = req[key]
 								        return condition
-												fix(rag): fix error in viewing document chunk and cannot start task_executor server (#3481)

### What problem does this PR solve?

1. Fix error in viewing document chunk

<img width="1677" alt="Pasted Graphic"
src="https://github.com/user-attachments/assets/acd84cde-f38c-4190-b135-5e5139ae2613">

Viewing document chunk details results in a BeartypeCallHintParamViolation
error.

Traceback (most recent call last):
File "ragflow/.venv/lib/python3.12/site-packages/flask/app.py", line
880, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
File "ragflow/.venv/lib/python3.12/site-packages/flask/app.py", line
865, in dispatch_request
return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
# type: ignore[no-any-return]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "ragflow/.venv/lib/python3.12/site-packages/flask_login/utils.py",
line 290, in decorated_view
    return current_app.ensure_sync(func)(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "ragflow/api/apps/chunk_app.py", line 311, in knowledge_graph
sres = settings.retrievaler.search(req, search.index_name(tenant_id),
kb_ids)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "<@beartype(rag.nlp.search.Dealer.search) at 0x3381fd800>", line
39, in search
beartype.roar.BeartypeCallHintParamViolation: Method
rag.nlp.search.Dealer.search() parameter
idx_names='ragflow_0e1e67f431d711ef98fc00155d29195d' violates type hint
list[str], as str 'ragflow_0e1e67f431d711ef98fc00155d29195d' not
instance of list.
2024-11-19 11:30:29,817 ERROR 91013 Method
rag.nlp.search.Dealer.search() parameter
idx_names='ragflow_0e1e67f431d711ef98fc00155d29195d' violates type hint
list[str], as str 'ragflow_0e1e67f431d711ef98fc00155d29195d' not
instance of list.
Traceback (most recent call last):
  File "ragflow/api/apps/chunk_app.py", line 60, in list_chunk
sres = settings.retrievaler.search(query, search.index_name(tenant_id),
kb_ids, highlight=True)

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "<@beartype(rag.nlp.search.Dealer.search) at 0x3381fd800>", line
39, in search
beartype.roar.BeartypeCallHintParamViolation: Method
rag.nlp.search.Dealer.search() parameter
idx_names='ragflow_0e1e67f431d711ef98fc00155d29195d' violates type hint
list[str], as str 'ragflow_0e1e67f431d711ef98fc00155d29195d' not
instance of list.


This happens because in nlp/search.py, idx_names is typed only as a list,

<img width="1098" alt="Pasted Graphic 2"
src="https://github.com/user-attachments/assets/4998cb1e-94bc-470b-b2f4-41ecb5b08f8a">

but the DocStoreConnection.search method accepts a list or a str,
<img width="1175" alt="Pasted Graphic 3"
src="https://github.com/user-attachments/assets/ee918b4a-87a5-42c9-a6d2-d0db0884b875">


and its implementations also accept both list and str:
es_conn.py

<img width="1121" alt="Pasted Graphic 4"
src="https://github.com/user-attachments/assets/3e6dc030-0a0d-416c-8fd4-0b4cfd576f8c">

infinity_conn.py

<img width="1221" alt="Pasted Graphic 5"
src="https://github.com/user-attachments/assets/44edac2b-6b81-45b0-a3fc-cb1c63219015">

2. Fix being unable to start the task_executor server due to Unresolved reference
'Mapping'
<img width="1283" alt="Pasted Graphic 6"
src="https://github.com/user-attachments/assets/421f17b8-d0a5-46d3-bc4d-d05dc9dfc934">

### Type of change

- [X] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-19 14:36:10 +08:00
+								    def search(self, req, idx_names: str | list[str], kb_ids: list[str], emb_mdl=None, highlight = False):
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        filters = self.get_filters(req)
 								        orderBy = OrderByExpr()
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
+								        pg = int(req.get("page", 1)) - 1
-												add use layout or not option (#145)

* add use layout or not option

* trival
											
										
										
											2024-03-22 19:21:09 +08:00
+								        topk = int(req.get("topk", 1024))
-												add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-05-29 16:50:02 +08:00
+								        ps = int(req.get("size", topk))
-												Fetch chunk by batches. (#4177)

### What problem does this PR solve?

#4173

### Type of change

- [x] Performance Improvement
											
										
										
											2024-12-23 12:12:15 +08:00
+								        offset, limit = pg * ps, ps
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
-												Fix position lost issue. (#4068)

### What problem does this PR solve?

#4040

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-12-17 16:31:58 +08:00
+								        src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
-												Rename page_num_list, top_list, position_list (#3940)

### What problem does this PR solve?

Rename page_num_list, top_list, position_list to page_num_int, top_int,
position_int

### Type of change

- [x] Refactoring
											
										
										
											2024-12-10 16:32:58 +08:00
+								                                 "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", "question_kwd", "question_tks",
-												Add pagerank to KB. (#3809)

### What problem does this PR solve?

#3794

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-03 14:30:35 +08:00
+								                                 "available_int", "content_with_weight", "pagerank_fea"])
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        kwds = set([])
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        qst = req.get("question", "")
 								        q_vec = []
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
+								        if not qst:
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								            if req.get("sort"):
-												Rename page_num_list, top_list, position_list (#3940)

### What problem does this PR solve?

Rename page_num_list, top_list, position_list to page_num_int, top_int,
position_int

### Type of change

- [x] Refactoring
											
										
										
											2024-12-10 16:32:58 +08:00
+								                orderBy.asc("page_num_int")
 								                orderBy.asc("top_int")
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                orderBy.desc("create_timestamp_flt")
 								            res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
 								            total=self.dataStore.getTotal(res)
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								            logging.debug("Dealer.search TOTAL: {}".format(total))
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        else:
 								            highlightFields = ["content_ltks", "title_tks"] if highlight else []
 								            matchText, keywords = self.qryr.question(qst, min_match=0.3)
 								            if emb_mdl is None:
 								                matchExprs = [matchText]
 								                res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
 								                total=self.dataStore.getTotal(res)
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								                logging.debug("Dealer.search TOTAL: {}".format(total))
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								            else:
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
 								                q_vec = matchDense.embedding_data
 								                src.append(f"q_{len(q_vec)}_vec")
 								                fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05, 0.95"})
 								                matchExprs = [matchText, matchDense, fusionExpr]
 								                res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
 								                total=self.dataStore.getTotal(res)
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								                logging.debug("Dealer.search TOTAL: {}".format(total))
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
 								                # If result is empty, try again with lower min_match
 								                if total == 0:
 								                    matchText, _ = self.qryr.question(qst, min_match=0.1)
-												Fix logs. Use dict.pop instead of del. Close #3473 (#3484)

### What problem does this PR solve?

Fix logs. Use dict.pop instead of del.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-19 14:15:25 +08:00
+								                    filters.pop("doc_ids", None)
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                    matchDense.extra_options["similarity"] = 0.17
 								                    res = self.dataStore.search(src, highlightFields, filters, [matchText, matchDense, fusionExpr], orderBy, offset, limit, idx_names, kb_ids)
 								                    total=self.dataStore.getTotal(res)
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								                    logging.debug("Dealer.search 2 TOTAL: {}".format(total))
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
 								            for k in keywords:
 								                kwds.add(k)
-												Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Edit chunk shall update instead of insert it. Close #3679 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 13:00:38 +08:00
+								                for kk in rag_tokenizer.fine_grained_tokenize(k).split():
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                    if len(kk) < 2:
 								                        continue
 								                    if kk in kwds:
 								                        continue
 								                    kwds.add(kk)
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								        logging.debug(f"TOTAL: {total}")
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        ids=self.dataStore.getChunkIds(res)
 								        keywords=list(kwds)
 								        highlight = self.dataStore.getHighlight(res, keywords, "content_with_weight")
 								        aggs = self.dataStore.getAggregation(res, "docnm_kwd")
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
+								        return self.SearchResult(
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								            total=total,
 								            ids=ids,
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
+								            query_vector=q_vec,
 								            aggregation=aggs,
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								            highlight=highlight,
 								            field=self.dataStore.getFields(res, src),
 								            keywords=keywords
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
+								        )
 								    @staticmethod
 								    def trans2floats(txt):
 								        return [float(t) for t in txt.split("\t")]
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								    def insert_citations(self, answer, chunks, chunk_v,
-												Add 'One' chunk method (#137)


											
										
										
											2024-03-20 18:57:22 +08:00
+								                         embd_mdl, tkweight=0.1, vtweight=0.9):
-												refine admin initialization (#75)


											
										
										
											2024-02-27 14:57:34 +08:00
+								        assert len(chunks) == len(chunk_v)
-												debug backend API for TAB 'search' (#2389)

### What problem does this PR solve?
#2247

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-09-12 17:51:20 +08:00
+								        if not chunks:
 								            return answer, set([])
-												refine for English corpus (#135)


											
										
										
											2024-03-20 16:56:16 +08:00
+								        pieces = re.split(r"(```)", answer)
 								        if len(pieces) >= 3:
 								            i = 0
 								            pieces_ = []
 								            while i < len(pieces):
 								                if pieces[i] == "```":
 								                    st = i
 								                    i += 1
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								                    while i < len(pieces) and pieces[i] != "```":
-												refine for English corpus (#135)


											
										
										
											2024-03-20 16:56:16 +08:00
+								                        i += 1
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								                    if i < len(pieces):
 								                        i += 1
 								                    pieces_.append("".join(pieces[st: i]) + "\n")
-												refine for English corpus (#135)


											
										
										
											2024-03-20 16:56:16 +08:00
+								                else:
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								                    pieces_.extend(
 								                        re.split(
 								                            r"([^\|][；。？!！\n]|[a-z][.?;!][ \n])",
 								                            pieces[i]))
-												refine for English corpus (#135)


											
										
										
											2024-03-20 16:56:16 +08:00
+								                    i += 1
 								            pieces = pieces_
 								        else:
 								            pieces = re.split(r"([^\|][；。？!！\n]|[a-z][.?;!][ \n])", answer)
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								        for i in range(1, len(pieces)):
-												refine manual parser (#140)


											
										
										
											2024-03-21 18:17:32 +08:00
+								            if re.match(r"([^\|][；。？!！\n]|[a-z][.?;!][ \n])", pieces[i]):
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								                pieces[i - 1] += pieces[i][0]
 								                pieces[i] = pieces[i][1:]
 								        idx = []
 								        pieces_ = []
 								        for i, t in enumerate(pieces):
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								            if len(t) < 5:
 								                continue
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								            idx.append(i)
 								            pieces_.append(t)
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								        logging.debug("{} => {}".format(answer, pieces_))
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								        if not pieces_:
-												fix gb2312 encoding issue (#394)

### What problem does this PR solve?

Issue link:#384
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-04-16 19:45:14 +08:00
+								            return answer, set([])
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								        ans_v, _ = embd_mdl.encode(pieces_)
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								        assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
-												debug backend API for TAB 'search' (#2389)

### What problem does this PR solve?
#2247

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-09-12 17:51:20 +08:00
+								                len(ans_v[0]), len(chunk_v[0]))
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
-												Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Edit chunk shall update instead of insert it. Close #3679 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 13:00:38 +08:00
+								        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split()
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								                      for ck in chunks]
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								        cites = {}
-												refine citation (#161)


											
										
										
											2024-03-28 11:45:50 +08:00
+								        thr = 0.63
-												let's load model from local (#163)


											
										
										
											2024-03-28 16:10:47 +08:00
+								        while thr>0.3 and len(cites.keys()) == 0 and pieces_ and chunks_tks:
-												refine citation (#161)


											
										
										
											2024-03-28 11:45:50 +08:00
+								            for i, a in enumerate(pieces_):
 								                sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
 								                                                                chunk_v,
-												refine code (#595)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
											
										
										
											2024-04-28 19:13:33 +08:00
+								                                                                rag_tokenizer.tokenize(
-												Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Edit chunk shall update instead of insert it. Close #3679 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 13:00:38 +08:00
+								                                                                    self.qryr.rmWWW(pieces_[i])).split(),
-												refine citation (#161)


											
										
										
											2024-03-28 11:45:50 +08:00
+								                                                                chunks_tks,
 								                                                                tkweight, vtweight)
 								                mx = np.max(sim) * 0.99
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								                logging.debug("{} SIM: {}".format(pieces_[i], mx))
-												refine citation (#161)


											
										
										
											2024-03-28 11:45:50 +08:00
+								                if mx < thr:
 								                    continue
 								                cites[idx[i]] = list(
 								                    set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
 								            thr *= 0.8
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								        res = ""
-												deal with stop reason being length problem (#109)


											
										
										
											2024-03-07 16:12:01 +08:00
+								        seted = set([])
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								        for i, p in enumerate(pieces):
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								            res += p
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								            if i not in idx:
 								                continue
 								            if i not in cites:
 								                continue
-												deal with stop reason being length problem (#109)


											
										
										
											2024-03-07 16:12:01 +08:00
+								            for c in cites[i]:
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								                assert int(c) < len(chunk_v)
 								            for c in cites[i]:
 								                if c in seted:
 								                    continue
-												deal with stop reason being length problem (#109)


											
										
										
											2024-03-07 16:12:01 +08:00
+								                res += f" ##{c}$$"
 								                seted.add(c)
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												refine presentation parser (#110)


											
										
										
											2024-03-07 17:21:38 +08:00
+								        return res, seted
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								    def rerank(self, sres, query, tkweight=0.3,
 								               vtweight=0.7, cfield="content_ltks"):
-												refine for English corpus (#135)


											
										
										
											2024-03-20 16:56:16 +08:00
+								        _, keywords = self.qryr.question(query)
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        vector_size = len(sres.query_vector)
 								        vector_column = f"q_{vector_size}_vec"
 								        zero_vector = [0.0] * vector_size
 								        ins_embd = []
-												Add pagerank to KB. (#3809)

### What problem does this PR solve?

#3794

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-03 14:30:35 +08:00
+								        pageranks = []
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        for chunk_id in sres.ids:
 								            vector = sres.field[chunk_id].get(vector_column, zero_vector)
 								            if isinstance(vector, str):
 								                vector = [float(v) for v in vector.split("\t")]
 								            ins_embd.append(vector)
-												Add pagerank to KB. (#3809)

### What problem does this PR solve?

#3794

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-03 14:30:35 +08:00
+								            pageranks.append(sres.field[chunk_id].get("pagerank_fea", 0))
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
+								        if not ins_embd:
-												Test APIs and fix bugs (#41)


											
										
										
											2024-01-22 19:51:38 +08:00
+								            return [], [], []
-												Fit a lot of encodings for text file. (#458)

### What problem does this PR solve?

#384

### Type of change

- [x] Performance Improvement
											
										
										
											2024-04-19 18:02:53 +08:00
 								        for i in sres.ids:
 								            if isinstance(sres.field[i].get("important_kwd", []), str):
 								                sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
 								        ins_tw = []
 								        for i in sres.ids:
-												Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Edit chunk shall update instead of insert it. Close #3679 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 13:00:38 +08:00
+								            content_ltks = sres.field[i][cfield].split()
 								            title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
-												Feat: Add question parameter to edit chunk modal (#3875)

### What problem does this PR solve?

Close #3873

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-05 14:51:19 +08:00
+								            question_tks = [t for t in sres.field[i].get("question_tks", "").split() if t]
-												Fit a lot of encodings for text file. (#458)

### What problem does this PR solve?

#384

### Type of change

- [x] Performance Improvement
											
										
										
											2024-04-19 18:02:53 +08:00
+								            important_kwd = sres.field[i].get("important_kwd", [])
-												Feat: Add question parameter to edit chunk modal (#3875)

### What problem does this PR solve?

Close #3873

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-05 14:51:19 +08:00
+								            tks = content_ltks + title_tks*2 + important_kwd*5 + question_tks*6
-												Fit a lot of encodings for text file. (#458)

### What problem does this PR solve?

#384

### Type of change

- [x] Performance Improvement
											
										
										
											2024-04-19 18:02:53 +08:00
+								            ins_tw.append(tks)
-												add dialog api (#33)


											
										
										
											2024-01-17 20:20:42 +08:00
+								        sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								                                                        ins_embd,
-												refine for English corpus (#135)


											
										
										
											2024-03-20 16:56:16 +08:00
+								                                                        keywords,
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								                                                        ins_tw, tkweight, vtweight)
-												Add pagerank to KB. (#3809)

### What problem does this PR solve?

#3794

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-03 14:30:35 +08:00
 								        return sim+np.array(pageranks, dtype=float), tksim, vtsim
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-05-29 16:50:02 +08:00
+								    def rerank_by_model(self, rerank_mdl, sres, query, tkweight=0.3,
 								               vtweight=0.7, cfield="content_ltks"):
 								        _, keywords = self.qryr.question(query)
 								        for i in sres.ids:
 								            if isinstance(sres.field[i].get("important_kwd", []), str):
 								                sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
 								        ins_tw = []
 								        for i in sres.ids:
-												Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Edit chunk shall update instead of insert it. Close #3679 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 13:00:38 +08:00
+								            content_ltks = sres.field[i][cfield].split()
 								            title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
-												add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-05-29 16:50:02 +08:00
+								            important_kwd = sres.field[i].get("important_kwd", [])
 								            tks = content_ltks + title_tks + important_kwd
 								            ins_tw.append(tks)
 								        tksim = self.qryr.token_similarity(keywords, ins_tw)
-												refine using rerank model (#2553)

### What problem does this PR solve?

#2552

### Type of change

- [x] Performance Improvement
											
										
										
											2024-09-24 12:38:18 +08:00
+								        vtsim,_ = rerank_mdl.similarity(query, [rmSpace(" ".join(tks)) for tks in ins_tw])
-												add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-05-29 16:50:02 +08:00
 								        return tkweight*np.array(tksim) + vtweight*vtsim, tksim, vtsim
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								    def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
 								        return self.qryr.hybrid_similarity(ans_embd,
 								                                           ins_embd,
-												Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Edit chunk shall update instead of insert it. Close #3679 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 13:00:38 +08:00
+								                                           rag_tokenizer.tokenize(ans).split(),
 								                                           rag_tokenizer.tokenize(inst).split())
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
-												search between multiple indices for team function (#3079)

### What problem does this PR solve?

#2834 
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-10-29 13:19:01 +08:00
+								    def retrieval(self, question, embd_mdl, tenant_ids, kb_ids, page, page_size, similarity_threshold=0.2,
-												add search TAB backend api (#2375)

### What problem does this PR solve?
 #2247

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-09-11 19:49:18 +08:00
+								                  vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True, rerank_mdl=None, highlight=False):
-												Test APIs and fix bugs (#41)


											
										
										
											2024-01-22 19:51:38 +08:00
+								        ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								        if not question:
 								            return ranks
-												search between multiple indiices for team function (#3079)

### What problem does this PR solve?

#2834 
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-10-29 13:19:01 +08:00
-												debug backend API for TAB 'search' (#2389)

### What problem does this PR solve?
#2247

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-09-12 17:51:20 +08:00
+								        RERANK_PAGE_LIMIT = 3
-												expand rerank range (#2746)

### What problem does this PR solve?


### Type of change

- [x] Performance Improvement
											
										
										
											2024-10-08 16:34:33 +08:00
+								        req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": max(page_size*RERANK_PAGE_LIMIT, 128),
-												add use layout or not option (#145)

* add use layout or not option

* trival
											
										
										
											2024-03-22 19:21:09 +08:00
+								               "question": question, "vector": True, "topk": top,
-												add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-05-29 16:50:02 +08:00
+								               "similarity": similarity_threshold,
 								               "available_int": 1}
-												search between multiple indiices for team function (#3079)

### What problem does this PR solve?

#2834 
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-10-29 13:19:01 +08:00
-												debug backend API for TAB 'search' (#2389)

### What problem does this PR solve?
#2247

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-09-12 17:51:20 +08:00
+								        if page > RERANK_PAGE_LIMIT:
 								            req["page"] = page
 								            req["size"] = page_size
-												search between multiple indiices for team function (#3079)

### What problem does this PR solve?

#2834 
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-10-29 13:19:01 +08:00
 								        if isinstance(tenant_ids, str):
 								            tenant_ids = tenant_ids.split(",")
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        sres = self.search(req, [index_name(tid) for tid in tenant_ids], kb_ids, embd_mdl, highlight)
-												debug backend API for TAB 'search' (#2389)

### What problem does this PR solve?
#2247

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-09-12 17:51:20 +08:00
+								        ranks["total"] = sres.total
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												debug backend API for TAB 'search' (#2389)

### What problem does this PR solve?
#2247

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-09-12 17:51:20 +08:00
+								        if page <= RERANK_PAGE_LIMIT:
-												fix: stop rerank by model when search result is empty (#4203)

### What problem does this PR solve?


stop rerank by model when search result is empty, otherwise rerank may
raise an error (qwen).

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: 刘博 <liubo@ynby.cn>
											
										
										
											2024-12-24 14:33:46 +08:00
+								            if rerank_mdl and sres.total > 0:
-												debug backend API for TAB 'search' (#2389)

### What problem does this PR solve?
#2247

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-09-12 17:51:20 +08:00
+								                sim, tsim, vsim = self.rerank_by_model(rerank_mdl,
 								                    sres, question, 1 - vector_similarity_weight, vector_similarity_weight)
 								            else:
 								                sim, tsim, vsim = self.rerank(
 								                    sres, question, 1 - vector_similarity_weight, vector_similarity_weight)
 								            idx = np.argsort(sim * -1)[(page-1)*page_size:page*page_size]
-												add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-05-29 16:50:02 +08:00
+								        else:
-												debug backend API for TAB 'search' (#2389)

### What problem does this PR solve?
#2247

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-09-12 17:51:20 +08:00
+								            sim = tsim = vsim = [1]*len(sres.ids)
 								            idx = list(range(len(sres.ids)))
-												Test APIs and fix bugs (#41)


											
										
										
											2024-01-22 19:51:38 +08:00
-												Feat: Add question parameter to edit chunk modal (#3875)

### What problem does this PR solve?

Close #3873

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-05 14:51:19 +08:00
+								        def floor_sim(score):
 								            return (int(score * 100.)%100)/100.
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								        dim = len(sres.query_vector)
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        vector_column = f"q_{dim}_vec"
 								        zero_vector = [0.0] * dim
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								        for i in idx:
-												Feat: Add question parameter to edit chunk modal (#3875)

### What problem does this PR solve?

Close #3873

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-05 14:51:19 +08:00
+								            if floor_sim(sim[i]) < similarity_threshold:
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								                break
-												refine presentation parser (#110)


											
										
										
											2024-03-07 17:21:38 +08:00
+								            if len(ranks["chunks"]) >= page_size:
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								                if aggs:
 								                    continue
 								                break
 								            id = sres.ids[i]
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								            chunk = sres.field[id]
 								            dnm = chunk["docnm_kwd"]
 								            did = chunk["doc_id"]
-												Rename page_num_list, top_list, position_list (#3940)

### What problem does this PR solve?

Rename page_num_list, top_list, position_list to page_num_int, top_int,
position_int

### Type of change

- [x] Refactoring
											
										
										
											2024-12-10 16:32:58 +08:00
+								            position_int = chunk.get("position_int", [])
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								            d = {
 								                "chunk_id": id,
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                "content_ltks": chunk["content_ltks"],
 								                "content_with_weight": chunk["content_with_weight"],
 								                "doc_id": chunk["doc_id"],
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								                "docnm_kwd": dnm,
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                "kb_id": chunk["kb_id"],
 								                "important_kwd": chunk.get("important_kwd", []),
 								                "image_id": chunk.get("img_id", ""),
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								                "similarity": sim[i],
 								                "vector_similarity": vsim[i],
 								                "term_similarity": tsim[i],
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                "vector": chunk.get(vector_column, zero_vector),
-												Rename page_num_list, top_list, position_list (#3940)

### What problem does this PR solve?

Rename page_num_list, top_list, position_list to page_num_int, top_int,
position_int

### Type of change

- [x] Refactoring
											
										
										
											2024-12-10 16:32:58 +08:00
+								                "positions": position_int,
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								            }
-												Add pagerank to KB. (#3809)

### What problem does this PR solve?

#3794

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-03 14:30:35 +08:00
+								            if highlight and sres.highlight:
-												debug backend API for TAB 'search' (#2389)

### What problem does this PR solve?
#2247

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-09-12 17:51:20 +08:00
+								                if id in sres.highlight:
 								                    d["highlight"] = rmSpace(sres.highlight[id])
 								                else:
 								                    d["highlight"] = d["content_with_weight"]
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								            ranks["chunks"].append(d)
 								            if dnm not in ranks["doc_aggs"]:
-												refactor retieval_test, add SQl retrieval methods (#61)


											
										
										
											2024-02-08 17:01:01 +08:00
+								                ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
 								            ranks["doc_aggs"][dnm]["count"] += 1
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								        ranks["doc_aggs"] = [{"doc_name": k,
 								                              "doc_id": v["doc_id"],
 								                              "count": v["count"]} for k,
 								                             v in sorted(ranks["doc_aggs"].items(),
 								                                         key=lambda x:x[1]["count"] * -1)]
-												Fix page size error. (#4401)

### What problem does this PR solve?

#4400

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-01-07 19:06:31 +08:00
+								        ranks["chunks"] = ranks["chunks"][:page_size]
-												change licence (#28)

* add front end code

* change licence
											
										
										
											2024-01-17 09:39:50 +08:00
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								        return ranks
-												Add resume parser and fix bugs (#59)

* Update .gitignore

* Update .gitignore

* Add resume parser and fix bugs
											
										
										
											2024-02-07 19:27:23 +08:00
-												Refine resume parts and fix bugs in retrieval using sql (#66)


											
										
										
											2024-02-19 19:22:17 +08:00
+								    def sql_retrieval(self, sql, fetch_size=128, format="json"):
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        tbl = self.dataStore.sql(sql, fetch_size, format)
 								        return tbl
 								    def chunk_list(self, doc_id: str, tenant_id: str, kb_ids: list[str], max_count=1024, fields=["docnm_kwd", "content_with_weight", "img_id"]):
 								        condition = {"doc_id": doc_id}
-												Fetch chunk by batches. (#4177)

### What problem does this PR solve?

#4173

### Type of change

- [x] Performance Improvement
											
										
										
											2024-12-23 12:12:15 +08:00
+								        res = []
 								        bs = 128
 								        for p in range(0, max_count, bs):
-												Fix raptor bug. (#4192)

### What problem does this PR solve?


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-12-23 18:59:48 +08:00
+								            es_res = self.dataStore.search(fields, [], condition, [], OrderByExpr(), p, bs, index_name(tenant_id), kb_ids)
 								            dict_chunks = self.dataStore.getFields(es_res, fields)
-												Fetch chunk by batches. (#4177)

### What problem does this PR solve?

#4173

### Type of change

- [x] Performance Improvement
											
										
										
											2024-12-23 12:12:15 +08:00
+								            if dict_chunks:
 								                res.extend(dict_chunks.values())
 								            if len(dict_chunks.values()) < bs:
 								                break
 								        return res