diff --git a/api/db/init_data.py b/api/db/init_data.py index f10aa6461..b46b27ce6 100644 --- a/api/db/init_data.py +++ b/api/db/init_data.py @@ -84,14 +84,14 @@ def init_superuser(): {"role": "user", "content": "Hello!"}], gen_conf={}) if msg.find("ERROR: ") == 0: logging.error( - "'{}' dosen't work. {}".format( + "'{}' doesn't work. {}".format( tenant["llm_id"], msg)) embd_mdl = LLMBundle(tenant["id"], LLMType.EMBEDDING, tenant["embd_id"]) v, c = embd_mdl.encode(["Hello!"]) if c == 0: logging.error( - "'{}' dosen't work!".format( + "'{}' doesn't work!".format( tenant["embd_id"])) diff --git a/api/db/services/canvas_service.py b/api/db/services/canvas_service.py index 118403474..8bcb7b1bc 100644 --- a/api/db/services/canvas_service.py +++ b/api/db/services/canvas_service.py @@ -73,11 +73,11 @@ class UserCanvasService(CommonService): User.nickname, User.avatar.alias('tenant_avatar'), ] - angents = cls.model.select(*fields) \ + agents = cls.model.select(*fields) \ .join(User, on=(cls.model.user_id == User.id)) \ .where(cls.model.id == pid) # obj = cls.model.query(id=pid)[0] - return True, angents.dicts()[0] + return True, agents.dicts()[0] except Exception as e: print(e) return False, None @@ -100,25 +100,25 @@ class UserCanvasService(CommonService): cls.model.update_time ] if keywords: - angents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where( + agents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where( ((cls.model.user_id.in_(joined_tenant_ids) & (cls.model.permission == TenantPermission.TEAM.value)) | ( cls.model.user_id == user_id)), (fn.LOWER(cls.model.title).contains(keywords.lower())) ) else: - angents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where( + agents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where( ((cls.model.user_id.in_(joined_tenant_ids) & (cls.model.permission == TenantPermission.TEAM.value)) | ( cls.model.user_id == user_id)) ) if desc: - angents = angents.order_by(cls.model.getter_by(orderby).desc()) + agents = agents.order_by(cls.model.getter_by(orderby).desc()) else: - angents = angents.order_by(cls.model.getter_by(orderby).asc()) - count = angents.count() - angents = angents.paginate(page_number, items_per_page) - return list(angents.dicts()), count + agents = agents.order_by(cls.model.getter_by(orderby).asc()) + count = agents.count() + agents = agents.paginate(page_number, items_per_page) + return list(agents.dicts()), count def completion(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs): diff --git a/api/ragflow_server.py b/api/ragflow_server.py index f036967bc..75bc8916c 100644 --- a/api/ragflow_server.py +++ b/api/ragflow_server.py @@ -18,9 +18,9 @@ # from beartype.claw import beartype_all # <-- you didn't sign up for this # beartype_all(conf=BeartypeConf(violation_type=UserWarning)) # <-- emit warnings from all code -from api.utils.log_utils import initRootLogger +from api.utils.log_utils import init_root_logger from plugin import GlobalPluginManager -initRootLogger("ragflow_server") +init_root_logger("ragflow_server") import logging import os diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py index b90527c70..7fefc54a6 100644 --- a/api/utils/file_utils.py +++ b/api/utils/file_utils.py @@ -158,7 +158,7 @@ def filename_type(filename): if re.match(r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename): return FileType.DOC.value - if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename): + if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)$", filename): return FileType.AURAL.value if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename): diff --git a/api/utils/log_utils.py b/api/utils/log_utils.py index 7d07f7909..3ebedd148 100644 --- a/api/utils/log_utils.py +++ b/api/utils/log_utils.py @@ -30,7 +30,7 @@ def get_project_base_directory(): ) return PROJECT_BASE -def initRootLogger(logfile_basename: str, log_format: str = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"): +def init_root_logger(logfile_basename: str, log_format: str = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"): global initialized_root_logger if initialized_root_logger: return diff --git a/api/utils/t_crypt.py b/api/utils/t_crypt.py index cd9d1edcc..d0763c19f 100644 --- a/api/utils/t_crypt.py +++ b/api/utils/t_crypt.py @@ -35,6 +35,6 @@ def crypt(line): if __name__ == "__main__": - pswd = crypt(sys.argv[1]) - print(pswd) - print(decrypt(pswd)) + passwd = crypt(sys.argv[1]) + print(passwd) + print(decrypt(passwd)) diff --git a/api/utils/validation_utils.py b/api/utils/validation_utils.py index 21c731aa0..d60dc5561 100644 --- a/api/utils/validation_utils.py +++ b/api/utils/validation_utils.py @@ -312,7 +312,7 @@ class PermissionEnum(StrEnum): team = auto() -class ChunkMethodnEnum(StrEnum): +class ChunkMethodEnum(StrEnum): naive = auto() book = auto() email = auto() @@ -382,7 +382,7 @@ class CreateDatasetReq(Base): description: str | None = Field(default=None, max_length=65535) embedding_model: Annotated[str, StringConstraints(strip_whitespace=True, max_length=255), Field(default="", serialization_alias="embd_id")] permission: PermissionEnum = Field(default=PermissionEnum.me, min_length=1, max_length=16) - chunk_method: ChunkMethodnEnum = Field(default=ChunkMethodnEnum.naive, min_length=1, max_length=32, serialization_alias="parser_id") + chunk_method: ChunkMethodEnum = Field(default=ChunkMethodEnum.naive, min_length=1, max_length=32, serialization_alias="parser_id") parser_config: ParserConfig | None = Field(default=None) @field_validator("avatar") diff --git a/deepdoc/parser/docx_parser.py b/deepdoc/parser/docx_parser.py index dfe3f37fd..f37119615 100644 --- a/deepdoc/parser/docx_parser.py +++ b/deepdoc/parser/docx_parser.py @@ -69,7 +69,7 @@ class RAGFlowDocxParser: max_type = max(max_type.items(), key=lambda x: x[1])[0] colnm = len(df.iloc[0, :]) - hdrows = [0] # header is not nessesarily appear in the first line + hdrows = [0] # header is not necessarily appear in the first line if max_type == "Nu": for r in range(1, len(df)): tys = Counter([blockType(str(df.iloc[r, j])) diff --git a/deepdoc/parser/figure_parser.py b/deepdoc/parser/figure_parser.py index 49263630d..b29a4a8a5 100644 --- a/deepdoc/parser/figure_parser.py +++ b/deepdoc/parser/figure_parser.py @@ -21,7 +21,7 @@ from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk from rag.prompts import vision_llm_figure_describe_prompt -def vision_figure_parser_figure_data_wraper(figures_data_without_positions): +def vision_figure_parser_figure_data_wrapper(figures_data_without_positions): return [ ( (figure_data[1], [figure_data[0]]), diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index dbc6cc320..68c894697 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -180,13 +180,13 @@ class RAGFlowPdfParser: return fea @staticmethod - def sort_X_by_page(arr, threashold): + def sort_X_by_page(arr, threshold): # sort using y1 first and then x1 arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"])) for i in range(len(arr) - 1): for j in range(i, -1, -1): # restore the order using th - if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \ + if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold \ and arr[j + 1]["top"] < arr[j]["top"] \ and arr[j + 1]["page_number"] == arr[j]["page_number"]: tmp = arr[j] @@ -264,13 +264,13 @@ class RAGFlowPdfParser: for b in self.boxes: if b.get("layout_type", "") != "table": continue - ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3) + ii = Recognizer.find_overlapped_with_threshold(b, rows, thr=0.3) if ii is not None: b["R"] = ii b["R_top"] = rows[ii]["top"] b["R_bott"] = rows[ii]["bottom"] - ii = Recognizer.find_overlapped_with_threashold( + ii = Recognizer.find_overlapped_with_threshold( b, headers, thr=0.3) if ii is not None: b["H_top"] = headers[ii]["top"] @@ -285,7 +285,7 @@ class RAGFlowPdfParser: b["C_left"] = clmns[ii]["x0"] b["C_right"] = clmns[ii]["x1"] - ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3) + ii = Recognizer.find_overlapped_with_threshold(b, spans, thr=0.3) if ii is not None: b["H_top"] = spans[ii]["top"] b["H_bott"] = spans[ii]["bottom"] diff --git a/deepdoc/vision/layout_recognizer.py b/deepdoc/vision/layout_recognizer.py index 512617190..46be451c6 100644 --- a/deepdoc/vision/layout_recognizer.py +++ b/deepdoc/vision/layout_recognizer.py @@ -106,7 +106,7 @@ class LayoutRecognizer(Recognizer): bxs.pop(i) continue - ii = self.find_overlapped_with_threashold(bxs[i], lts_, + ii = self.find_overlapped_with_threshold(bxs[i], lts_, thr=0.4) if ii is None: # belong to nothing bxs[i]["layout_type"] = "" diff --git a/deepdoc/vision/recognizer.py b/deepdoc/vision/recognizer.py index 6911d8698..adb05891f 100644 --- a/deepdoc/vision/recognizer.py +++ b/deepdoc/vision/recognizer.py @@ -52,20 +52,20 @@ class Recognizer: self.label_list = label_list @staticmethod - def sort_Y_firstly(arr, threashold): + def sort_Y_firstly(arr, threshold): def cmp(c1, c2): diff = c1["top"] - c2["top"] - if abs(diff) < threashold: + if abs(diff) < threshold: diff = c1["x0"] - c2["x0"] return diff arr = sorted(arr, key=cmp_to_key(cmp)) return arr @staticmethod - def sort_X_firstly(arr, threashold): + def sort_X_firstly(arr, threshold): def cmp(c1, c2): diff = c1["x0"] - c2["x0"] - if abs(diff) < threashold: + if abs(diff) < threshold: diff = c1["top"] - c2["top"] return diff arr = sorted(arr, key=cmp_to_key(cmp)) @@ -239,15 +239,15 @@ class Recognizer: e -= 1 break - max_overlaped_i, max_overlaped = None, 0 + max_overlapped_i, max_overlapped = None, 0 for i in range(s, e): ov = Recognizer.overlapped_area(bxs[i], box) - if ov <= max_overlaped: + if ov <= max_overlapped: continue - max_overlaped_i = i - max_overlaped = ov + max_overlapped_i = i + max_overlapped = ov - return max_overlaped_i + return max_overlapped_i @staticmethod def find_horizontally_tightest_fit(box, boxes): @@ -264,7 +264,7 @@ class Recognizer: return min_i @staticmethod - def find_overlapped_with_threashold(box, boxes, thr=0.3): + def find_overlapped_with_threshold(box, boxes, thr=0.3): if not boxes: return max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0 diff --git a/deepdoc/vision/t_recognizer.py b/deepdoc/vision/t_recognizer.py index 1db3356a9..264014c86 100644 --- a/deepdoc/vision/t_recognizer.py +++ b/deepdoc/vision/t_recognizer.py @@ -84,13 +84,13 @@ def get_table_html(img, tb_cpns, ocr): clmns = LayoutRecognizer.layouts_cleanup(boxes, clmns, 5, 0.5) for b in boxes: - ii = LayoutRecognizer.find_overlapped_with_threashold(b, rows, thr=0.3) + ii = LayoutRecognizer.find_overlapped_with_threshold(b, rows, thr=0.3) if ii is not None: b["R"] = ii b["R_top"] = rows[ii]["top"] b["R_bott"] = rows[ii]["bottom"] - ii = LayoutRecognizer.find_overlapped_with_threashold(b, headers, thr=0.3) + ii = LayoutRecognizer.find_overlapped_with_threshold(b, headers, thr=0.3) if ii is not None: b["H_top"] = headers[ii]["top"] b["H_bott"] = headers[ii]["bottom"] @@ -104,7 +104,7 @@ def get_table_html(img, tb_cpns, ocr): b["C_left"] = clmns[ii]["x0"] b["C_right"] = clmns[ii]["x1"] - ii = LayoutRecognizer.find_overlapped_with_threashold(b, spans, thr=0.3) + ii = LayoutRecognizer.find_overlapped_with_threshold(b, spans, thr=0.3) if ii is not None: b["H_top"] = spans[ii]["top"] b["H_bott"] = spans[ii]["bottom"] diff --git a/rag/app/naive.py b/rag/app/naive.py index 551b42b6f..809da121d 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -29,7 +29,7 @@ from tika import parser from api.db import LLMType from api.db.services.llm_service import LLMBundle from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser -from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper +from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper from deepdoc.parser.pdf_parser import PlainParser, VisionParser from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table @@ -379,7 +379,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections, tables = Docx()(filename, binary) if vision_model: - figures_data = vision_figure_parser_figure_data_wraper(sections) + figures_data = vision_figure_parser_figure_data_wrapper(sections) try: docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs) boosted_figures = docx_vision_parser(callback=callback) diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index fc5cc575b..ed76396a3 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -21,7 +21,7 @@ import sys import threading import time -from api.utils.log_utils import initRootLogger, get_project_base_directory +from api.utils.log_utils import init_root_logger, get_project_base_directory from graphrag.general.index import run_graphrag from graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache from rag.prompts import keyword_extraction, question_proposal, content_tagging @@ -773,5 +773,5 @@ async def main(): if __name__ == "__main__": faulthandler.enable() - initRootLogger(CONSUMER_NAME) + init_root_logger(CONSUMER_NAME) trio.run(main)