class RagPipelineTransformApi(Resource):
    """Console endpoint that converts an existing dataset into a RAG pipeline dataset."""

    @setup_required
    @login_required
    @account_initialization_required
    def post(self, dataset_id):
        """Transform the dataset identified by ``dataset_id``.

        Args:
            dataset_id: Dataset identifier taken from the URL; it is converted
                to ``str`` before being handed to the service.

        Returns:
            ``{"message": "success"}`` once the service call returns.
        """
        RagPipelineTransformService().transform_dataset(str(dataset_id))
        return {"message": "success"}


# The URL variable name must match the ``dataset_id`` parameter of ``post``;
# without it Flask would dispatch the request with no dataset id at all.
api.add_resource(
    RagPipelineTransformApi,
    "/rag/pipelines/transform/datasets/<uuid:dataset_id>",
)
reranking_mode: str = "reranking_model" - reranking_enable: bool = True + reranking_mode: Optional[str] = "reranking_model" + reranking_enable: Optional[bool] = True reranking_model: Optional[RerankingModelConfig] = None weights: Optional[WeightedScoreConfig] = None diff --git a/api/services/rag_pipeline/rag_pipeline.py b/api/services/rag_pipeline/rag_pipeline.py index 9c432208e6..4c59610e79 100644 --- a/api/services/rag_pipeline/rag_pipeline.py +++ b/api/services/rag_pipeline/rag_pipeline.py @@ -54,7 +54,7 @@ from core.workflow.workflow_entry import WorkflowEntry from extensions.ext_database import db from libs.infinite_scroll_pagination import InfiniteScrollPagination from models.account import Account -from models.dataset import Document, Pipeline, PipelineCustomizedTemplate # type: ignore +from models.dataset import Dataset, Document, Pipeline, PipelineCustomizedTemplate # type: ignore from models.enums import WorkflowRunTriggeredFrom from models.model import EndUser from models.workflow import ( @@ -72,7 +72,7 @@ from services.entities.knowledge_entities.rag_pipeline_entities import ( ) from services.errors.app import WorkflowHashNotEqualError from services.rag_pipeline.pipeline_template.pipeline_template_factory import PipelineTemplateRetrievalFactory -from services.workflow_draft_variable_service import DraftVarLoader, DraftVariableSaver +from services.workflow_draft_variable_service import DraftVariableSaver, DraftVarLoader logger = logging.getLogger(__name__) diff --git a/api/services/rag_pipeline/rag_pipeline_dsl_service.py b/api/services/rag_pipeline/rag_pipeline_dsl_service.py index 06dfe7bf83..c130799a3d 100644 --- a/api/services/rag_pipeline/rag_pipeline_dsl_service.py +++ b/api/services/rag_pipeline/rag_pipeline_dsl_service.py @@ -233,6 +233,7 @@ class RagPipelineDslService: ) dataset = pipeline.dataset if dataset: + self._session.merge(dataset) dataset_name = dataset.name # If major version mismatch, store import info in Redis @@ -291,7 +292,7 @@ 
class RagPipelineDslService: if not dataset: dataset = Dataset( tenant_id=account.current_tenant_id, - name=name, + name=name + datetime.now(UTC).strftime("%Y%m%d%H%M%S%f"), description=description, icon_info={ "type": icon_type, diff --git a/api/services/rag_pipeline/rag_pipeline_transform_service.py b/api/services/rag_pipeline/rag_pipeline_transform_service.py new file mode 100644 index 0000000000..4fce28990b --- /dev/null +++ b/api/services/rag_pipeline/rag_pipeline_transform_service.py @@ -0,0 +1,210 @@ +import json +from datetime import UTC, datetime +from pathlib import Path +from uuid import uuid4 + +import yaml +from flask_login import current_user + +from constants import DOCUMENT_EXTENSIONS +from extensions.ext_database import db +from factories import variable_factory +from models.dataset import Dataset, Pipeline +from models.workflow import Workflow, WorkflowType +from services.entities.knowledge_entities.rag_pipeline_entities import KnowledgeConfiguration, RetrievalSetting + + +class RagPipelineTransformService: + + + def transform_dataset(self, dataset_id: str): + dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first() + if not dataset: + raise ValueError("Dataset not found") + if dataset.pipeline_id and dataset.runtime_mode == "rag_pipeline": + return + if dataset.provider != "vendor": + raise ValueError("External dataset is not supported") + datasource_type = dataset.data_source_type + indexing_technique = dataset.indexing_technique + + if not datasource_type and not indexing_technique: + return + doc_form = dataset.doc_form + if not doc_form: + return + retrieval_model = dataset.retrieval_model + pipeline_yaml = self._get_transform_yaml(doc_form, datasource_type, indexing_technique) + # Extract app data + workflow_data = pipeline_yaml.get("workflow") + graph = workflow_data.get("graph", {}) + nodes = graph.get("nodes", []) + new_nodes = [] + + for node in nodes: + if node.get("data", {}).get("type") == "datasource" and 
node.get("data", {}).get("provider_type") == "local_file": + node = self._deal_file_extensions(node) + if node.get("data", {}).get("type") == "knowledge-index": + node = self._deal_knowledge_index(dataset, doc_form, indexing_technique, retrieval_model, node) + new_nodes.append(node) + if new_nodes: + graph["nodes"] = new_nodes + workflow_data["graph"] = graph + pipeline_yaml["workflow"] = workflow_data + # create pipeline + pipeline = self._create_pipeline(pipeline_yaml) + + # save chunk structure to dataset + if doc_form == "hierarchical_model": + dataset.chunk_structure = "hierarchical_model" + elif doc_form == "text_model": + dataset.chunk_structure = "text_model" + else: + raise ValueError("Unsupported doc form") + + dataset.runtime_mode = "rag_pipeline" + dataset.pipeline_id = pipeline.id + + db.session.commit() + + def _get_transform_yaml(self, doc_form: str, datasource_type: str, indexing_technique: str): + if doc_form == "text_model": + match datasource_type: + case "upload_file": + if indexing_technique == "high_quality": + # get graph from transform.file-general-high-quality.yml + with open(f"{Path(__file__).parent}/transform/file-general-high-quality.yml", "r") as f: + pipeline_yaml = yaml.safe_load(f) + if indexing_technique == "economy": + # get graph from transform.file-general-economy.yml + with open(f"{Path(__file__).parent}/transform/file-general-economy.yml", "r") as f: + pipeline_yaml = yaml.safe_load(f) + case "notion_import": + if indexing_technique == "high_quality": + # get graph from transform.notion-general-high-quality.yml + with open(f"{Path(__file__).parent}/transform/notion-general-high-quality.yml", "r") as f: + pipeline_yaml = yaml.safe_load(f) + if indexing_technique == "economy": + # get graph from transform.notion-general-economy.yml + with open(f"{Path(__file__).parent}/transform/notion-general-economy.yml", "r") as f: + pipeline_yaml = yaml.safe_load(f) + case "website_crawl": + if indexing_technique == "high_quality": + # get 
graph from transform.website-crawl-general-high-quality.yml + with open(f"{Path(__file__).parent}/transform/website-crawl-general-high-quality.yml", "r") as f: + pipeline_yaml = yaml.safe_load(f) + if indexing_technique == "economy": + # get graph from transform.website-crawl-general-economy.yml + with open(f"{Path(__file__).parent}/transform/website-crawl-general-economy.yml", "r") as f: + pipeline_yaml = yaml.safe_load(f) + case _: + raise ValueError("Unsupported datasource type") + elif doc_form == "hierarchical_model": + match datasource_type: + case "upload_file": + # get graph from transform.file-parent-child.yml + with open(f"{Path(__file__).parent}/transform/file-parent-child.yml", "r") as f: + pipeline_yaml = yaml.safe_load(f) + case "notion_import": + # get graph from transform.notion-parent-child.yml + with open(f"{Path(__file__).parent}/transform/notion-parent-child.yml", "r") as f: + pipeline_yaml = yaml.safe_load(f) + case "website_crawl": + # get graph from transform.website-crawl-parent-child.yml + with open(f"{Path(__file__).parent}/transform/website-crawl-parent-child.yml", "r") as f: + pipeline_yaml = yaml.safe_load(f) + case _: + raise ValueError("Unsupported datasource type") + else: + raise ValueError("Unsupported doc form") + return pipeline_yaml + + def _deal_file_extensions(self, node: dict): + file_extensions = node.get("data", {}).get("fileExtensions", []) + if not file_extensions: + return node + file_extensions = [file_extension.lower() for file_extension in file_extensions] + node["data"]["fileExtensions"] = DOCUMENT_EXTENSIONS + return node + + def _deal_knowledge_index(self, dataset: Dataset, doc_form: str, indexing_technique: str, retrieval_model: dict, node: dict): + knowledge_configuration = node.get("data", {}) + knowledge_configuration = KnowledgeConfiguration(**knowledge_configuration) + + if indexing_technique == "high_quality": + knowledge_configuration.embedding_model = dataset.embedding_model + 
knowledge_configuration.embedding_model_provider = dataset.embedding_model_provider + retrieval_setting = RetrievalSetting(**retrieval_model) + if indexing_technique == "economy": + retrieval_setting.search_method = "keyword_search" + knowledge_configuration.retrieval_model = retrieval_setting + + return knowledge_configuration.model_dump() + + def _create_pipeline( + self, + data: dict, + ) -> Pipeline: + """Create a new app or update an existing one.""" + pipeline_data = data.get("rag_pipeline", {}) + # Initialize pipeline based on mode + workflow_data = data.get("workflow") + if not workflow_data or not isinstance(workflow_data, dict): + raise ValueError("Missing workflow data for rag pipeline") + + environment_variables_list = workflow_data.get("environment_variables", []) + environment_variables = [ + variable_factory.build_environment_variable_from_mapping(obj) for obj in environment_variables_list + ] + conversation_variables_list = workflow_data.get("conversation_variables", []) + conversation_variables = [ + variable_factory.build_conversation_variable_from_mapping(obj) for obj in conversation_variables_list + ] + rag_pipeline_variables_list = workflow_data.get("rag_pipeline_variables", []) + + graph = workflow_data.get("graph", {}) + + # Create new app + pipeline = Pipeline() + pipeline.id = str(uuid4()) + pipeline.tenant_id = current_user.current_tenant_id + pipeline.name = pipeline_data.get("name", "") + pipeline.description = pipeline_data.get("description", "") + pipeline.created_by = current_user.id + pipeline.updated_by = current_user.id + pipeline.is_published = True + pipeline.is_public = True + + db.session.add(pipeline) + db.session.flush() + # create draft workflow + draft_workflow = Workflow( + tenant_id=pipeline.tenant_id, + app_id=pipeline.id, + features="{}", + type=WorkflowType.RAG_PIPELINE.value, + version="draft", + graph=json.dumps(graph), + created_by=current_user.id, + environment_variables=environment_variables, + 
conversation_variables=conversation_variables, + rag_pipeline_variables=rag_pipeline_variables_list, + ) + published_workflow = Workflow( + tenant_id=pipeline.tenant_id, + app_id=pipeline.id, + features="{}", + type=WorkflowType.RAG_PIPELINE.value, + version=str(datetime.now(UTC).replace(tzinfo=None)), + graph=json.dumps(graph), + created_by=current_user.id, + environment_variables=environment_variables, + conversation_variables=conversation_variables, + rag_pipeline_variables=rag_pipeline_variables_list, + ) + db.session.add(draft_workflow) + db.session.add(published_workflow) + db.session.flush() + pipeline.workflow_id = published_workflow.id + db.session.add(pipeline) + return pipeline diff --git a/api/services/rag_pipeline/transform/file-general-economy.yml b/api/services/rag_pipeline/transform/file-general-economy.yml new file mode 100644 index 0000000000..f88c0f3cdc --- /dev/null +++ b/api/services/rag_pipeline/transform/file-general-economy.yml @@ -0,0 +1,708 @@ +dependencies: +- current_identifier: null + type: package + value: + plugin_unique_identifier: langgenius/general_chunk:0.0.2@0856fa64f7b0dc937c982f12d45b3a1ad91ba8aacc0d28a1b436e6c94a77e298 +- current_identifier: null + type: package + value: + plugin_unique_identifier: langgenius/dify_extractor:0.0.4@0cb3f06230a377c4c037fa7b5e21f4f4e362e5f24a59ed7bf4950ff75e6f1e61 +kind: rag_pipeline +rag_pipeline: + description: '' + icon: 📙 + icon_background: '' + icon_type: emoji + name: file-general-economy +version: 0.1.0 +workflow: + conversation_variables: [] + environment_variables: [] + features: {} + graph: + edges: + - data: + isInIteration: false + isInLoop: false + sourceType: datasource + targetType: if-else + id: 1752479895761-source-1752481129417-target + source: '1752479895761' + sourceHandle: source + target: '1752481129417' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: if-else + targetType: tool + id: 
1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target + source: '1752481129417' + sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7 + target: '1752480460682' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: if-else + targetType: document-extractor + id: 1752481129417-false-1752481112180-target + source: '1752481129417' + sourceHandle: 'false' + target: '1752481112180' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: tool + targetType: variable-aggregator + id: 1752480460682-source-1752482022496-target + source: '1752480460682' + sourceHandle: source + target: '1752482022496' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: document-extractor + targetType: variable-aggregator + id: 1752481112180-source-1752482022496-target + source: '1752481112180' + sourceHandle: source + target: '1752482022496' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: variable-aggregator + targetType: tool + id: 1752482022496-source-1752482151668-target + source: '1752482022496' + sourceHandle: source + target: '1752482151668' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: tool + targetType: knowledge-index + id: 1752482151668-source-1752477924228-target + source: '1752482151668' + sourceHandle: source + target: '1752477924228' + targetHandle: target + type: custom + zIndex: 0 + nodes: + - data: + chunk_structure: text_model + embedding_model: text-embedding-ada-002 + embedding_model_provider: langgenius/openai/openai + index_chunk_variable_selector: + - '1752482151668' + - result + indexing_technique: economy + keyword_number: 10 + retrieval_model: + score_threshold: 0.5 + score_threshold_enabled: false + search_method: keyword_search + top_k: 3 + vector_setting: + embedding_model_name: 
text-embedding-ada-002 + embedding_provider_name: langgenius/openai/openai + selected: true + title: 知识库 + type: knowledge-index + height: 114 + id: '1752477924228' + position: + x: 1076.4656678451215 + y: 281.3910724383104 + positionAbsolute: + x: 1076.4656678451215 + y: 281.3910724383104 + selected: true + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + datasource_configurations: {} + datasource_label: File + datasource_name: upload-file + datasource_parameters: {} + fileExtensions: + - txt + - markdown + - mdx + - pdf + - html + - xlsx + - xls + - vtt + - properties + - doc + - docx + - csv + - eml + - msg + - pptx + - xml + - epub + - ppt + - md + plugin_id: langgenius/file + provider_name: file + provider_type: local_file + selected: false + title: File + type: datasource + height: 52 + id: '1752479895761' + position: + x: -839.8603427660498 + y: 251.3910724383104 + positionAbsolute: + x: -839.8603427660498 + y: 251.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + output_schema: + properties: + documents: + description: the documents extracted from the file + items: + type: object + type: array + images: + description: The images extracted from the file + items: + type: object + type: array + type: object + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg, + jpeg) + ja_JP: the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg, + jpeg) + pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png, + jpg, jpeg) + zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg) + label: + en_US: file + ja_JP: file + pt_BR: file + zh_Hans: file + llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx, + png, jpg, jpeg) + max: null + min: null + name: file + options: [] + 
placeholder: null + precision: null + required: true + scope: null + template: null + type: file + params: + file: '' + provider_id: langgenius/dify_extractor/dify_extractor + provider_name: langgenius/dify_extractor/dify_extractor + provider_type: builtin + selected: false + title: Dify文本提取器 + tool_configurations: {} + tool_description: Dify Extractor + tool_label: Dify文本提取器 + tool_name: dify_extractor + tool_parameters: + file: + type: variable + value: + - '1752479895761' + - file + type: tool + height: 52 + id: '1752480460682' + position: + x: -108.28652292656551 + y: 281.3910724383104 + positionAbsolute: + x: -108.28652292656551 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_array_file: false + selected: false + title: 文档提取器 + type: document-extractor + variable_selector: + - '1752479895761' + - file + height: 90 + id: '1752481112180' + position: + x: -108.28652292656551 + y: 390.6576481692478 + positionAbsolute: + x: -108.28652292656551 + y: 390.6576481692478 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + cases: + - case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7 + conditions: + - comparison_operator: is + id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d + value: .xlsx + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: d0e88f5e-dfe3-4bae-af0c-dbec267500de + value: .xls + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d + value: .md + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73 + value: .markdown + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: f9541513-1e71-4dc1-9db5-35dc84a39e3c + value: .mdx + 
varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d + value: .html + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1 + value: .htm + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2 + value: .docx + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8 + value: .csv + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602 + value: .txt + varType: file + variable_selector: + - '1752479895761' + - file + - extension + id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7 + logical_operator: or + selected: false + title: 条件分支 + type: if-else + height: 358 + id: '1752481129417' + position: + x: -489.57009543377865 + y: 251.3910724383104 + positionAbsolute: + x: -489.57009543377865 + y: 251.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + advanced_settings: + group_enabled: false + groups: + - groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7 + group_name: Group1 + output_type: string + variables: + - - '1752481112180' + - text + - - '1752480460682' + - text + output_type: string + selected: false + title: 变量聚合器 + type: variable-aggregator + variables: + - - '1752481112180' + - text + - - '1752480460682' + - text + height: 129 + id: '1752482022496' + position: + x: 319.441649575055 + y: 281.3910724383104 + positionAbsolute: + x: 319.441649575055 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + output_schema: + properties: + result: + 
description: The result of the general chunk tool. + properties: + general_chunks: + items: + description: The chunk of the text. + type: string + type: array + type: object + type: object + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: The text you want to chunk. + ja_JP: The text you want to chunk. + pt_BR: The text you want to chunk. + zh_Hans: 你想要分块的文本。 + label: + en_US: Input Variable + ja_JP: Input Variable + pt_BR: Input Variable + zh_Hans: 输入变量 + llm_description: The text you want to chunk. + max: null + min: null + name: input_variable + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: null + form: llm + human_description: + en_US: The delimiter of the chunks. + ja_JP: The delimiter of the chunks. + pt_BR: The delimiter of the chunks. + zh_Hans: 块的分隔符。 + label: + en_US: Delimiter + ja_JP: Delimiter + pt_BR: Delimiter + zh_Hans: 分隔符 + llm_description: The delimiter of the chunks, the format of the delimiter + must be a string. + max: null + min: null + name: delimiter + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: null + form: llm + human_description: + en_US: The maximum chunk length. + ja_JP: The maximum chunk length. + pt_BR: The maximum chunk length. + zh_Hans: 最大块的长度。 + label: + en_US: Maximum Chunk Length + ja_JP: Maximum Chunk Length + pt_BR: Maximum Chunk Length + zh_Hans: 最大块的长度 + llm_description: The maximum chunk length, the format of the chunk size + must be an integer. + max: null + min: null + name: max_chunk_length + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: number + - auto_generate: null + default: null + form: llm + human_description: + en_US: The chunk overlap length. + ja_JP: The chunk overlap length. 
+ pt_BR: The chunk overlap length. + zh_Hans: 块的重叠长度。 + label: + en_US: Chunk Overlap Length + ja_JP: Chunk Overlap Length + pt_BR: Chunk Overlap Length + zh_Hans: 块的重叠长度 + llm_description: The chunk overlap length, the format of the chunk overlap + length must be an integer. + max: null + min: null + name: chunk_overlap_length + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: number + - auto_generate: null + default: null + form: llm + human_description: + en_US: Replace consecutive spaces, newlines and tabs + ja_JP: Replace consecutive spaces, newlines and tabs + pt_BR: Replace consecutive spaces, newlines and tabs + zh_Hans: 替换连续的空格、换行符和制表符 + label: + en_US: Replace Consecutive Spaces, Newlines and Tabs + ja_JP: Replace Consecutive Spaces, Newlines and Tabs + pt_BR: Replace Consecutive Spaces, Newlines and Tabs + zh_Hans: 替换连续的空格、换行符和制表符 + llm_description: Replace consecutive spaces, newlines and tabs, the format + of the replace must be a boolean. + max: null + min: null + name: replace_consecutive_spaces_newlines_tabs + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + - auto_generate: null + default: null + form: llm + human_description: + en_US: Delete all URLs and email addresses + ja_JP: Delete all URLs and email addresses + pt_BR: Delete all URLs and email addresses + zh_Hans: 删除所有URL和电子邮件地址 + label: + en_US: Delete All URLs and Email Addresses + ja_JP: Delete All URLs and Email Addresses + pt_BR: Delete All URLs and Email Addresses + zh_Hans: 删除所有URL和电子邮件地址 + llm_description: Delete all URLs and email addresses, the format of the + delete must be a boolean. 
+ max: null + min: null + name: delete_all_urls_and_email_addresses + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + params: + chunk_overlap_length: '' + delete_all_urls_and_email_addresses: '' + delimiter: '' + input_variable: '' + max_chunk_length: '' + replace_consecutive_spaces_newlines_tabs: '' + provider_id: langgenius/general_chunk/general_chunk + provider_name: langgenius/general_chunk/general_chunk + provider_type: builtin + selected: false + title: 通用文本分块 + tool_configurations: {} + tool_description: 一个用于通用文本分块模式的工具,检索和召回的块是相同的。 + tool_label: 通用文本分块 + tool_name: general_chunk + tool_parameters: + chunk_overlap_length: + type: variable + value: + - rag + - shared + - chunk_overlap + delete_all_urls_and_email_addresses: + type: mixed + value: '{{#rag.shared.delete_urls_email#}}' + delimiter: + type: mixed + value: '{{#rag.shared.delimiter#}}' + input_variable: + type: mixed + value: '{{#1752482022496.output#}}' + max_chunk_length: + type: variable + value: + - rag + - shared + - max_chunk_length + replace_consecutive_spaces_newlines_tabs: + type: mixed + value: '{{#rag.shared.replace_consecutive_spaces#}}' + type: tool + height: 52 + id: '1752482151668' + position: + x: 693.5300771507484 + y: 281.3910724383104 + positionAbsolute: + x: 693.5300771507484 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + viewport: + x: 701.4999626224237 + y: 128.33739021504016 + zoom: 0.48941689643726966 + rag_pipeline_variables: + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: delimiter + max_length: 100 + options: [] + placeholder: null + required: true + tooltips: null + type: text-input + unit: null + variable: delimiter + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: 
shared + default_value: null + label: Maximum chunk length + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: null + variable: max_chunk_length + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Chunk overlap + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: number + unit: null + variable: chunk_overlap + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Replace consecutive spaces, newlines and tabs + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: replace_consecutive_spaces + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Delete all URLs and email addresses + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: delete_urls_email diff --git a/api/services/rag_pipeline/transform/file-general-high-quality.yml b/api/services/rag_pipeline/transform/file-general-high-quality.yml new file mode 100644 index 0000000000..42174d1986 --- /dev/null +++ b/api/services/rag_pipeline/transform/file-general-high-quality.yml @@ -0,0 +1,708 @@ +dependencies: +- current_identifier: null + type: package + value: + plugin_unique_identifier: langgenius/general_chunk:0.0.2@0856fa64f7b0dc937c982f12d45b3a1ad91ba8aacc0d28a1b436e6c94a77e298 +- current_identifier: null + type: package + value: + plugin_unique_identifier: langgenius/dify_extractor:0.0.4@0cb3f06230a377c4c037fa7b5e21f4f4e362e5f24a59ed7bf4950ff75e6f1e61 +kind: rag_pipeline +rag_pipeline: + description: '' + icon: 📙 + icon_background: '#FFF4ED' + icon_type: emoji + name: 
file-general-high-quality +version: 0.1.0 +workflow: + conversation_variables: [] + environment_variables: [] + features: {} + graph: + edges: + - data: + isInIteration: false + isInLoop: false + sourceType: datasource + targetType: if-else + id: 1752479895761-source-1752481129417-target + source: '1752479895761' + sourceHandle: source + target: '1752481129417' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: if-else + targetType: tool + id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target + source: '1752481129417' + sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7 + target: '1752480460682' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: if-else + targetType: document-extractor + id: 1752481129417-false-1752481112180-target + source: '1752481129417' + sourceHandle: 'false' + target: '1752481112180' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: tool + targetType: variable-aggregator + id: 1752480460682-source-1752482022496-target + source: '1752480460682' + sourceHandle: source + target: '1752482022496' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: document-extractor + targetType: variable-aggregator + id: 1752481112180-source-1752482022496-target + source: '1752481112180' + sourceHandle: source + target: '1752482022496' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: variable-aggregator + targetType: tool + id: 1752482022496-source-1752482151668-target + source: '1752482022496' + sourceHandle: source + target: '1752482151668' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: tool + targetType: knowledge-index + id: 1752482151668-source-1752477924228-target + source: '1752482151668' + sourceHandle: source + 
target: '1752477924228' + targetHandle: target + type: custom + zIndex: 0 + nodes: + - data: + chunk_structure: text_model + embedding_model: text-embedding-ada-002 + embedding_model_provider: langgenius/openai/openai + index_chunk_variable_selector: + - '1752482151668' + - result + indexing_technique: high_quality + keyword_number: 10 + retrieval_model: + score_threshold: 0.5 + score_threshold_enabled: false + search_method: semantic_search + top_k: 3 + vector_setting: + embedding_model_name: text-embedding-ada-002 + embedding_provider_name: langgenius/openai/openai + selected: false + title: 知识库 + type: knowledge-index + height: 114 + id: '1752477924228' + position: + x: 1076.4656678451215 + y: 281.3910724383104 + positionAbsolute: + x: 1076.4656678451215 + y: 281.3910724383104 + selected: true + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + datasource_configurations: {} + datasource_label: File + datasource_name: upload-file + datasource_parameters: {} + fileExtensions: + - txt + - markdown + - mdx + - pdf + - html + - xlsx + - xls + - vtt + - properties + - doc + - docx + - csv + - eml + - msg + - pptx + - xml + - epub + - ppt + - md + plugin_id: langgenius/file + provider_name: file + provider_type: local_file + selected: false + title: File + type: datasource + height: 52 + id: '1752479895761' + position: + x: -839.8603427660498 + y: 251.3910724383104 + positionAbsolute: + x: -839.8603427660498 + y: 251.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + output_schema: + properties: + documents: + description: the documents extracted from the file + items: + type: object + type: array + images: + description: The images extracted from the file + items: + type: object + type: array + type: object + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: the file to be parsed(support pdf, 
ppt, pptx, doc, docx, png, jpg, + jpeg) + ja_JP: the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg, + jpeg) + pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png, + jpg, jpeg) + zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg) + label: + en_US: file + ja_JP: file + pt_BR: file + zh_Hans: file + llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx, + png, jpg, jpeg) + max: null + min: null + name: file + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: file + params: + file: '' + provider_id: langgenius/dify_extractor/dify_extractor + provider_name: langgenius/dify_extractor/dify_extractor + provider_type: builtin + selected: false + title: Dify文本提取器 + tool_configurations: {} + tool_description: Dify Extractor + tool_label: Dify文本提取器 + tool_name: dify_extractor + tool_parameters: + file: + type: variable + value: + - '1752479895761' + - file + type: tool + height: 52 + id: '1752480460682' + position: + x: -108.28652292656551 + y: 281.3910724383104 + positionAbsolute: + x: -108.28652292656551 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_array_file: false + selected: false + title: 文档提取器 + type: document-extractor + variable_selector: + - '1752479895761' + - file + height: 90 + id: '1752481112180' + position: + x: -108.28652292656551 + y: 390.6576481692478 + positionAbsolute: + x: -108.28652292656551 + y: 390.6576481692478 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + cases: + - case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7 + conditions: + - comparison_operator: is + id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d + value: .xlsx + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: d0e88f5e-dfe3-4bae-af0c-dbec267500de + value: .xls + varType: file 
+ variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d + value: .md + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73 + value: .markdown + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: f9541513-1e71-4dc1-9db5-35dc84a39e3c + value: .mdx + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d + value: .html + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1 + value: .htm + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2 + value: .docx + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8 + value: .csv + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602 + value: .txt + varType: file + variable_selector: + - '1752479895761' + - file + - extension + id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7 + logical_operator: or + selected: false + title: 条件分支 + type: if-else + height: 358 + id: '1752481129417' + position: + x: -489.57009543377865 + y: 251.3910724383104 + positionAbsolute: + x: -489.57009543377865 + y: 251.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + advanced_settings: + group_enabled: false + groups: + - groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7 + group_name: Group1 + output_type: string + variables: + - - '1752481112180' + - text + - - '1752480460682' + - text + 
output_type: string + selected: false + title: 变量聚合器 + type: variable-aggregator + variables: + - - '1752481112180' + - text + - - '1752480460682' + - text + height: 129 + id: '1752482022496' + position: + x: 319.441649575055 + y: 281.3910724383104 + positionAbsolute: + x: 319.441649575055 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + output_schema: + properties: + result: + description: The result of the general chunk tool. + properties: + general_chunks: + items: + description: The chunk of the text. + type: string + type: array + type: object + type: object + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: The text you want to chunk. + ja_JP: The text you want to chunk. + pt_BR: The text you want to chunk. + zh_Hans: 你想要分块的文本。 + label: + en_US: Input Variable + ja_JP: Input Variable + pt_BR: Input Variable + zh_Hans: 输入变量 + llm_description: The text you want to chunk. + max: null + min: null + name: input_variable + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: null + form: llm + human_description: + en_US: The delimiter of the chunks. + ja_JP: The delimiter of the chunks. + pt_BR: The delimiter of the chunks. + zh_Hans: 块的分隔符。 + label: + en_US: Delimiter + ja_JP: Delimiter + pt_BR: Delimiter + zh_Hans: 分隔符 + llm_description: The delimiter of the chunks, the format of the delimiter + must be a string. + max: null + min: null + name: delimiter + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: null + form: llm + human_description: + en_US: The maximum chunk length. + ja_JP: The maximum chunk length. + pt_BR: The maximum chunk length. 
+ zh_Hans: 最大块的长度。 + label: + en_US: Maximum Chunk Length + ja_JP: Maximum Chunk Length + pt_BR: Maximum Chunk Length + zh_Hans: 最大块的长度 + llm_description: The maximum chunk length, the format of the chunk size + must be an integer. + max: null + min: null + name: max_chunk_length + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: number + - auto_generate: null + default: null + form: llm + human_description: + en_US: The chunk overlap length. + ja_JP: The chunk overlap length. + pt_BR: The chunk overlap length. + zh_Hans: 块的重叠长度。 + label: + en_US: Chunk Overlap Length + ja_JP: Chunk Overlap Length + pt_BR: Chunk Overlap Length + zh_Hans: 块的重叠长度 + llm_description: The chunk overlap length, the format of the chunk overlap + length must be an integer. + max: null + min: null + name: chunk_overlap_length + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: number + - auto_generate: null + default: null + form: llm + human_description: + en_US: Replace consecutive spaces, newlines and tabs + ja_JP: Replace consecutive spaces, newlines and tabs + pt_BR: Replace consecutive spaces, newlines and tabs + zh_Hans: 替换连续的空格、换行符和制表符 + label: + en_US: Replace Consecutive Spaces, Newlines and Tabs + ja_JP: Replace Consecutive Spaces, Newlines and Tabs + pt_BR: Replace Consecutive Spaces, Newlines and Tabs + zh_Hans: 替换连续的空格、换行符和制表符 + llm_description: Replace consecutive spaces, newlines and tabs, the format + of the replace must be a boolean. 
+ max: null + min: null + name: replace_consecutive_spaces_newlines_tabs + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + - auto_generate: null + default: null + form: llm + human_description: + en_US: Delete all URLs and email addresses + ja_JP: Delete all URLs and email addresses + pt_BR: Delete all URLs and email addresses + zh_Hans: 删除所有URL和电子邮件地址 + label: + en_US: Delete All URLs and Email Addresses + ja_JP: Delete All URLs and Email Addresses + pt_BR: Delete All URLs and Email Addresses + zh_Hans: 删除所有URL和电子邮件地址 + llm_description: Delete all URLs and email addresses, the format of the + delete must be a boolean. + max: null + min: null + name: delete_all_urls_and_email_addresses + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + params: + chunk_overlap_length: '' + delete_all_urls_and_email_addresses: '' + delimiter: '' + input_variable: '' + max_chunk_length: '' + replace_consecutive_spaces_newlines_tabs: '' + provider_id: langgenius/general_chunk/general_chunk + provider_name: langgenius/general_chunk/general_chunk + provider_type: builtin + selected: false + title: 通用文本分块 + tool_configurations: {} + tool_description: 一个用于通用文本分块模式的工具,检索和召回的块是相同的。 + tool_label: 通用文本分块 + tool_name: general_chunk + tool_parameters: + chunk_overlap_length: + type: variable + value: + - rag + - shared + - chunk_overlap + delete_all_urls_and_email_addresses: + type: mixed + value: '{{#rag.shared.delete_urls_email#}}' + delimiter: + type: mixed + value: '{{#rag.shared.delimiter#}}' + input_variable: + type: mixed + value: '{{#1752482022496.output#}}' + max_chunk_length: + type: variable + value: + - rag + - shared + - max_chunk_length + replace_consecutive_spaces_newlines_tabs: + type: mixed + value: '{{#rag.shared.replace_consecutive_spaces#}}' + type: tool + height: 52 + id: '1752482151668' + position: + x: 693.5300771507484 + y: 
281.3910724383104 + positionAbsolute: + x: 693.5300771507484 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + viewport: + x: 701.4999626224237 + y: 128.33739021504016 + zoom: 0.48941689643726966 + rag_pipeline_variables: + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: delimiter + max_length: 100 + options: [] + placeholder: null + required: true + tooltips: null + type: text-input + unit: null + variable: delimiter + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Maximum chunk length + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: null + variable: max_chunk_length + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Chunk overlap + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: number + unit: null + variable: chunk_overlap + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Replace consecutive spaces, newlines and tabs + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: replace_consecutive_spaces + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Delete all URLs and email addresses + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: delete_urls_email diff --git a/api/services/rag_pipeline/transform/file-parentchild.yml 
b/api/services/rag_pipeline/transform/file-parentchild.yml new file mode 100644 index 0000000000..4135ab3aa4 --- /dev/null +++ b/api/services/rag_pipeline/transform/file-parentchild.yml @@ -0,0 +1,816 @@ +dependencies: +- current_identifier: null + type: package + value: + plugin_unique_identifier: langgenius/parent_child_chunk:0.0.1@f8f9ba1f3bcda159ebc0168baa755c2181b923da8157ebb439b8046019f5b510 +- current_identifier: null + type: package + value: + plugin_unique_identifier: langgenius/dify_extractor:0.0.4@0cb3f06230a377c4c037fa7b5e21f4f4e362e5f24a59ed7bf4950ff75e6f1e61 +kind: rag_pipeline +rag_pipeline: + description: '' + icon: 📙 + icon_background: '#FFF4ED' + icon_type: emoji + name: file-parentchild +version: 0.1.0 +workflow: + conversation_variables: [] + environment_variables: [] + features: {} + graph: + edges: + - data: + isInIteration: false + isInLoop: false + sourceType: datasource + targetType: if-else + id: 1752479895761-source-1752481129417-target + source: '1752479895761' + sourceHandle: source + target: '1752481129417' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: if-else + targetType: tool + id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target + source: '1752481129417' + sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7 + target: '1752480460682' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: if-else + targetType: document-extractor + id: 1752481129417-false-1752481112180-target + source: '1752481129417' + sourceHandle: 'false' + target: '1752481112180' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: tool + targetType: variable-aggregator + id: 1752480460682-source-1752482022496-target + source: '1752480460682' + sourceHandle: source + target: '1752482022496' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: document-extractor 
+ targetType: variable-aggregator + id: 1752481112180-source-1752482022496-target + source: '1752481112180' + sourceHandle: source + target: '1752482022496' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: variable-aggregator + targetType: tool + id: 1752482022496-source-1752575473519-target + source: '1752482022496' + sourceHandle: source + target: '1752575473519' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: tool + targetType: knowledge-index + id: 1752575473519-source-1752477924228-target + source: '1752575473519' + sourceHandle: source + target: '1752477924228' + targetHandle: target + type: custom + zIndex: 0 + nodes: + - data: + chunk_structure: hierarchical_model + embedding_model: text-embedding-ada-002 + embedding_model_provider: langgenius/openai/openai + index_chunk_variable_selector: + - '1752575473519' + - result + indexing_technique: high_quality + keyword_number: 10 + retrieval_model: + score_threshold: 0.5 + score_threshold_enabled: false + search_method: semantic_search + top_k: 3 + vector_setting: + embedding_model_name: text-embedding-ada-002 + embedding_provider_name: langgenius/openai/openai + selected: false + title: 知识库 + type: knowledge-index + height: 114 + id: '1752477924228' + position: + x: 994.3774545394483 + y: 281.3910724383104 + positionAbsolute: + x: 994.3774545394483 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + datasource_configurations: {} + datasource_label: File + datasource_name: upload-file + datasource_parameters: {} + fileExtensions: + - txt + - markdown + - mdx + - pdf + - html + - xlsx + - xls + - vtt + - properties + - doc + - docx + - csv + - eml + - msg + - pptx + - xml + - epub + - ppt + - md + plugin_id: langgenius/file + provider_name: file + provider_type: local_file + selected: false + title: File + type: datasource + 
height: 52 + id: '1752479895761' + position: + x: -839.8603427660498 + y: 251.3910724383104 + positionAbsolute: + x: -839.8603427660498 + y: 251.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + output_schema: + properties: + documents: + description: the documents extracted from the file + items: + type: object + type: array + images: + description: The images extracted from the file + items: + type: object + type: array + type: object + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg, + jpeg) + ja_JP: the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg, + jpeg) + pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png, + jpg, jpeg) + zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg) + label: + en_US: file + ja_JP: file + pt_BR: file + zh_Hans: file + llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx, + png, jpg, jpeg) + max: null + min: null + name: file + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: file + params: + file: '' + provider_id: langgenius/dify_extractor/dify_extractor + provider_name: langgenius/dify_extractor/dify_extractor + provider_type: builtin + selected: false + title: Dify文本提取器 + tool_configurations: {} + tool_description: Dify Extractor + tool_label: Dify文本提取器 + tool_name: dify_extractor + tool_parameters: + file: + type: variable + value: + - '1752479895761' + - file + type: tool + height: 52 + id: '1752480460682' + position: + x: -108.28652292656551 + y: 281.3910724383104 + positionAbsolute: + x: -108.28652292656551 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_array_file: false + selected: false + title: 文档提取器 + type: 
document-extractor + variable_selector: + - '1752479895761' + - file + height: 90 + id: '1752481112180' + position: + x: -108.28652292656551 + y: 390.6576481692478 + positionAbsolute: + x: -108.28652292656551 + y: 390.6576481692478 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + cases: + - case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7 + conditions: + - comparison_operator: is + id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d + value: .xlsx + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: d0e88f5e-dfe3-4bae-af0c-dbec267500de + value: .xls + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d + value: .md + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73 + value: .markdown + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: f9541513-1e71-4dc1-9db5-35dc84a39e3c + value: .mdx + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d + value: .html + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1 + value: .htm + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2 + value: .docx + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8 + value: .csv + varType: file + variable_selector: + - '1752479895761' + - file + - extension + - comparison_operator: is + id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602 + value: .txt + varType: 
file + variable_selector: + - '1752479895761' + - file + - extension + id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7 + logical_operator: or + selected: false + title: 条件分支 + type: if-else + height: 358 + id: '1752481129417' + position: + x: -512.2335487893622 + y: 251.3910724383104 + positionAbsolute: + x: -512.2335487893622 + y: 251.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + advanced_settings: + group_enabled: false + groups: + - groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7 + group_name: Group1 + output_type: string + variables: + - - '1752481112180' + - text + - - '1752480460682' + - text + output_type: string + selected: false + title: 变量聚合器 + type: variable-aggregator + variables: + - - '1752481112180' + - text + - - '1752480460682' + - text + height: 129 + id: '1752482022496' + position: + x: 319.441649575055 + y: 281.3910724383104 + positionAbsolute: + x: 319.441649575055 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + output_schema: + properties: + result: + description: Parent child chunks result + items: + type: object + type: array + type: object + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: The text you want to chunk. + ja_JP: The text you want to chunk. + pt_BR: The text you want to chunk. + zh_Hans: 你想要分块的文本。 + label: + en_US: Input text + ja_JP: Input text + pt_BR: Input text + zh_Hans: 输入文本 + llm_description: The text you want to chunk. 
+ max: null + min: null + name: input_text + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: 1024 + form: llm + human_description: + en_US: Maximum length for chunking + ja_JP: Maximum length for chunking + pt_BR: Comprimento máximo para divisão + zh_Hans: 用于分块的最大长度 + label: + en_US: Maximum Length + ja_JP: Maximum Length + pt_BR: Comprimento Máximo + zh_Hans: 最大长度 + llm_description: Maximum length allowed per chunk + max: null + min: null + name: max_length + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: number + - auto_generate: null + default: ' + + + ' + form: llm + human_description: + en_US: Separator used for chunking + ja_JP: Separator used for chunking + pt_BR: Separador usado para divisão + zh_Hans: 用于分块的分隔符 + label: + en_US: Chunk Separator + ja_JP: Chunk Separator + pt_BR: Separador de Divisão + zh_Hans: 分块分隔符 + llm_description: The separator used to split chunks + max: null + min: null + name: separator + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: string + - auto_generate: null + default: 512 + form: llm + human_description: + en_US: Maximum length for subchunking + ja_JP: Maximum length for subchunking + pt_BR: Comprimento máximo para subdivisão + zh_Hans: 用于子分块的最大长度 + label: + en_US: Subchunk Maximum Length + ja_JP: Subchunk Maximum Length + pt_BR: Comprimento Máximo de Subdivisão + zh_Hans: 子分块最大长度 + llm_description: Maximum length allowed per subchunk + max: null + min: null + name: subchunk_max_length + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: number + - auto_generate: null + default: '. 
' + form: llm + human_description: + en_US: Separator used for subchunking + ja_JP: Separator used for subchunking + pt_BR: Separador usado para subdivisão + zh_Hans: 用于子分块的分隔符 + label: + en_US: Subchunk Separator + ja_JP: Subchunk Separator + pt_BR: Separador de Subdivisão + zh_Hans: 子分块分隔符 + llm_description: The separator used to split subchunks + max: null + min: null + name: subchunk_separator + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: string + - auto_generate: null + default: paragraph + form: llm + human_description: + en_US: Split text into paragraphs based on separator and maximum chunk + length, using split text as parent block or entire document as parent + block and directly retrieve. + ja_JP: Split text into paragraphs based on separator and maximum chunk + length, using split text as parent block or entire document as parent + block and directly retrieve. + pt_BR: Dividir texto em parágrafos com base no separador e no comprimento + máximo do bloco, usando o texto dividido como bloco pai ou documento + completo como bloco pai e diretamente recuperá-lo. + zh_Hans: 根据分隔符和最大块长度将文本拆分为段落,使用拆分文本作为检索的父块或整个文档用作父块并直接检索。 + label: + en_US: Parent Mode + ja_JP: Parent Mode + pt_BR: Modo Pai + zh_Hans: 父块模式 + llm_description: Split text into paragraphs based on separator and maximum + chunk length, using split text as parent block or entire document as parent + block and directly retrieve. 
+ max: null + min: null + name: parent_mode + options: + - icon: '' + label: + en_US: Paragraph + ja_JP: Paragraph + pt_BR: Parágrafo + zh_Hans: 段落 + value: paragraph + - icon: '' + label: + en_US: Full Document + ja_JP: Full Document + pt_BR: Documento Completo + zh_Hans: 全文 + value: full_doc + placeholder: null + precision: null + required: true + scope: null + template: null + type: select + - auto_generate: null + default: 0 + form: llm + human_description: + en_US: Whether to remove extra spaces in the text + ja_JP: Whether to remove extra spaces in the text + pt_BR: Se deve remover espaços extras no texto + zh_Hans: 是否移除文本中的多余空格 + label: + en_US: Remove Extra Spaces + ja_JP: Remove Extra Spaces + pt_BR: Remover Espaços Extras + zh_Hans: 移除多余空格 + llm_description: Whether to remove extra spaces in the text + max: null + min: null + name: remove_extra_spaces + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + - auto_generate: null + default: 0 + form: llm + human_description: + en_US: Whether to remove URLs and emails in the text + ja_JP: Whether to remove URLs and emails in the text + pt_BR: Se deve remover URLs e e-mails no texto + zh_Hans: 是否移除文本中的URL和电子邮件地址 + label: + en_US: Remove URLs and Emails + ja_JP: Remove URLs and Emails + pt_BR: Remover URLs e E-mails + zh_Hans: 移除URL和电子邮件地址 + llm_description: Whether to remove URLs and emails in the text + max: null + min: null + name: remove_urls_emails + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + params: + input_text: '' + max_length: '' + parent_mode: '' + remove_extra_spaces: '' + remove_urls_emails: '' + separator: '' + subchunk_max_length: '' + subchunk_separator: '' + provider_id: langgenius/parent_child_chunk/parent_child_chunk + provider_name: langgenius/parent_child_chunk/parent_child_chunk + provider_type: builtin + selected: false + title: 父子分块处理器 + 
tool_configurations: {} + tool_description: 将文档处理为父子分块结构 + tool_label: 父子分块处理器 + tool_name: parent_child_chunk + tool_parameters: + input_text: + type: mixed + value: '{{#1752482022496.output#}}' + max_length: + type: variable + value: + - rag + - shared + - max_chunk_length + parent_mode: + type: variable + value: + - rag + - shared + - parent_mode + remove_extra_spaces: + type: mixed + value: '{{#rag.shared.replace_consecutive_spaces#}}' + remove_urls_emails: + type: mixed + value: '{{#rag.shared.delete_urls_email#}}' + separator: + type: mixed + value: '{{#rag.shared.delimiter#}}' + subchunk_max_length: + type: variable + value: + - rag + - shared + - child_max_chunk_length + subchunk_separator: + type: mixed + value: '{{#rag.shared.child_delimiter#}}' + type: tool + height: 52 + id: '1752575473519' + position: + x: 637.9241611063885 + y: 281.3910724383104 + positionAbsolute: + x: 637.9241611063885 + y: 281.3910724383104 + selected: true + sourcePosition: right + targetPosition: left + type: custom + width: 242 + viewport: + x: 948.6766333808323 + y: -102.06757184183238 + zoom: 0.8375774577380971 + rag_pipeline_variables: + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: \n\n + label: Delimiter + max_length: 256 + options: [] + placeholder: null + required: true + tooltips: A delimiter is the character used to separate text. \n\n is recommended + for splitting the original document into large parent chunks. You can also use + special delimiters defined by yourself. 
+ type: text-input + unit: null + variable: delimiter + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: 1024 + label: Maximum chunk length + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: characters + variable: max_chunk_length + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: \n + label: Child delimiter + max_length: 256 + options: [] + placeholder: null + required: true + tooltips: A delimiter is the character used to separate text. \n\n is recommended + for splitting the original document into large parent chunks. You can also use + special delimiters defined by yourself. + type: text-input + unit: null + variable: child_delimiter + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: 512 + label: Child max chunk length + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: characters + variable: child_max_chunk_length + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: paragraph + label: Parent mode + max_length: 48 + options: + - full_doc + - paragraph + placeholder: null + required: true + tooltips: null + type: select + unit: null + variable: parent_mode + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Replace consecutive spaces, newlines and tabs + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: replace_consecutive_spaces + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: 
shared + default_value: null + label: Delete all URLs and email addresses + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: delete_urls_email diff --git a/api/services/rag_pipeline/transform/file_general_economy.json b/api/services/rag_pipeline/transform/file_general_economy.json new file mode 100644 index 0000000000..4d26d2be9c --- /dev/null +++ b/api/services/rag_pipeline/transform/file_general_economy.json @@ -0,0 +1 @@ +{"nodes": [{"id": "1752477924228", "type": "custom", "data": {"index_chunk_variable_selector": ["1752482151668", "result"], "keyword_number": 10, "retrieval_model": {"top_k": 3, "score_threshold_enabled": false, "score_threshold": 0.5, "search_method": "keyword_search", "vector_setting": {"embedding_provider_name": "langgenius/openai/openai", "embedding_model_name": "text-embedding-ada-002"}}, "type": "knowledge-index", "title": "\u77e5\u8bc6\u5e93", "selected": false, "chunk_structure": "text_model", "indexing_technique": "economy", "embedding_model": "text-embedding-ada-002", "embedding_model_provider": "langgenius/openai/openai"}, "position": {"x": 1076.4656678451215, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 1076.4656678451215, "y": 281.3910724383104}, "width": 242, "height": 114, "selected": true}, {"id": "1752479895761", "type": "custom", "data": {"datasource_parameters": {}, "datasource_configurations": {}, "type": "datasource", "title": "File", "plugin_id": "langgenius/file", "provider_type": "local_file", "provider_name": "file", "datasource_name": "upload-file", "datasource_label": "File", "selected": false, "fileExtensions": ["txt", "markdown", "mdx", "pdf", "html", "xlsx", "xls", "vtt", "properties", "doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub", "ppt", "md"]}, "position": {"x": -839.8603427660498, "y": 251.3910724383104}, "targetPosition": "left", "sourcePosition": "right", 
"positionAbsolute": {"x": -839.8603427660498, "y": 251.3910724383104}, "width": 242, "height": 52, "selected": false}, {"id": "1752480460682", "type": "custom", "data": {"tool_parameters": {"file": {"type": "variable", "value": ["1752479895761", "file"]}}, "tool_configurations": {}, "type": "tool", "title": "Dify\u6587\u672c\u63d0\u53d6\u5668", "provider_id": "langgenius/dify_extractor/dify_extractor", "provider_type": "builtin", "provider_name": "langgenius/dify_extractor/dify_extractor", "tool_name": "dify_extractor", "tool_label": "Dify\u6587\u672c\u63d0\u53d6\u5668", "tool_description": "Dify Extractor", "is_team_authorization": true, "output_schema": {"properties": {"documents": {"description": "the documents extracted from the file", "items": {"type": "object"}, "type": "array"}, "images": {"description": "The images extracted from the file", "items": {"type": "object"}, "type": "array"}}, "type": "object"}, "paramSchemas": [{"name": "file", "label": {"en_US": "file", "zh_Hans": "file", "pt_BR": "file", "ja_JP": "file"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "file", "human_description": {"en_US": "the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg, jpeg)", "zh_Hans": "\u7528\u4e8e\u89e3\u6790\u7684\u6587\u4ef6(\u652f\u6301 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)", "pt_BR": "o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png, jpg, jpeg)", "ja_JP": "the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg, jpeg)"}, "form": "llm", "llm_description": "the file to be parsed (support pdf, ppt, pptx, doc, docx, png, jpg, jpeg)"}], "params": {"file": ""}, "selected": false}, "position": {"x": -108.28652292656551, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": -108.28652292656551, "y": 281.3910724383104}, "width": 242, "height": 
52, "selected": false}, {"id": "1752481112180", "type": "custom", "data": {"variable_selector": ["1752479895761", "file"], "is_array_file": false, "type": "document-extractor", "title": "\u6587\u6863\u63d0\u53d6\u5668", "selected": false}, "position": {"x": -108.28652292656551, "y": 390.6576481692478}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": -108.28652292656551, "y": 390.6576481692478}, "width": 242, "height": 90, "selected": false}, {"id": "1752481129417", "type": "custom", "data": {"cases": [{"id": "24e47cad-f1e2-4f74-9884-3f49d5bb37b7", "case_id": "24e47cad-f1e2-4f74-9884-3f49d5bb37b7", "logical_operator": "or", "conditions": [{"id": "9da88d93-3ff6-463f-abfd-6bcafbf2554d", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".xlsx"}, {"id": "d0e88f5e-dfe3-4bae-af0c-dbec267500de", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".xls"}, {"id": "a957e91e-1ed7-4c6b-9c80-2f0948858f1d", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".md"}, {"id": "870c3c39-8d3f-474a-ab8b-9c0ccf53db73", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".markdown"}, {"id": "f9541513-1e71-4dc1-9db5-35dc84a39e3c", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".mdx"}, {"id": "4c7f455b-ac20-40ca-9495-6cc44ffcb35d", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".html"}, {"id": "2e12d9c7-8057-4a09-8851-f9fd1d0718d1", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".htm"}, {"id": "73a995a9-d8b9-4aef-89f7-306e2ddcbce2", "varType": "file", "variable_selector": 
["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".docx"}, {"id": "8a2e8772-0426-458b-a1f9-9eaaec0f27c8", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".csv"}, {"id": "aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".txt"}]}], "type": "if-else", "title": "\u6761\u4ef6\u5206\u652f", "selected": false}, "position": {"x": -489.57009543377865, "y": 251.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": -489.57009543377865, "y": 251.3910724383104}, "width": 242, "height": 358, "selected": false}, {"id": "1752482022496", "type": "custom", "data": {"output_type": "string", "variables": [["1752481112180", "text"], ["1752480460682", "text"]], "type": "variable-aggregator", "title": "\u53d8\u91cf\u805a\u5408\u5668", "selected": false, "advanced_settings": {"group_enabled": false, "groups": [{"output_type": "string", "variables": [["1752481112180", "text"], ["1752480460682", "text"]], "group_name": "Group1", "groupId": "f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7"}]}}, "position": {"x": 319.441649575055, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 319.441649575055, "y": 281.3910724383104}, "width": 242, "height": 129, "selected": false}, {"id": "1752482151668", "type": "custom", "data": {"tool_parameters": {"input_variable": {"type": "mixed", "value": "{{#1752482022496.output#}}"}, "delimiter": {"type": "mixed", "value": "{{#rag.shared.delimiter#}}"}, "max_chunk_length": {"type": "variable", "value": ["rag", "shared", "max_chunk_length"]}, "chunk_overlap_length": {"type": "variable", "value": ["rag", "shared", "chunk_overlap"]}, "replace_consecutive_spaces_newlines_tabs": {"type": "mixed", "value": "{{#rag.shared.replace_consecutive_spaces#}}"}, 
"delete_all_urls_and_email_addresses": {"type": "mixed", "value": "{{#rag.shared.delete_urls_email#}}"}}, "tool_configurations": {}, "type": "tool", "title": "\u901a\u7528\u6587\u672c\u5206\u5757", "provider_id": "langgenius/general_chunk/general_chunk", "provider_type": "builtin", "provider_name": "langgenius/general_chunk/general_chunk", "tool_name": "general_chunk", "tool_label": "\u901a\u7528\u6587\u672c\u5206\u5757", "tool_description": "\u4e00\u4e2a\u7528\u4e8e\u901a\u7528\u6587\u672c\u5206\u5757\u6a21\u5f0f\u7684\u5de5\u5177\uff0c\u68c0\u7d22\u548c\u53ec\u56de\u7684\u5757\u662f\u76f8\u540c\u7684\u3002", "is_team_authorization": true, "output_schema": {"properties": {"result": {"description": "The result of the general chunk tool.", "properties": {"general_chunks": {"items": {"description": "The chunk of the text.", "type": "string"}, "type": "array"}}, "type": "object"}}, "type": "object"}, "paramSchemas": [{"name": "input_variable", "label": {"en_US": "Input Variable", "zh_Hans": "\u8f93\u5165\u53d8\u91cf", "pt_BR": "Input Variable", "ja_JP": "Input Variable"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "The text you want to chunk.", "zh_Hans": "\u4f60\u60f3\u8981\u5206\u5757\u7684\u6587\u672c\u3002", "pt_BR": "The text you want to chunk.", "ja_JP": "The text you want to chunk."}, "form": "llm", "llm_description": "The text you want to chunk."}, {"name": "delimiter", "label": {"en_US": "Delimiter", "zh_Hans": "\u5206\u9694\u7b26", "pt_BR": "Delimiter", "ja_JP": "Delimiter"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "The delimiter of the chunks.", "zh_Hans": "\u5757\u7684\u5206\u9694\u7b26\u3002", "pt_BR": 
"The delimiter of the chunks.", "ja_JP": "The delimiter of the chunks."}, "form": "llm", "llm_description": "The delimiter of the chunks, the format of the delimiter must be a string."}, {"name": "max_chunk_length", "label": {"en_US": "Maximum Chunk Length", "zh_Hans": "\u6700\u5927\u5757\u7684\u957f\u5ea6", "pt_BR": "Maximum Chunk Length", "ja_JP": "Maximum Chunk Length"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "number", "human_description": {"en_US": "The maximum chunk length.", "zh_Hans": "\u6700\u5927\u5757\u7684\u957f\u5ea6\u3002", "pt_BR": "The maximum chunk length.", "ja_JP": "The maximum chunk length."}, "form": "llm", "llm_description": "The maximum chunk length, the format of the chunk size must be an integer."}, {"name": "chunk_overlap_length", "label": {"en_US": "Chunk Overlap Length", "zh_Hans": "\u5757\u7684\u91cd\u53e0\u957f\u5ea6", "pt_BR": "Chunk Overlap Length", "ja_JP": "Chunk Overlap Length"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "number", "human_description": {"en_US": "The chunk overlap length.", "zh_Hans": "\u5757\u7684\u91cd\u53e0\u957f\u5ea6\u3002", "pt_BR": "The chunk overlap length.", "ja_JP": "The chunk overlap length."}, "form": "llm", "llm_description": "The chunk overlap length, the format of the chunk overlap length must be an integer."}, {"name": "replace_consecutive_spaces_newlines_tabs", "label": {"en_US": "Replace Consecutive Spaces, Newlines and Tabs", "zh_Hans": "\u66ff\u6362\u8fde\u7eed\u7684\u7a7a\u683c\u3001\u6362\u884c\u7b26\u548c\u5236\u8868\u7b26", "pt_BR": "Replace Consecutive Spaces, Newlines and Tabs", "ja_JP": "Replace Consecutive Spaces, Newlines and Tabs"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, 
"required": false, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "boolean", "human_description": {"en_US": "Replace consecutive spaces, newlines and tabs", "zh_Hans": "\u66ff\u6362\u8fde\u7eed\u7684\u7a7a\u683c\u3001\u6362\u884c\u7b26\u548c\u5236\u8868\u7b26", "pt_BR": "Replace consecutive spaces, newlines and tabs", "ja_JP": "Replace consecutive spaces, newlines and tabs"}, "form": "llm", "llm_description": "Replace consecutive spaces, newlines and tabs, the format of the replace must be a boolean."}, {"name": "delete_all_urls_and_email_addresses", "label": {"en_US": "Delete All URLs and Email Addresses", "zh_Hans": "\u5220\u9664\u6240\u6709URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740", "pt_BR": "Delete All URLs and Email Addresses", "ja_JP": "Delete All URLs and Email Addresses"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "boolean", "human_description": {"en_US": "Delete all URLs and email addresses", "zh_Hans": "\u5220\u9664\u6240\u6709URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740", "pt_BR": "Delete all URLs and email addresses", "ja_JP": "Delete all URLs and email addresses"}, "form": "llm", "llm_description": "Delete all URLs and email addresses, the format of the delete must be a boolean."}], "params": {"input_variable": "", "delimiter": "", "max_chunk_length": "", "chunk_overlap_length": "", "replace_consecutive_spaces_newlines_tabs": "", "delete_all_urls_and_email_addresses": ""}, "selected": false}, "position": {"x": 693.5300771507484, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 693.5300771507484, "y": 281.3910724383104}, "width": 242, "height": 52, "selected": false}], "edges": [{"id": "1752479895761-source-1752481129417-target", "type": "custom", "source": "1752479895761", "sourceHandle": "source", "target": 
"1752481129417", "targetHandle": "target", "data": {"sourceType": "datasource", "targetType": "if-else", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target", "type": "custom", "source": "1752481129417", "target": "1752480460682", "sourceHandle": "24e47cad-f1e2-4f74-9884-3f49d5bb37b7", "targetHandle": "target", "data": {"sourceType": "if-else", "targetType": "tool", "isInLoop": false}, "zIndex": 0}, {"id": "1752481129417-false-1752481112180-target", "type": "custom", "source": "1752481129417", "target": "1752481112180", "sourceHandle": "false", "targetHandle": "target", "data": {"sourceType": "if-else", "targetType": "document-extractor", "isInLoop": false}, "zIndex": 0}, {"id": "1752480460682-source-1752482022496-target", "type": "custom", "source": "1752480460682", "sourceHandle": "source", "target": "1752482022496", "targetHandle": "target", "data": {"sourceType": "tool", "targetType": "variable-aggregator", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752481112180-source-1752482022496-target", "type": "custom", "source": "1752481112180", "target": "1752482022496", "sourceHandle": "source", "targetHandle": "target", "data": {"sourceType": "document-extractor", "targetType": "variable-aggregator", "isInLoop": false}, "zIndex": 0}, {"id": "1752482022496-source-1752482151668-target", "type": "custom", "source": "1752482022496", "sourceHandle": "source", "target": "1752482151668", "targetHandle": "target", "data": {"sourceType": "variable-aggregator", "targetType": "tool", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752482151668-source-1752477924228-target", "type": "custom", "source": "1752482151668", "sourceHandle": "source", "target": "1752477924228", "targetHandle": "target", "data": {"sourceType": "tool", "targetType": "knowledge-index", "isInIteration": false, "isInLoop": false}, "zIndex": 0}], "viewport": {"x": 701.4999626224237, 
"y": 128.33739021504016, "zoom": 0.48941689643726966}} \ No newline at end of file diff --git a/api/services/rag_pipeline/transform/file_general_high_quality.json b/api/services/rag_pipeline/transform/file_general_high_quality.json new file mode 100644 index 0000000000..82ac85ff41 --- /dev/null +++ b/api/services/rag_pipeline/transform/file_general_high_quality.json @@ -0,0 +1 @@ +{"nodes": [{"id": "1752477924228", "type": "custom", "data": {"index_chunk_variable_selector": ["1752482151668", "result"], "keyword_number": 10, "retrieval_model": {"top_k": 3, "score_threshold_enabled": false, "score_threshold": 0.5, "search_method": "semantic_search", "vector_setting": {"embedding_provider_name": "langgenius/openai/openai", "embedding_model_name": "text-embedding-ada-002"}}, "type": "knowledge-index", "title": "\u77e5\u8bc6\u5e93", "selected": false, "chunk_structure": "text_model", "indexing_technique": "high_quality", "embedding_model": "text-embedding-ada-002", "embedding_model_provider": "langgenius/openai/openai"}, "position": {"x": 1076.4656678451215, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 1076.4656678451215, "y": 281.3910724383104}, "width": 242, "height": 114, "selected": false}, {"id": "1752479895761", "type": "custom", "data": {"datasource_parameters": {}, "datasource_configurations": {}, "type": "datasource", "title": "File", "plugin_id": "langgenius/file", "provider_type": "local_file", "provider_name": "file", "datasource_name": "upload-file", "datasource_label": "File", "selected": false, "fileExtensions": ["txt", "markdown", "mdx", "pdf", "html", "xlsx", "xls", "vtt", "properties", "doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub", "ppt", "md"]}, "position": {"x": -839.8603427660498, "y": 251.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": -839.8603427660498, "y": 251.3910724383104}, "width": 242, "height": 52, "selected": false}, 
{"id": "1752480460682", "type": "custom", "data": {"tool_parameters": {"file": {"type": "variable", "value": ["1752479895761", "file"]}}, "tool_configurations": {}, "type": "tool", "title": "Dify\u6587\u672c\u63d0\u53d6\u5668", "provider_id": "langgenius/dify_extractor/dify_extractor", "provider_type": "builtin", "provider_name": "langgenius/dify_extractor/dify_extractor", "tool_name": "dify_extractor", "tool_label": "Dify\u6587\u672c\u63d0\u53d6\u5668", "tool_description": "Dify Extractor", "is_team_authorization": true, "output_schema": {"properties": {"documents": {"description": "the documents extracted from the file", "items": {"type": "object"}, "type": "array"}, "images": {"description": "The images extracted from the file", "items": {"type": "object"}, "type": "array"}}, "type": "object"}, "paramSchemas": [{"name": "file", "label": {"en_US": "file", "zh_Hans": "file", "pt_BR": "file", "ja_JP": "file"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "file", "human_description": {"en_US": "the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg, jpeg)", "zh_Hans": "\u7528\u4e8e\u89e3\u6790\u7684\u6587\u4ef6(\u652f\u6301 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)", "pt_BR": "o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png, jpg, jpeg)", "ja_JP": "the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg, jpeg)"}, "form": "llm", "llm_description": "the file to be parsed (support pdf, ppt, pptx, doc, docx, png, jpg, jpeg)"}], "params": {"file": ""}, "selected": false}, "position": {"x": -108.28652292656551, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": -108.28652292656551, "y": 281.3910724383104}, "width": 242, "height": 52, "selected": false}, {"id": "1752481112180", "type": "custom", "data": {"variable_selector": ["1752479895761", 
"file"], "is_array_file": false, "type": "document-extractor", "title": "\u6587\u6863\u63d0\u53d6\u5668", "selected": false}, "position": {"x": -108.28652292656551, "y": 390.6576481692478}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": -108.28652292656551, "y": 390.6576481692478}, "width": 242, "height": 90, "selected": false}, {"id": "1752481129417", "type": "custom", "data": {"cases": [{"id": "24e47cad-f1e2-4f74-9884-3f49d5bb37b7", "case_id": "24e47cad-f1e2-4f74-9884-3f49d5bb37b7", "logical_operator": "or", "conditions": [{"id": "9da88d93-3ff6-463f-abfd-6bcafbf2554d", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".xlsx"}, {"id": "d0e88f5e-dfe3-4bae-af0c-dbec267500de", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".xls"}, {"id": "a957e91e-1ed7-4c6b-9c80-2f0948858f1d", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".md"}, {"id": "870c3c39-8d3f-474a-ab8b-9c0ccf53db73", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".markdown"}, {"id": "f9541513-1e71-4dc1-9db5-35dc84a39e3c", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".mdx"}, {"id": "4c7f455b-ac20-40ca-9495-6cc44ffcb35d", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".html"}, {"id": "2e12d9c7-8057-4a09-8851-f9fd1d0718d1", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".htm"}, {"id": "73a995a9-d8b9-4aef-89f7-306e2ddcbce2", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".docx"}, {"id": 
"8a2e8772-0426-458b-a1f9-9eaaec0f27c8", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".csv"}, {"id": "aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".txt"}]}], "type": "if-else", "title": "\u6761\u4ef6\u5206\u652f", "selected": false}, "position": {"x": -489.57009543377865, "y": 251.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": -489.57009543377865, "y": 251.3910724383104}, "width": 242, "height": 358, "selected": true}, {"id": "1752482022496", "type": "custom", "data": {"output_type": "string", "variables": [["1752481112180", "text"], ["1752480460682", "text"]], "type": "variable-aggregator", "title": "\u53d8\u91cf\u805a\u5408\u5668", "selected": false, "advanced_settings": {"group_enabled": false, "groups": [{"output_type": "string", "variables": [["1752481112180", "text"], ["1752480460682", "text"]], "group_name": "Group1", "groupId": "f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7"}]}}, "position": {"x": 319.441649575055, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 319.441649575055, "y": 281.3910724383104}, "width": 242, "height": 129, "selected": false}, {"id": "1752482151668", "type": "custom", "data": {"tool_parameters": {"input_variable": {"type": "mixed", "value": "{{#1752482022496.output#}}"}, "delimiter": {"type": "mixed", "value": "{{#rag.shared.delimiter#}}"}, "max_chunk_length": {"type": "variable", "value": ["rag", "shared", "max_chunk_length"]}, "chunk_overlap_length": {"type": "variable", "value": ["rag", "shared", "chunk_overlap"]}, "replace_consecutive_spaces_newlines_tabs": {"type": "mixed", "value": "{{#rag.shared.replace_consecutive_spaces#}}"}, "delete_all_urls_and_email_addresses": {"type": "mixed", "value": "{{#rag.shared.delete_urls_email#}}"}}, 
"tool_configurations": {}, "type": "tool", "title": "\u901a\u7528\u6587\u672c\u5206\u5757", "provider_id": "langgenius/general_chunk/general_chunk", "provider_type": "builtin", "provider_name": "langgenius/general_chunk/general_chunk", "tool_name": "general_chunk", "tool_label": "\u901a\u7528\u6587\u672c\u5206\u5757", "tool_description": "\u4e00\u4e2a\u7528\u4e8e\u901a\u7528\u6587\u672c\u5206\u5757\u6a21\u5f0f\u7684\u5de5\u5177\uff0c\u68c0\u7d22\u548c\u53ec\u56de\u7684\u5757\u662f\u76f8\u540c\u7684\u3002", "is_team_authorization": true, "output_schema": {"properties": {"result": {"description": "The result of the general chunk tool.", "properties": {"general_chunks": {"items": {"description": "The chunk of the text.", "type": "string"}, "type": "array"}}, "type": "object"}}, "type": "object"}, "paramSchemas": [{"name": "input_variable", "label": {"en_US": "Input Variable", "zh_Hans": "\u8f93\u5165\u53d8\u91cf", "pt_BR": "Input Variable", "ja_JP": "Input Variable"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "The text you want to chunk.", "zh_Hans": "\u4f60\u60f3\u8981\u5206\u5757\u7684\u6587\u672c\u3002", "pt_BR": "The text you want to chunk.", "ja_JP": "The text you want to chunk."}, "form": "llm", "llm_description": "The text you want to chunk."}, {"name": "delimiter", "label": {"en_US": "Delimiter", "zh_Hans": "\u5206\u9694\u7b26", "pt_BR": "Delimiter", "ja_JP": "Delimiter"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "The delimiter of the chunks.", "zh_Hans": "\u5757\u7684\u5206\u9694\u7b26\u3002", "pt_BR": "The delimiter of the chunks.", "ja_JP": "The delimiter of the chunks."}, "form": "llm", 
"llm_description": "The delimiter of the chunks, the format of the delimiter must be a string."}, {"name": "max_chunk_length", "label": {"en_US": "Maximum Chunk Length", "zh_Hans": "\u6700\u5927\u5757\u7684\u957f\u5ea6", "pt_BR": "Maximum Chunk Length", "ja_JP": "Maximum Chunk Length"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "number", "human_description": {"en_US": "The maximum chunk length.", "zh_Hans": "\u6700\u5927\u5757\u7684\u957f\u5ea6\u3002", "pt_BR": "The maximum chunk length.", "ja_JP": "The maximum chunk length."}, "form": "llm", "llm_description": "The maximum chunk length, the format of the chunk size must be an integer."}, {"name": "chunk_overlap_length", "label": {"en_US": "Chunk Overlap Length", "zh_Hans": "\u5757\u7684\u91cd\u53e0\u957f\u5ea6", "pt_BR": "Chunk Overlap Length", "ja_JP": "Chunk Overlap Length"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "number", "human_description": {"en_US": "The chunk overlap length.", "zh_Hans": "\u5757\u7684\u91cd\u53e0\u957f\u5ea6\u3002", "pt_BR": "The chunk overlap length.", "ja_JP": "The chunk overlap length."}, "form": "llm", "llm_description": "The chunk overlap length, the format of the chunk overlap length must be an integer."}, {"name": "replace_consecutive_spaces_newlines_tabs", "label": {"en_US": "Replace Consecutive Spaces, Newlines and Tabs", "zh_Hans": "\u66ff\u6362\u8fde\u7eed\u7684\u7a7a\u683c\u3001\u6362\u884c\u7b26\u548c\u5236\u8868\u7b26", "pt_BR": "Replace Consecutive Spaces, Newlines and Tabs", "ja_JP": "Replace Consecutive Spaces, Newlines and Tabs"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": null, "min": null, "max": null, "precision": null, "options": 
[], "type": "boolean", "human_description": {"en_US": "Replace consecutive spaces, newlines and tabs", "zh_Hans": "\u66ff\u6362\u8fde\u7eed\u7684\u7a7a\u683c\u3001\u6362\u884c\u7b26\u548c\u5236\u8868\u7b26", "pt_BR": "Replace consecutive spaces, newlines and tabs", "ja_JP": "Replace consecutive spaces, newlines and tabs"}, "form": "llm", "llm_description": "Replace consecutive spaces, newlines and tabs, the format of the replace must be a boolean."}, {"name": "delete_all_urls_and_email_addresses", "label": {"en_US": "Delete All URLs and Email Addresses", "zh_Hans": "\u5220\u9664\u6240\u6709URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740", "pt_BR": "Delete All URLs and Email Addresses", "ja_JP": "Delete All URLs and Email Addresses"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "boolean", "human_description": {"en_US": "Delete all URLs and email addresses", "zh_Hans": "\u5220\u9664\u6240\u6709URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740", "pt_BR": "Delete all URLs and email addresses", "ja_JP": "Delete all URLs and email addresses"}, "form": "llm", "llm_description": "Delete all URLs and email addresses, the format of the delete must be a boolean."}], "params": {"input_variable": "", "delimiter": "", "max_chunk_length": "", "chunk_overlap_length": "", "replace_consecutive_spaces_newlines_tabs": "", "delete_all_urls_and_email_addresses": ""}, "selected": false}, "position": {"x": 693.5300771507484, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 693.5300771507484, "y": 281.3910724383104}, "width": 242, "height": 52, "selected": false}], "edges": [{"id": "1752479895761-source-1752481129417-target", "type": "custom", "source": "1752479895761", "sourceHandle": "source", "target": "1752481129417", "targetHandle": "target", "data": {"sourceType": "datasource", "targetType": 
"if-else", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target", "type": "custom", "source": "1752481129417", "target": "1752480460682", "sourceHandle": "24e47cad-f1e2-4f74-9884-3f49d5bb37b7", "targetHandle": "target", "data": {"sourceType": "if-else", "targetType": "tool", "isInLoop": false}, "zIndex": 0}, {"id": "1752481129417-false-1752481112180-target", "type": "custom", "source": "1752481129417", "target": "1752481112180", "sourceHandle": "false", "targetHandle": "target", "data": {"sourceType": "if-else", "targetType": "document-extractor", "isInLoop": false}, "zIndex": 0}, {"id": "1752480460682-source-1752482022496-target", "type": "custom", "source": "1752480460682", "sourceHandle": "source", "target": "1752482022496", "targetHandle": "target", "data": {"sourceType": "tool", "targetType": "variable-aggregator", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752481112180-source-1752482022496-target", "type": "custom", "source": "1752481112180", "target": "1752482022496", "sourceHandle": "source", "targetHandle": "target", "data": {"sourceType": "document-extractor", "targetType": "variable-aggregator", "isInLoop": false}, "zIndex": 0}, {"id": "1752482022496-source-1752482151668-target", "type": "custom", "source": "1752482022496", "sourceHandle": "source", "target": "1752482151668", "targetHandle": "target", "data": {"sourceType": "variable-aggregator", "targetType": "tool", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752482151668-source-1752477924228-target", "type": "custom", "source": "1752482151668", "sourceHandle": "source", "target": "1752477924228", "targetHandle": "target", "data": {"sourceType": "tool", "targetType": "knowledge-index", "isInIteration": false, "isInLoop": false}, "zIndex": 0}], "viewport": {"x": 701.4999626224237, "y": 128.33739021504016, "zoom": 0.48941689643726966}} \ No newline at end of file diff --git 
a/api/services/rag_pipeline/transform/file_parent_child.json b/api/services/rag_pipeline/transform/file_parent_child.json new file mode 100644 index 0000000000..84710a842c --- /dev/null +++ b/api/services/rag_pipeline/transform/file_parent_child.json @@ -0,0 +1 @@ +{"nodes": [{"data": {"chunk_structure": "hierarchical_model", "embedding_model": "text-embedding-ada-002", "embedding_model_provider": "langgenius/openai/openai", "index_chunk_variable_selector": ["1752575473519", "result"], "indexing_technique": "high_quality", "keyword_number": 10, "retrieval_model": {"score_threshold": 0.5, "score_threshold_enabled": false, "search_method": "semantic_search", "top_k": 3, "vector_setting": {"embedding_model_name": "text-embedding-ada-002", "embedding_provider_name": "langgenius/openai/openai"}}, "selected": false, "title": "\u77e5\u8bc6\u5e93", "type": "knowledge-index"}, "height": 114, "id": "1752477924228", "position": {"x": 994.3774545394483, "y": 281.3910724383104}, "positionAbsolute": {"x": 994.3774545394483, "y": 281.3910724383104}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"data": {"datasource_configurations": {}, "datasource_label": "File", "datasource_name": "upload-file", "datasource_parameters": {}, "fileExtensions": ["txt", "markdown", "mdx", "pdf", "html", "xlsx", "xls", "vtt", "properties", "doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub", "ppt", "md"], "plugin_id": "langgenius/file", "provider_name": "file", "provider_type": "local_file", "selected": false, "title": "File", "type": "datasource"}, "height": 52, "id": "1752479895761", "position": {"x": -839.8603427660498, "y": 251.3910724383104}, "positionAbsolute": {"x": -839.8603427660498, "y": 251.3910724383104}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"data": {"is_team_authorization": true, "output_schema": {"properties": {"documents": {"description": "the documents 
extracted from the file", "items": {"type": "object"}, "type": "array"}, "images": {"description": "The images extracted from the file", "items": {"type": "object"}, "type": "array"}}, "type": "object"}, "paramSchemas": [{"auto_generate": null, "default": null, "form": "llm", "human_description": {"en_US": "the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg, jpeg)", "ja_JP": "the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg, jpeg)", "pt_BR": "o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png, jpg, jpeg)", "zh_Hans": "\u7528\u4e8e\u89e3\u6790\u7684\u6587\u4ef6(\u652f\u6301 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)"}, "label": {"en_US": "file", "ja_JP": "file", "pt_BR": "file", "zh_Hans": "file"}, "llm_description": "the file to be parsed (support pdf, ppt, pptx, doc, docx, png, jpg, jpeg)", "max": null, "min": null, "name": "file", "options": [], "placeholder": null, "precision": null, "required": true, "scope": null, "template": null, "type": "file"}], "params": {"file": ""}, "provider_id": "langgenius/dify_extractor/dify_extractor", "provider_name": "langgenius/dify_extractor/dify_extractor", "provider_type": "builtin", "selected": false, "title": "Dify\u6587\u672c\u63d0\u53d6\u5668", "tool_configurations": {}, "tool_description": "Dify Extractor", "tool_label": "Dify\u6587\u672c\u63d0\u53d6\u5668", "tool_name": "dify_extractor", "tool_parameters": {"file": {"type": "variable", "value": ["1752479895761", "file"]}}, "type": "tool"}, "height": 52, "id": "1752480460682", "position": {"x": -108.28652292656551, "y": 281.3910724383104}, "positionAbsolute": {"x": -108.28652292656551, "y": 281.3910724383104}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"data": {"is_array_file": false, "selected": false, "title": "\u6587\u6863\u63d0\u53d6\u5668", "type": "document-extractor", "variable_selector": ["1752479895761", "file"]}, "height": 90, "id": "1752481112180", 
"position": {"x": -108.28652292656551, "y": 390.6576481692478}, "positionAbsolute": {"x": -108.28652292656551, "y": 390.6576481692478}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"data": {"cases": [{"id": "24e47cad-f1e2-4f74-9884-3f49d5bb37b7", "case_id": "24e47cad-f1e2-4f74-9884-3f49d5bb37b7", "logical_operator": "or", "conditions": [{"comparison_operator": "is", "id": "9da88d93-3ff6-463f-abfd-6bcafbf2554d", "value": ".xlsx", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"]}, {"comparison_operator": "is", "id": "d0e88f5e-dfe3-4bae-af0c-dbec267500de", "value": ".xls", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"]}, {"comparison_operator": "is", "id": "a957e91e-1ed7-4c6b-9c80-2f0948858f1d", "value": ".md", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"]}, {"comparison_operator": "is", "id": "870c3c39-8d3f-474a-ab8b-9c0ccf53db73", "value": ".markdown", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"]}, {"comparison_operator": "is", "id": "f9541513-1e71-4dc1-9db5-35dc84a39e3c", "value": ".mdx", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"]}, {"comparison_operator": "is", "id": "4c7f455b-ac20-40ca-9495-6cc44ffcb35d", "value": ".html", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"]}, {"comparison_operator": "is", "id": "2e12d9c7-8057-4a09-8851-f9fd1d0718d1", "value": ".htm", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"]}, {"comparison_operator": "is", "id": "73a995a9-d8b9-4aef-89f7-306e2ddcbce2", "value": ".docx", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"]}, {"comparison_operator": "is", "id": "8a2e8772-0426-458b-a1f9-9eaaec0f27c8", "value": ".csv", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"]}, {"comparison_operator": 
"is", "id": "aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602", "value": ".txt", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"]}]}], "selected": false, "title": "\u6761\u4ef6\u5206\u652f", "type": "if-else"}, "height": 358, "id": "1752481129417", "position": {"x": -512.2335487893622, "y": 251.3910724383104}, "positionAbsolute": {"x": -512.2335487893622, "y": 251.3910724383104}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"data": {"advanced_settings": {"group_enabled": false, "groups": [{"groupId": "f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7", "group_name": "Group1", "output_type": "string", "variables": [["1752481112180", "text"], ["1752480460682", "text"]]}]}, "output_type": "string", "selected": false, "title": "\u53d8\u91cf\u805a\u5408\u5668", "type": "variable-aggregator", "variables": [["1752481112180", "text"], ["1752480460682", "text"]]}, "height": 129, "id": "1752482022496", "position": {"x": 319.441649575055, "y": 281.3910724383104}, "positionAbsolute": {"x": 319.441649575055, "y": 281.3910724383104}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"id": "1752575473519", "type": "custom", "data": {"tool_parameters": {"input_text": {"type": "mixed", "value": "{{#1752482022496.output#}}"}, "max_length": {"type": "variable", "value": ["rag", "shared", "max_chunk_length"]}, "separator": {"type": "mixed", "value": "{{#rag.shared.delimiter#}}"}, "subchunk_max_length": {"type": "variable", "value": ["rag", "shared", "child_max_chunk_length"]}, "subchunk_separator": {"type": "mixed", "value": "{{#rag.shared.child_delimiter#}}"}, "parent_mode": {"type": "variable", "value": ["rag", "shared", "parent_mode"]}, "remove_extra_spaces": {"type": "mixed", "value": "{{#rag.shared.replace_consecutive_spaces#}}"}, "remove_urls_emails": {"type": "mixed", "value": "{{#rag.shared.delete_urls_email#}}"}}, "tool_configurations": {}, "type": "tool", 
"title": "\u7236\u5b50\u5206\u5757\u5904\u7406\u5668", "provider_id": "langgenius/parent_child_chunk/parent_child_chunk", "provider_type": "builtin", "provider_name": "langgenius/parent_child_chunk/parent_child_chunk", "tool_name": "parent_child_chunk", "tool_label": "\u7236\u5b50\u5206\u5757\u5904\u7406\u5668", "tool_description": "\u5c06\u6587\u6863\u5904\u7406\u4e3a\u7236\u5b50\u5206\u5757\u7ed3\u6784", "is_team_authorization": true, "output_schema": {"properties": {"result": {"description": "Parent child chunks result", "items": {"type": "object"}, "type": "array"}}, "type": "object"}, "paramSchemas": [{"name": "input_text", "label": {"en_US": "Input text", "zh_Hans": "\u8f93\u5165\u6587\u672c", "pt_BR": "Input text", "ja_JP": "Input text"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "The text you want to chunk.", "zh_Hans": "\u4f60\u60f3\u8981\u5206\u5757\u7684\u6587\u672c\u3002", "pt_BR": "The text you want to chunk.", "ja_JP": "The text you want to chunk."}, "form": "llm", "llm_description": "The text you want to chunk."}, {"name": "max_length", "label": {"en_US": "Maximum Length", "zh_Hans": "\u6700\u5927\u957f\u5ea6", "pt_BR": "Comprimento M\u00e1ximo", "ja_JP": "Maximum Length"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": 1024, "min": null, "max": null, "precision": null, "options": [], "type": "number", "human_description": {"en_US": "Maximum length for chunking", "zh_Hans": "\u7528\u4e8e\u5206\u5757\u7684\u6700\u5927\u957f\u5ea6", "pt_BR": "Comprimento m\u00e1ximo para divis\u00e3o", "ja_JP": "Maximum length for chunking"}, "form": "llm", "llm_description": "Maximum length allowed per chunk"}, {"name": "separator", "label": {"en_US": "Chunk Separator", "zh_Hans": "\u5206\u5757\u5206\u9694\u7b26", "pt_BR": "Separador 
de Divis\u00e3o", "ja_JP": "Chunk Separator"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": "\n\n", "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "Separator used for chunking", "zh_Hans": "\u7528\u4e8e\u5206\u5757\u7684\u5206\u9694\u7b26", "pt_BR": "Separador usado para divis\u00e3o", "ja_JP": "Separator used for chunking"}, "form": "llm", "llm_description": "The separator used to split chunks"}, {"name": "subchunk_max_length", "label": {"en_US": "Subchunk Maximum Length", "zh_Hans": "\u5b50\u5206\u5757\u6700\u5927\u957f\u5ea6", "pt_BR": "Comprimento M\u00e1ximo de Subdivis\u00e3o", "ja_JP": "Subchunk Maximum Length"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": 512, "min": null, "max": null, "precision": null, "options": [], "type": "number", "human_description": {"en_US": "Maximum length for subchunking", "zh_Hans": "\u7528\u4e8e\u5b50\u5206\u5757\u7684\u6700\u5927\u957f\u5ea6", "pt_BR": "Comprimento m\u00e1ximo para subdivis\u00e3o", "ja_JP": "Maximum length for subchunking"}, "form": "llm", "llm_description": "Maximum length allowed per subchunk"}, {"name": "subchunk_separator", "label": {"en_US": "Subchunk Separator", "zh_Hans": "\u5b50\u5206\u5757\u5206\u9694\u7b26", "pt_BR": "Separador de Subdivis\u00e3o", "ja_JP": "Subchunk Separator"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": ". 
", "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "Separator used for subchunking", "zh_Hans": "\u7528\u4e8e\u5b50\u5206\u5757\u7684\u5206\u9694\u7b26", "pt_BR": "Separador usado para subdivis\u00e3o", "ja_JP": "Separator used for subchunking"}, "form": "llm", "llm_description": "The separator used to split subchunks"}, {"name": "parent_mode", "label": {"en_US": "Parent Mode", "zh_Hans": "\u7236\u5757\u6a21\u5f0f", "pt_BR": "Modo Pai", "ja_JP": "Parent Mode"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": "paragraph", "min": null, "max": null, "precision": null, "options": [{"value": "paragraph", "label": {"en_US": "Paragraph", "zh_Hans": "\u6bb5\u843d", "pt_BR": "Par\u00e1grafo", "ja_JP": "Paragraph"}, "icon": ""}, {"value": "full_doc", "label": {"en_US": "Full Document", "zh_Hans": "\u5168\u6587", "pt_BR": "Documento Completo", "ja_JP": "Full Document"}, "icon": ""}], "type": "select", "human_description": {"en_US": "Split text into paragraphs based on separator and maximum chunk length, using split text as parent block or entire document as parent block and directly retrieve.", "zh_Hans": "\u6839\u636e\u5206\u9694\u7b26\u548c\u6700\u5927\u5757\u957f\u5ea6\u5c06\u6587\u672c\u62c6\u5206\u4e3a\u6bb5\u843d\uff0c\u4f7f\u7528\u62c6\u5206\u6587\u672c\u4f5c\u4e3a\u68c0\u7d22\u7684\u7236\u5757\u6216\u6574\u4e2a\u6587\u6863\u7528\u4f5c\u7236\u5757\u5e76\u76f4\u63a5\u68c0\u7d22\u3002", "pt_BR": "Dividir texto em par\u00e1grafos com base no separador e no comprimento m\u00e1ximo do bloco, usando o texto dividido como bloco pai ou documento completo como bloco pai e diretamente recuper\u00e1-lo.", "ja_JP": "Split text into paragraphs based on separator and maximum chunk length, using split text as parent block or entire document as parent block and directly retrieve."}, "form": "llm", "llm_description": "Split text into paragraphs based on separator and 
maximum chunk length, using split text as parent block or entire document as parent block and directly retrieve."}, {"name": "remove_extra_spaces", "label": {"en_US": "Remove Extra Spaces", "zh_Hans": "\u79fb\u9664\u591a\u4f59\u7a7a\u683c", "pt_BR": "Remover Espa\u00e7os Extras", "ja_JP": "Remove Extra Spaces"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": 0, "min": null, "max": null, "precision": null, "options": [], "type": "boolean", "human_description": {"en_US": "Whether to remove extra spaces in the text", "zh_Hans": "\u662f\u5426\u79fb\u9664\u6587\u672c\u4e2d\u7684\u591a\u4f59\u7a7a\u683c", "pt_BR": "Se deve remover espa\u00e7os extras no texto", "ja_JP": "Whether to remove extra spaces in the text"}, "form": "llm", "llm_description": "Whether to remove extra spaces in the text"}, {"name": "remove_urls_emails", "label": {"en_US": "Remove URLs and Emails", "zh_Hans": "\u79fb\u9664URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740", "pt_BR": "Remover URLs e E-mails", "ja_JP": "Remove URLs and Emails"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": 0, "min": null, "max": null, "precision": null, "options": [], "type": "boolean", "human_description": {"en_US": "Whether to remove URLs and emails in the text", "zh_Hans": "\u662f\u5426\u79fb\u9664\u6587\u672c\u4e2d\u7684URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740", "pt_BR": "Se deve remover URLs e e-mails no texto", "ja_JP": "Whether to remove URLs and emails in the text"}, "form": "llm", "llm_description": "Whether to remove URLs and emails in the text"}], "params": {"input_text": "", "max_length": "", "separator": "", "subchunk_max_length": "", "subchunk_separator": "", "parent_mode": "", "remove_extra_spaces": "", "remove_urls_emails": ""}, "selected": false}, "position": {"x": 637.9241611063885, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": 
{"x": 637.9241611063885, "y": 281.3910724383104}, "width": 242, "height": 52, "selected": true}], "edges": [{"data": {"isInIteration": false, "isInLoop": false, "sourceType": "datasource", "targetType": "if-else"}, "id": "1752479895761-source-1752481129417-target", "source": "1752479895761", "sourceHandle": "source", "target": "1752481129417", "targetHandle": "target", "type": "custom", "zIndex": 0}, {"data": {"isInLoop": false, "sourceType": "if-else", "targetType": "tool"}, "id": "1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target", "source": "1752481129417", "sourceHandle": "24e47cad-f1e2-4f74-9884-3f49d5bb37b7", "target": "1752480460682", "targetHandle": "target", "type": "custom", "zIndex": 0}, {"data": {"isInLoop": false, "sourceType": "if-else", "targetType": "document-extractor"}, "id": "1752481129417-false-1752481112180-target", "source": "1752481129417", "sourceHandle": "false", "target": "1752481112180", "targetHandle": "target", "type": "custom", "zIndex": 0}, {"data": {"isInIteration": false, "isInLoop": false, "sourceType": "tool", "targetType": "variable-aggregator"}, "id": "1752480460682-source-1752482022496-target", "source": "1752480460682", "sourceHandle": "source", "target": "1752482022496", "targetHandle": "target", "type": "custom", "zIndex": 0}, {"data": {"isInLoop": false, "sourceType": "document-extractor", "targetType": "variable-aggregator"}, "id": "1752481112180-source-1752482022496-target", "source": "1752481112180", "sourceHandle": "source", "target": "1752482022496", "targetHandle": "target", "type": "custom", "zIndex": 0}, {"id": "1752482022496-source-1752575473519-target", "type": "custom", "source": "1752482022496", "sourceHandle": "source", "target": "1752575473519", "targetHandle": "target", "data": {"sourceType": "variable-aggregator", "targetType": "tool", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752575473519-source-1752477924228-target", "type": "custom", "source": 
"1752575473519", "target": "1752477924228", "sourceHandle": "source", "targetHandle": "target", "data": {"sourceType": "tool", "targetType": "knowledge-index", "isInLoop": false}, "zIndex": 0}], "viewport": {"x": 948.6766333808323, "y": -102.06757184183238, "zoom": 0.8375774577380971}} \ No newline at end of file diff --git a/api/services/rag_pipeline/transform/notion-general-economy.yml b/api/services/rag_pipeline/transform/notion-general-economy.yml new file mode 100644 index 0000000000..3f103e7e2b --- /dev/null +++ b/api/services/rag_pipeline/transform/notion-general-economy.yml @@ -0,0 +1,394 @@ +dependencies: +- current_identifier: null + type: package + value: + plugin_unique_identifier: langgenius/general_chunk:0.0.2@0856fa64f7b0dc937c982f12d45b3a1ad91ba8aacc0d28a1b436e6c94a77e298 +kind: rag_pipeline +rag_pipeline: + description: '' + icon: 📙 + icon_background: '' + icon_type: emoji + name: notion-general-economy +version: 0.1.0 +workflow: + conversation_variables: [] + environment_variables: [] + features: {} + graph: + edges: + - data: + isInIteration: false + isInLoop: false + sourceType: tool + targetType: knowledge-index + id: 1752482151668-source-1752477924228-target + source: '1752482151668' + sourceHandle: source + target: '1752477924228' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: datasource + targetType: tool + id: 1752489759475-source-1752482151668-target + source: '1752489759475' + sourceHandle: source + target: '1752482151668' + targetHandle: target + type: custom + zIndex: 0 + nodes: + - data: + chunk_structure: text_model + embedding_model: text-embedding-ada-002 + embedding_model_provider: langgenius/openai/openai + index_chunk_variable_selector: + - '1752482151668' + - result + indexing_technique: economy + keyword_number: 10 + retrieval_model: + score_threshold: 0.5 + score_threshold_enabled: false + search_method: keyword_search + top_k: 3 + vector_setting: + 
embedding_model_name: text-embedding-ada-002 + embedding_provider_name: langgenius/openai/openai + selected: true + title: 知识库 + type: knowledge-index + height: 114 + id: '1752477924228' + position: + x: 1444.5503479271906 + y: 281.3910724383104 + positionAbsolute: + x: 1444.5503479271906 + y: 281.3910724383104 + selected: true + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + output_schema: + properties: + result: + description: The result of the general chunk tool. + properties: + general_chunks: + items: + description: The chunk of the text. + type: string + type: array + type: object + type: object + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: The text you want to chunk. + ja_JP: The text you want to chunk. + pt_BR: The text you want to chunk. + zh_Hans: 你想要分块的文本。 + label: + en_US: Input Variable + ja_JP: Input Variable + pt_BR: Input Variable + zh_Hans: 输入变量 + llm_description: The text you want to chunk. + max: null + min: null + name: input_variable + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: null + form: llm + human_description: + en_US: The delimiter of the chunks. + ja_JP: The delimiter of the chunks. + pt_BR: The delimiter of the chunks. + zh_Hans: 块的分隔符。 + label: + en_US: Delimiter + ja_JP: Delimiter + pt_BR: Delimiter + zh_Hans: 分隔符 + llm_description: The delimiter of the chunks, the format of the delimiter + must be a string. + max: null + min: null + name: delimiter + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: null + form: llm + human_description: + en_US: The maximum chunk length. + ja_JP: The maximum chunk length. + pt_BR: The maximum chunk length. 
+ zh_Hans: 最大块的长度。 + label: + en_US: Maximum Chunk Length + ja_JP: Maximum Chunk Length + pt_BR: Maximum Chunk Length + zh_Hans: 最大块的长度 + llm_description: The maximum chunk length, the format of the chunk size + must be an integer. + max: null + min: null + name: max_chunk_length + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: number + - auto_generate: null + default: null + form: llm + human_description: + en_US: The chunk overlap length. + ja_JP: The chunk overlap length. + pt_BR: The chunk overlap length. + zh_Hans: 块的重叠长度。 + label: + en_US: Chunk Overlap Length + ja_JP: Chunk Overlap Length + pt_BR: Chunk Overlap Length + zh_Hans: 块的重叠长度 + llm_description: The chunk overlap length, the format of the chunk overlap + length must be an integer. + max: null + min: null + name: chunk_overlap_length + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: number + - auto_generate: null + default: null + form: llm + human_description: + en_US: Replace consecutive spaces, newlines and tabs + ja_JP: Replace consecutive spaces, newlines and tabs + pt_BR: Replace consecutive spaces, newlines and tabs + zh_Hans: 替换连续的空格、换行符和制表符 + label: + en_US: Replace Consecutive Spaces, Newlines and Tabs + ja_JP: Replace Consecutive Spaces, Newlines and Tabs + pt_BR: Replace Consecutive Spaces, Newlines and Tabs + zh_Hans: 替换连续的空格、换行符和制表符 + llm_description: Replace consecutive spaces, newlines and tabs, the format + of the replace must be a boolean. 
+ max: null + min: null + name: replace_consecutive_spaces_newlines_tabs + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + - auto_generate: null + default: null + form: llm + human_description: + en_US: Delete all URLs and email addresses + ja_JP: Delete all URLs and email addresses + pt_BR: Delete all URLs and email addresses + zh_Hans: 删除所有URL和电子邮件地址 + label: + en_US: Delete All URLs and Email Addresses + ja_JP: Delete All URLs and Email Addresses + pt_BR: Delete All URLs and Email Addresses + zh_Hans: 删除所有URL和电子邮件地址 + llm_description: Delete all URLs and email addresses, the format of the + delete must be a boolean. + max: null + min: null + name: delete_all_urls_and_email_addresses + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + params: + chunk_overlap_length: '' + delete_all_urls_and_email_addresses: '' + delimiter: '' + input_variable: '' + max_chunk_length: '' + replace_consecutive_spaces_newlines_tabs: '' + provider_id: langgenius/general_chunk/general_chunk + provider_name: langgenius/general_chunk/general_chunk + provider_type: builtin + selected: false + title: 通用文本分块 + tool_configurations: {} + tool_description: 一个用于通用文本分块模式的工具,检索和召回的块是相同的。 + tool_label: 通用文本分块 + tool_name: general_chunk + tool_parameters: + chunk_overlap_length: + type: variable + value: + - rag + - shared + - chunk_overlap + delete_all_urls_and_email_addresses: + type: mixed + value: '{{#rag.shared.delete_urls_email#}}' + delimiter: + type: mixed + value: '{{#rag.shared.delimiter#}}' + input_variable: + type: mixed + value: '{{#1752489759475.content#}}' + max_chunk_length: + type: variable + value: + - rag + - shared + - max_chunk_length + replace_consecutive_spaces_newlines_tabs: + type: mixed + value: '{{#rag.shared.replace_consecutive_spaces#}}' + type: tool + height: 52 + id: '1752482151668' + position: + x: 1063.6922916384628 + y: 
281.3910724383104 + positionAbsolute: + x: 1063.6922916384628 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + datasource_configurations: {} + datasource_label: Notion数据源 + datasource_name: notion_datasource + datasource_parameters: {} + plugin_id: langgenius/notion_datasource + provider_name: notion + provider_type: online_document + selected: false + title: Notion数据源 + type: datasource + height: 52 + id: '1752489759475' + position: + x: 736.9082104000458 + y: 281.3910724383104 + positionAbsolute: + x: 736.9082104000458 + y: 281.3910724383104 + sourcePosition: right + targetPosition: left + type: custom + width: 242 + viewport: + x: -838.569649323166 + y: -168.94656489167426 + zoom: 1.286925643857699 + rag_pipeline_variables: + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: delimiter + max_length: 100 + options: [] + placeholder: null + required: true + tooltips: null + type: text-input + unit: null + variable: delimiter + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Maximum chunk length + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: null + variable: max_chunk_length + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Chunk overlap + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: number + unit: null + variable: chunk_overlap + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Replace consecutive spaces, newlines and tabs + max_length: 48 + options: [] + placeholder: null + required: 
false + tooltips: null + type: checkbox + unit: null + variable: replace_consecutive_spaces + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Delete all URLs and email addresses + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: delete_urls_email diff --git a/api/services/rag_pipeline/transform/notion-general-high-quality.yml b/api/services/rag_pipeline/transform/notion-general-high-quality.yml new file mode 100644 index 0000000000..371623a7fe --- /dev/null +++ b/api/services/rag_pipeline/transform/notion-general-high-quality.yml @@ -0,0 +1,394 @@ +dependencies: +- current_identifier: null + type: package + value: + plugin_unique_identifier: langgenius/general_chunk:0.0.2@0856fa64f7b0dc937c982f12d45b3a1ad91ba8aacc0d28a1b436e6c94a77e298 +kind: rag_pipeline +rag_pipeline: + description: '' + icon: 📙 + icon_background: '#FFF4ED' + icon_type: emoji + name: notion-general-high-quality +version: 0.1.0 +workflow: + conversation_variables: [] + environment_variables: [] + features: {} + graph: + edges: + - data: + isInIteration: false + isInLoop: false + sourceType: tool + targetType: knowledge-index + id: 1752482151668-source-1752477924228-target + source: '1752482151668' + sourceHandle: source + target: '1752477924228' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: datasource + targetType: tool + id: 1752489759475-source-1752482151668-target + source: '1752489759475' + sourceHandle: source + target: '1752482151668' + targetHandle: target + type: custom + zIndex: 0 + nodes: + - data: + chunk_structure: text_model + embedding_model: text-embedding-ada-002 + embedding_model_provider: langgenius/openai/openai + index_chunk_variable_selector: + - '1752482151668' + - result + indexing_technique: high_quality + keyword_number: 10 + 
retrieval_model: + score_threshold: 0.5 + score_threshold_enabled: false + search_method: semantic_search + top_k: 3 + vector_setting: + embedding_model_name: text-embedding-ada-002 + embedding_provider_name: langgenius/openai/openai + selected: true + title: 知识库 + type: knowledge-index + height: 114 + id: '1752477924228' + position: + x: 1444.5503479271906 + y: 281.3910724383104 + positionAbsolute: + x: 1444.5503479271906 + y: 281.3910724383104 + selected: true + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + output_schema: + properties: + result: + description: The result of the general chunk tool. + properties: + general_chunks: + items: + description: The chunk of the text. + type: string + type: array + type: object + type: object + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: The text you want to chunk. + ja_JP: The text you want to chunk. + pt_BR: The text you want to chunk. + zh_Hans: 你想要分块的文本。 + label: + en_US: Input Variable + ja_JP: Input Variable + pt_BR: Input Variable + zh_Hans: 输入变量 + llm_description: The text you want to chunk. + max: null + min: null + name: input_variable + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: null + form: llm + human_description: + en_US: The delimiter of the chunks. + ja_JP: The delimiter of the chunks. + pt_BR: The delimiter of the chunks. + zh_Hans: 块的分隔符。 + label: + en_US: Delimiter + ja_JP: Delimiter + pt_BR: Delimiter + zh_Hans: 分隔符 + llm_description: The delimiter of the chunks, the format of the delimiter + must be a string. + max: null + min: null + name: delimiter + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: null + form: llm + human_description: + en_US: The maximum chunk length. 
+ ja_JP: The maximum chunk length. + pt_BR: The maximum chunk length. + zh_Hans: 最大块的长度。 + label: + en_US: Maximum Chunk Length + ja_JP: Maximum Chunk Length + pt_BR: Maximum Chunk Length + zh_Hans: 最大块的长度 + llm_description: The maximum chunk length, the format of the chunk size + must be an integer. + max: null + min: null + name: max_chunk_length + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: number + - auto_generate: null + default: null + form: llm + human_description: + en_US: The chunk overlap length. + ja_JP: The chunk overlap length. + pt_BR: The chunk overlap length. + zh_Hans: 块的重叠长度。 + label: + en_US: Chunk Overlap Length + ja_JP: Chunk Overlap Length + pt_BR: Chunk Overlap Length + zh_Hans: 块的重叠长度 + llm_description: The chunk overlap length, the format of the chunk overlap + length must be an integer. + max: null + min: null + name: chunk_overlap_length + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: number + - auto_generate: null + default: null + form: llm + human_description: + en_US: Replace consecutive spaces, newlines and tabs + ja_JP: Replace consecutive spaces, newlines and tabs + pt_BR: Replace consecutive spaces, newlines and tabs + zh_Hans: 替换连续的空格、换行符和制表符 + label: + en_US: Replace Consecutive Spaces, Newlines and Tabs + ja_JP: Replace Consecutive Spaces, Newlines and Tabs + pt_BR: Replace Consecutive Spaces, Newlines and Tabs + zh_Hans: 替换连续的空格、换行符和制表符 + llm_description: Replace consecutive spaces, newlines and tabs, the format + of the replace must be a boolean. 
+ max: null + min: null + name: replace_consecutive_spaces_newlines_tabs + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + - auto_generate: null + default: null + form: llm + human_description: + en_US: Delete all URLs and email addresses + ja_JP: Delete all URLs and email addresses + pt_BR: Delete all URLs and email addresses + zh_Hans: 删除所有URL和电子邮件地址 + label: + en_US: Delete All URLs and Email Addresses + ja_JP: Delete All URLs and Email Addresses + pt_BR: Delete All URLs and Email Addresses + zh_Hans: 删除所有URL和电子邮件地址 + llm_description: Delete all URLs and email addresses, the format of the + delete must be a boolean. + max: null + min: null + name: delete_all_urls_and_email_addresses + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + params: + chunk_overlap_length: '' + delete_all_urls_and_email_addresses: '' + delimiter: '' + input_variable: '' + max_chunk_length: '' + replace_consecutive_spaces_newlines_tabs: '' + provider_id: langgenius/general_chunk/general_chunk + provider_name: langgenius/general_chunk/general_chunk + provider_type: builtin + selected: false + title: 通用文本分块 + tool_configurations: {} + tool_description: 一个用于通用文本分块模式的工具,检索和召回的块是相同的。 + tool_label: 通用文本分块 + tool_name: general_chunk + tool_parameters: + chunk_overlap_length: + type: variable + value: + - rag + - shared + - chunk_overlap + delete_all_urls_and_email_addresses: + type: mixed + value: '{{#rag.shared.delete_urls_email#}}' + delimiter: + type: mixed + value: '{{#rag.shared.delimiter#}}' + input_variable: + type: mixed + value: '{{#1752489759475.content#}}' + max_chunk_length: + type: variable + value: + - rag + - shared + - max_chunk_length + replace_consecutive_spaces_newlines_tabs: + type: mixed + value: '{{#rag.shared.replace_consecutive_spaces#}}' + type: tool + height: 52 + id: '1752482151668' + position: + x: 1063.6922916384628 + y: 
281.3910724383104 + positionAbsolute: + x: 1063.6922916384628 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + datasource_configurations: {} + datasource_label: Notion数据源 + datasource_name: notion_datasource + datasource_parameters: {} + plugin_id: langgenius/notion_datasource + provider_name: notion + provider_type: online_document + selected: false + title: Notion数据源 + type: datasource + height: 52 + id: '1752489759475' + position: + x: 736.9082104000458 + y: 281.3910724383104 + positionAbsolute: + x: 736.9082104000458 + y: 281.3910724383104 + sourcePosition: right + targetPosition: left + type: custom + width: 242 + viewport: + x: -838.569649323166 + y: -168.94656489167426 + zoom: 1.286925643857699 + rag_pipeline_variables: + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: delimiter + max_length: 100 + options: [] + placeholder: null + required: true + tooltips: null + type: text-input + unit: null + variable: delimiter + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Maximum chunk length + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: null + variable: max_chunk_length + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Chunk overlap + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: number + unit: null + variable: chunk_overlap + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Replace consecutive spaces, newlines and tabs + max_length: 48 + options: [] + placeholder: null + required: 
false + tooltips: null + type: checkbox + unit: null + variable: replace_consecutive_spaces + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Delete all URLs and email addresses + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: delete_urls_email diff --git a/api/services/rag_pipeline/transform/notion-parentchild.yml b/api/services/rag_pipeline/transform/notion-parentchild.yml new file mode 100644 index 0000000000..b793c621d6 --- /dev/null +++ b/api/services/rag_pipeline/transform/notion-parentchild.yml @@ -0,0 +1,503 @@ +dependencies: +- current_identifier: null + type: package + value: + plugin_unique_identifier: langgenius/parent_child_chunk:0.0.1@f8f9ba1f3bcda159ebc0168baa755c2181b923da8157ebb439b8046019f5b510 +kind: rag_pipeline +rag_pipeline: + description: '' + icon: 📙 + icon_background: '' + icon_type: emoji + name: notion-parentchild +version: 0.1.0 +workflow: + conversation_variables: [] + environment_variables: [] + features: {} + graph: + edges: + - data: + isInIteration: false + isInLoop: false + sourceType: datasource + targetType: tool + id: 1752489759475-source-1752490343805-target + source: '1752489759475' + sourceHandle: source + target: '1752490343805' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: tool + targetType: knowledge-index + id: 1752490343805-source-1752477924228-target + source: '1752490343805' + sourceHandle: source + target: '1752477924228' + targetHandle: target + type: custom + zIndex: 0 + nodes: + - data: + chunk_structure: hierarchical_model + embedding_model: text-embedding-ada-002 + embedding_model_provider: langgenius/openai/openai + index_chunk_variable_selector: + - '1752490343805' + - result + indexing_technique: high_quality + keyword_number: 10 + retrieval_model: + score_threshold: 0.5 + 
score_threshold_enabled: false + search_method: semantic_search + top_k: 3 + vector_setting: + embedding_model_name: text-embedding-ada-002 + embedding_provider_name: langgenius/openai/openai + selected: false + title: 知识库 + type: knowledge-index + height: 114 + id: '1752477924228' + position: + x: 1486.2052698032674 + y: 281.3910724383104 + positionAbsolute: + x: 1486.2052698032674 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + datasource_configurations: {} + datasource_label: Notion数据源 + datasource_name: notion_datasource + datasource_parameters: {} + plugin_id: langgenius/notion_datasource + provider_name: notion + provider_type: online_document + selected: false + title: Notion数据源 + type: datasource + height: 52 + id: '1752489759475' + position: + x: 736.9082104000458 + y: 281.3910724383104 + positionAbsolute: + x: 736.9082104000458 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + output_schema: + properties: + result: + description: Parent child chunks result + items: + type: object + type: array + type: object + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: The text you want to chunk. + ja_JP: The text you want to chunk. + pt_BR: The text you want to chunk. + zh_Hans: 你想要分块的文本。 + label: + en_US: Input text + ja_JP: Input text + pt_BR: Input text + zh_Hans: 输入文本 + llm_description: The text you want to chunk. 
+ max: null + min: null + name: input_text + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: 1024 + form: llm + human_description: + en_US: Maximum length for chunking + ja_JP: Maximum length for chunking + pt_BR: Comprimento máximo para divisão + zh_Hans: 用于分块的最大长度 + label: + en_US: Maximum Length + ja_JP: Maximum Length + pt_BR: Comprimento Máximo + zh_Hans: 最大长度 + llm_description: Maximum length allowed per chunk + max: null + min: null + name: max_length + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: number + - auto_generate: null + default: ' + + + ' + form: llm + human_description: + en_US: Separator used for chunking + ja_JP: Separator used for chunking + pt_BR: Separador usado para divisão + zh_Hans: 用于分块的分隔符 + label: + en_US: Chunk Separator + ja_JP: Chunk Separator + pt_BR: Separador de Divisão + zh_Hans: 分块分隔符 + llm_description: The separator used to split chunks + max: null + min: null + name: separator + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: string + - auto_generate: null + default: 512 + form: llm + human_description: + en_US: Maximum length for subchunking + ja_JP: Maximum length for subchunking + pt_BR: Comprimento máximo para subdivisão + zh_Hans: 用于子分块的最大长度 + label: + en_US: Subchunk Maximum Length + ja_JP: Subchunk Maximum Length + pt_BR: Comprimento Máximo de Subdivisão + zh_Hans: 子分块最大长度 + llm_description: Maximum length allowed per subchunk + max: null + min: null + name: subchunk_max_length + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: number + - auto_generate: null + default: '. 
' + form: llm + human_description: + en_US: Separator used for subchunking + ja_JP: Separator used for subchunking + pt_BR: Separador usado para subdivisão + zh_Hans: 用于子分块的分隔符 + label: + en_US: Subchunk Separator + ja_JP: Subchunk Separator + pt_BR: Separador de Subdivisão + zh_Hans: 子分块分隔符 + llm_description: The separator used to split subchunks + max: null + min: null + name: subchunk_separator + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: string + - auto_generate: null + default: paragraph + form: llm + human_description: + en_US: Split text into paragraphs based on separator and maximum chunk + length, using split text as parent block or entire document as parent + block and directly retrieve. + ja_JP: Split text into paragraphs based on separator and maximum chunk + length, using split text as parent block or entire document as parent + block and directly retrieve. + pt_BR: Dividir texto em parágrafos com base no separador e no comprimento + máximo do bloco, usando o texto dividido como bloco pai ou documento + completo como bloco pai e diretamente recuperá-lo. + zh_Hans: 根据分隔符和最大块长度将文本拆分为段落,使用拆分文本作为检索的父块或整个文档用作父块并直接检索。 + label: + en_US: Parent Mode + ja_JP: Parent Mode + pt_BR: Modo Pai + zh_Hans: 父块模式 + llm_description: Split text into paragraphs based on separator and maximum + chunk length, using split text as parent block or entire document as parent + block and directly retrieve. 
+ max: null + min: null + name: parent_mode + options: + - icon: '' + label: + en_US: Paragraph + ja_JP: Paragraph + pt_BR: Parágrafo + zh_Hans: 段落 + value: paragraph + - icon: '' + label: + en_US: Full Document + ja_JP: Full Document + pt_BR: Documento Completo + zh_Hans: 全文 + value: full_doc + placeholder: null + precision: null + required: true + scope: null + template: null + type: select + - auto_generate: null + default: 0 + form: llm + human_description: + en_US: Whether to remove extra spaces in the text + ja_JP: Whether to remove extra spaces in the text + pt_BR: Se deve remover espaços extras no texto + zh_Hans: 是否移除文本中的多余空格 + label: + en_US: Remove Extra Spaces + ja_JP: Remove Extra Spaces + pt_BR: Remover Espaços Extras + zh_Hans: 移除多余空格 + llm_description: Whether to remove extra spaces in the text + max: null + min: null + name: remove_extra_spaces + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + - auto_generate: null + default: 0 + form: llm + human_description: + en_US: Whether to remove URLs and emails in the text + ja_JP: Whether to remove URLs and emails in the text + pt_BR: Se deve remover URLs e e-mails no texto + zh_Hans: 是否移除文本中的URL和电子邮件地址 + label: + en_US: Remove URLs and Emails + ja_JP: Remove URLs and Emails + pt_BR: Remover URLs e E-mails + zh_Hans: 移除URL和电子邮件地址 + llm_description: Whether to remove URLs and emails in the text + max: null + min: null + name: remove_urls_emails + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + params: + input_text: '' + max_length: '' + parent_mode: '' + remove_extra_spaces: '' + remove_urls_emails: '' + separator: '' + subchunk_max_length: '' + subchunk_separator: '' + provider_id: langgenius/parent_child_chunk/parent_child_chunk + provider_name: langgenius/parent_child_chunk/parent_child_chunk + provider_type: builtin + selected: true + title: 父子分块处理器 + 
tool_configurations: {} + tool_description: 将文档处理为父子分块结构 + tool_label: 父子分块处理器 + tool_name: parent_child_chunk + tool_parameters: + input_text: + type: mixed + value: '{{#1752489759475.content#}}' + max_length: + type: variable + value: + - rag + - shared + - max_chunk_length + parent_mode: + type: variable + value: + - rag + - shared + - parent_mode + remove_extra_spaces: + type: mixed + value: '{{#rag.shared.replace_consecutive_spaces#}}' + remove_urls_emails: + type: mixed + value: '{{#rag.shared.delete_urls_email#}}' + separator: + type: mixed + value: '{{#rag.shared.delimiter#}}' + subchunk_max_length: + type: variable + value: + - rag + - shared + - child_max_chunk_length + subchunk_separator: + type: mixed + value: '{{#rag.shared.child_delimiter#}}' + type: tool + height: 52 + id: '1752490343805' + position: + x: 1077.0240183162543 + y: 281.3910724383104 + positionAbsolute: + x: 1077.0240183162543 + y: 281.3910724383104 + selected: true + sourcePosition: right + targetPosition: left + type: custom + width: 242 + viewport: + x: -487.2912544090391 + y: -54.7029301848807 + zoom: 0.9994011715768695 + rag_pipeline_variables: + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: \n\n + label: Delimiter + max_length: 100 + options: [] + placeholder: null + required: true + tooltips: A delimiter is the character used to separate text. \n\n is recommended + for splitting the original document into large parent chunks. You can also use + special delimiters defined by yourself. 
+ type: text-input + unit: null + variable: delimiter + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: 1024 + label: Maximum chunk length + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: characters + variable: max_chunk_length + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: \n + label: Child delimiter + max_length: 199 + options: [] + placeholder: null + required: true + tooltips: A delimiter is the character used to separate text. \n\n is recommended + for splitting the original document into large parent chunks. You can also use + special delimiters defined by yourself. + type: text-input + unit: null + variable: child_delimiter + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: 512 + label: Child max chunk length + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: characters + variable: child_max_chunk_length + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: paragraph + label: Parent mode + max_length: 48 + options: + - full_doc + - paragraph + placeholder: null + required: true + tooltips: null + type: select + unit: null + variable: parent_mode + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Replace consecutive spaces, newlines and tabs + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: replace_consecutive_spaces + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: 
shared + default_value: null + label: Delete all URLs and email addresses + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: delete_urls_email diff --git a/api/services/rag_pipeline/transform/notion_general_economy.json b/api/services/rag_pipeline/transform/notion_general_economy.json new file mode 100644 index 0000000000..ed5071fc36 --- /dev/null +++ b/api/services/rag_pipeline/transform/notion_general_economy.json @@ -0,0 +1 @@ +{"nodes": [{"data": {"chunk_structure": "text_model", "embedding_model": "text-embedding-ada-002", "embedding_model_provider": "langgenius/openai/openai", "index_chunk_variable_selector": ["1752482151668", "result"], "indexing_technique": "economy", "keyword_number": 10, "retrieval_model": {"score_threshold": 0.5, "score_threshold_enabled": false, "search_method": "keyword_search", "top_k": 3, "vector_setting": {"embedding_model_name": "text-embedding-ada-002", "embedding_provider_name": "langgenius/openai/openai"}}, "selected": false, "title": "\u77e5\u8bc6\u5e93", "type": "knowledge-index"}, "height": 114, "id": "1752477924228", "position": {"x": 1444.5503479271906, "y": 281.3910724383104}, "positionAbsolute": {"x": 1444.5503479271906, "y": 281.3910724383104}, "selected": true, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"data": {"is_team_authorization": true, "output_schema": {"properties": {"result": {"description": "The result of the general chunk tool.", "properties": {"general_chunks": {"items": {"description": "The chunk of the text.", "type": "string"}, "type": "array"}}, "type": "object"}}, "type": "object"}, "paramSchemas": [{"auto_generate": null, "default": null, "form": "llm", "human_description": {"en_US": "The text you want to chunk.", "ja_JP": "The text you want to chunk.", "pt_BR": "The text you want to chunk.", "zh_Hans": "\u4f60\u60f3\u8981\u5206\u5757\u7684\u6587\u672c\u3002"}, "label": {"en_US": 
"Input Variable", "ja_JP": "Input Variable", "pt_BR": "Input Variable", "zh_Hans": "\u8f93\u5165\u53d8\u91cf"}, "llm_description": "The text you want to chunk.", "max": null, "min": null, "name": "input_variable", "options": [], "placeholder": null, "precision": null, "required": true, "scope": null, "template": null, "type": "string"}, {"auto_generate": null, "default": null, "form": "llm", "human_description": {"en_US": "The delimiter of the chunks.", "ja_JP": "The delimiter of the chunks.", "pt_BR": "The delimiter of the chunks.", "zh_Hans": "\u5757\u7684\u5206\u9694\u7b26\u3002"}, "label": {"en_US": "Delimiter", "ja_JP": "Delimiter", "pt_BR": "Delimiter", "zh_Hans": "\u5206\u9694\u7b26"}, "llm_description": "The delimiter of the chunks, the format of the delimiter must be a string.", "max": null, "min": null, "name": "delimiter", "options": [], "placeholder": null, "precision": null, "required": true, "scope": null, "template": null, "type": "string"}, {"auto_generate": null, "default": null, "form": "llm", "human_description": {"en_US": "The maximum chunk length.", "ja_JP": "The maximum chunk length.", "pt_BR": "The maximum chunk length.", "zh_Hans": "\u6700\u5927\u5757\u7684\u957f\u5ea6\u3002"}, "label": {"en_US": "Maximum Chunk Length", "ja_JP": "Maximum Chunk Length", "pt_BR": "Maximum Chunk Length", "zh_Hans": "\u6700\u5927\u5757\u7684\u957f\u5ea6"}, "llm_description": "The maximum chunk length, the format of the chunk size must be an integer.", "max": null, "min": null, "name": "max_chunk_length", "options": [], "placeholder": null, "precision": null, "required": true, "scope": null, "template": null, "type": "number"}, {"auto_generate": null, "default": null, "form": "llm", "human_description": {"en_US": "The chunk overlap length.", "ja_JP": "The chunk overlap length.", "pt_BR": "The chunk overlap length.", "zh_Hans": "\u5757\u7684\u91cd\u53e0\u957f\u5ea6\u3002"}, "label": {"en_US": "Chunk Overlap Length", "ja_JP": "Chunk Overlap Length", "pt_BR": "Chunk 
Overlap Length", "zh_Hans": "\u5757\u7684\u91cd\u53e0\u957f\u5ea6"}, "llm_description": "The chunk overlap length, the format of the chunk overlap length must be an integer.", "max": null, "min": null, "name": "chunk_overlap_length", "options": [], "placeholder": null, "precision": null, "required": false, "scope": null, "template": null, "type": "number"}, {"auto_generate": null, "default": null, "form": "llm", "human_description": {"en_US": "Replace consecutive spaces, newlines and tabs", "ja_JP": "Replace consecutive spaces, newlines and tabs", "pt_BR": "Replace consecutive spaces, newlines and tabs", "zh_Hans": "\u66ff\u6362\u8fde\u7eed\u7684\u7a7a\u683c\u3001\u6362\u884c\u7b26\u548c\u5236\u8868\u7b26"}, "label": {"en_US": "Replace Consecutive Spaces, Newlines and Tabs", "ja_JP": "Replace Consecutive Spaces, Newlines and Tabs", "pt_BR": "Replace Consecutive Spaces, Newlines and Tabs", "zh_Hans": "\u66ff\u6362\u8fde\u7eed\u7684\u7a7a\u683c\u3001\u6362\u884c\u7b26\u548c\u5236\u8868\u7b26"}, "llm_description": "Replace consecutive spaces, newlines and tabs, the format of the replace must be a boolean.", "max": null, "min": null, "name": "replace_consecutive_spaces_newlines_tabs", "options": [], "placeholder": null, "precision": null, "required": false, "scope": null, "template": null, "type": "boolean"}, {"auto_generate": null, "default": null, "form": "llm", "human_description": {"en_US": "Delete all URLs and email addresses", "ja_JP": "Delete all URLs and email addresses", "pt_BR": "Delete all URLs and email addresses", "zh_Hans": "\u5220\u9664\u6240\u6709URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740"}, "label": {"en_US": "Delete All URLs and Email Addresses", "ja_JP": "Delete All URLs and Email Addresses", "pt_BR": "Delete All URLs and Email Addresses", "zh_Hans": "\u5220\u9664\u6240\u6709URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740"}, "llm_description": "Delete all URLs and email addresses, the format of the delete must be a boolean.", "max": null, "min": 
null, "name": "delete_all_urls_and_email_addresses", "options": [], "placeholder": null, "precision": null, "required": false, "scope": null, "template": null, "type": "boolean"}], "params": {"chunk_overlap_length": "", "delete_all_urls_and_email_addresses": "", "delimiter": "", "input_variable": "", "max_chunk_length": "", "replace_consecutive_spaces_newlines_tabs": ""}, "provider_id": "langgenius/general_chunk/general_chunk", "provider_name": "langgenius/general_chunk/general_chunk", "provider_type": "builtin", "selected": false, "title": "\u901a\u7528\u6587\u672c\u5206\u5757", "tool_configurations": {}, "tool_description": "\u4e00\u4e2a\u7528\u4e8e\u901a\u7528\u6587\u672c\u5206\u5757\u6a21\u5f0f\u7684\u5de5\u5177\uff0c\u68c0\u7d22\u548c\u53ec\u56de\u7684\u5757\u662f\u76f8\u540c\u7684\u3002", "tool_label": "\u901a\u7528\u6587\u672c\u5206\u5757", "tool_name": "general_chunk", "tool_parameters": {"chunk_overlap_length": {"type": "variable", "value": ["rag", "shared", "chunk_overlap"]}, "delimiter": {"type": "mixed", "value": "{{#rag.shared.delimiter#}}"}, "input_variable": {"type": "mixed", "value": "{{#1752489759475.content#}}"}, "max_chunk_length": {"type": "variable", "value": ["rag", "shared", "max_chunk_length"]}, "replace_consecutive_spaces_newlines_tabs": {"type": "mixed", "value": "{{#rag.shared.replace_consecutive_spaces#}}"}, "delete_all_urls_and_email_addresses": {"type": "mixed", "value": "{{#rag.shared.delete_urls_email#}}"}}, "type": "tool"}, "height": 52, "id": "1752482151668", "position": {"x": 1063.6922916384628, "y": 281.3910724383104}, "positionAbsolute": {"x": 1063.6922916384628, "y": 281.3910724383104}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"id": "1752489759475", "type": "custom", "data": {"datasource_parameters": {}, "datasource_configurations": {}, "type": "datasource", "title": "Notion\u6570\u636e\u6e90", "plugin_id": "langgenius/notion_datasource", "provider_type": 
"online_document", "provider_name": "notion", "datasource_name": "notion_datasource", "datasource_label": "Notion\u6570\u636e\u6e90", "selected": false}, "position": {"x": 736.9082104000458, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 736.9082104000458, "y": 281.3910724383104}, "width": 242, "height": 52}], "edges": [{"data": {"isInIteration": false, "isInLoop": false, "sourceType": "tool", "targetType": "knowledge-index"}, "id": "1752482151668-source-1752477924228-target", "source": "1752482151668", "sourceHandle": "source", "target": "1752477924228", "targetHandle": "target", "type": "custom", "zIndex": 0}, {"id": "1752489759475-source-1752482151668-target", "type": "custom", "source": "1752489759475", "sourceHandle": "source", "target": "1752482151668", "targetHandle": "target", "data": {"sourceType": "datasource", "targetType": "tool", "isInIteration": false, "isInLoop": false}, "zIndex": 0}], "viewport": {"x": -838.5696493231662, "y": -168.94656489167426, "zoom": 1.286925643857699}} \ No newline at end of file diff --git a/api/services/rag_pipeline/transform/notion_general_high_quality.json b/api/services/rag_pipeline/transform/notion_general_high_quality.json new file mode 100644 index 0000000000..d62674a0bb --- /dev/null +++ b/api/services/rag_pipeline/transform/notion_general_high_quality.json @@ -0,0 +1 @@ +{"nodes": [{"data": {"chunk_structure": "hierarchical_model", "embedding_model": "text-embedding-ada-002", "embedding_model_provider": "langgenius/openai/openai", "index_chunk_variable_selector": ["1752490343805", "result"], "indexing_technique": "high_quality", "keyword_number": 10, "retrieval_model": {"score_threshold": 0.5, "score_threshold_enabled": false, "search_method": "semantic_search", "top_k": 3, "vector_setting": {"embedding_model_name": "text-embedding-ada-002", "embedding_provider_name": "langgenius/openai/openai"}}, "selected": false, "title": "\u77e5\u8bc6\u5e93", "type": 
"knowledge-index"}, "height": 114, "id": "1752477924228", "position": {"x": 1486.2052698032674, "y": 281.3910724383104}, "positionAbsolute": {"x": 1486.2052698032674, "y": 281.3910724383104}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"data": {"datasource_configurations": {}, "datasource_label": "Notion\u6570\u636e\u6e90", "datasource_name": "notion_datasource", "datasource_parameters": {}, "plugin_id": "langgenius/notion_datasource", "provider_name": "notion", "provider_type": "online_document", "selected": false, "title": "Notion\u6570\u636e\u6e90", "type": "datasource"}, "height": 52, "id": "1752489759475", "position": {"x": 736.9082104000458, "y": 281.3910724383104}, "positionAbsolute": {"x": 736.9082104000458, "y": 281.3910724383104}, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242, "selected": false}, {"id": "1752490343805", "type": "custom", "data": {"tool_parameters": {"input_text": {"type": "mixed", "value": "{{#1752489759475.content#}}"}, "max_length": {"type": "variable", "value": ["rag", "shared", "max_chunk_length"]}, "separator": {"type": "mixed", "value": "{{#rag.shared.delimiter#}}"}, "subchunk_max_length": {"type": "variable", "value": ["rag", "shared", "child_max_chunk_length"]}, "subchunk_separator": {"type": "mixed", "value": "{{#rag.shared.child_delimiter#}}"}, "parent_mode": {"type": "variable", "value": ["rag", "shared", "parent_mode"]}, "remove_extra_spaces": {"type": "mixed", "value": "{{#rag.shared.replace_consecutive_spaces#}}"}, "remove_urls_emails": {"type": "mixed", "value": "{{#rag.shared.delete_urls_email#}}"}}, "tool_configurations": {}, "type": "tool", "title": "\u7236\u5b50\u5206\u5757\u5904\u7406\u5668", "provider_id": "langgenius/parent_child_chunk/parent_child_chunk", "provider_type": "builtin", "provider_name": "langgenius/parent_child_chunk/parent_child_chunk", "tool_name": "parent_child_chunk", "tool_label": 
"\u7236\u5b50\u5206\u5757\u5904\u7406\u5668", "tool_description": "\u5c06\u6587\u6863\u5904\u7406\u4e3a\u7236\u5b50\u5206\u5757\u7ed3\u6784", "is_team_authorization": true, "output_schema": {"properties": {"result": {"description": "Parent child chunks result", "items": {"type": "object"}, "type": "array"}}, "type": "object"}, "paramSchemas": [{"name": "input_text", "label": {"en_US": "Input text", "zh_Hans": "\u8f93\u5165\u6587\u672c", "pt_BR": "Input text", "ja_JP": "Input text"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "The text you want to chunk.", "zh_Hans": "\u4f60\u60f3\u8981\u5206\u5757\u7684\u6587\u672c\u3002", "pt_BR": "The text you want to chunk.", "ja_JP": "The text you want to chunk."}, "form": "llm", "llm_description": "The text you want to chunk."}, {"name": "max_length", "label": {"en_US": "Maximum Length", "zh_Hans": "\u6700\u5927\u957f\u5ea6", "pt_BR": "Comprimento M\u00e1ximo", "ja_JP": "Maximum Length"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": 1024, "min": null, "max": null, "precision": null, "options": [], "type": "number", "human_description": {"en_US": "Maximum length for chunking", "zh_Hans": "\u7528\u4e8e\u5206\u5757\u7684\u6700\u5927\u957f\u5ea6", "pt_BR": "Comprimento m\u00e1ximo para divis\u00e3o", "ja_JP": "Maximum length for chunking"}, "form": "llm", "llm_description": "Maximum length allowed per chunk"}, {"name": "separator", "label": {"en_US": "Chunk Separator", "zh_Hans": "\u5206\u5757\u5206\u9694\u7b26", "pt_BR": "Separador de Divis\u00e3o", "ja_JP": "Chunk Separator"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": "\n\n", "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": 
{"en_US": "Separator used for chunking", "zh_Hans": "\u7528\u4e8e\u5206\u5757\u7684\u5206\u9694\u7b26", "pt_BR": "Separador usado para divis\u00e3o", "ja_JP": "Separator used for chunking"}, "form": "llm", "llm_description": "The separator used to split chunks"}, {"name": "subchunk_max_length", "label": {"en_US": "Subchunk Maximum Length", "zh_Hans": "\u5b50\u5206\u5757\u6700\u5927\u957f\u5ea6", "pt_BR": "Comprimento M\u00e1ximo de Subdivis\u00e3o", "ja_JP": "Subchunk Maximum Length"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": 512, "min": null, "max": null, "precision": null, "options": [], "type": "number", "human_description": {"en_US": "Maximum length for subchunking", "zh_Hans": "\u7528\u4e8e\u5b50\u5206\u5757\u7684\u6700\u5927\u957f\u5ea6", "pt_BR": "Comprimento m\u00e1ximo para subdivis\u00e3o", "ja_JP": "Maximum length for subchunking"}, "form": "llm", "llm_description": "Maximum length allowed per subchunk"}, {"name": "subchunk_separator", "label": {"en_US": "Subchunk Separator", "zh_Hans": "\u5b50\u5206\u5757\u5206\u9694\u7b26", "pt_BR": "Separador de Subdivis\u00e3o", "ja_JP": "Subchunk Separator"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": ". 
", "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "Separator used for subchunking", "zh_Hans": "\u7528\u4e8e\u5b50\u5206\u5757\u7684\u5206\u9694\u7b26", "pt_BR": "Separador usado para subdivis\u00e3o", "ja_JP": "Separator used for subchunking"}, "form": "llm", "llm_description": "The separator used to split subchunks"}, {"name": "parent_mode", "label": {"en_US": "Parent Mode", "zh_Hans": "\u7236\u5757\u6a21\u5f0f", "pt_BR": "Modo Pai", "ja_JP": "Parent Mode"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": "paragraph", "min": null, "max": null, "precision": null, "options": [{"value": "paragraph", "label": {"en_US": "Paragraph", "zh_Hans": "\u6bb5\u843d", "pt_BR": "Par\u00e1grafo", "ja_JP": "Paragraph"}, "icon": ""}, {"value": "full_doc", "label": {"en_US": "Full Document", "zh_Hans": "\u5168\u6587", "pt_BR": "Documento Completo", "ja_JP": "Full Document"}, "icon": ""}], "type": "select", "human_description": {"en_US": "Split text into paragraphs based on separator and maximum chunk length, using split text as parent block or entire document as parent block and directly retrieve.", "zh_Hans": "\u6839\u636e\u5206\u9694\u7b26\u548c\u6700\u5927\u5757\u957f\u5ea6\u5c06\u6587\u672c\u62c6\u5206\u4e3a\u6bb5\u843d\uff0c\u4f7f\u7528\u62c6\u5206\u6587\u672c\u4f5c\u4e3a\u68c0\u7d22\u7684\u7236\u5757\u6216\u6574\u4e2a\u6587\u6863\u7528\u4f5c\u7236\u5757\u5e76\u76f4\u63a5\u68c0\u7d22\u3002", "pt_BR": "Dividir texto em par\u00e1grafos com base no separador e no comprimento m\u00e1ximo do bloco, usando o texto dividido como bloco pai ou documento completo como bloco pai e diretamente recuper\u00e1-lo.", "ja_JP": "Split text into paragraphs based on separator and maximum chunk length, using split text as parent block or entire document as parent block and directly retrieve."}, "form": "llm", "llm_description": "Split text into paragraphs based on separator and 
maximum chunk length, using split text as parent block or entire document as parent block and directly retrieve."}, {"name": "remove_extra_spaces", "label": {"en_US": "Remove Extra Spaces", "zh_Hans": "\u79fb\u9664\u591a\u4f59\u7a7a\u683c", "pt_BR": "Remover Espa\u00e7os Extras", "ja_JP": "Remove Extra Spaces"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": 0, "min": null, "max": null, "precision": null, "options": [], "type": "boolean", "human_description": {"en_US": "Whether to remove extra spaces in the text", "zh_Hans": "\u662f\u5426\u79fb\u9664\u6587\u672c\u4e2d\u7684\u591a\u4f59\u7a7a\u683c", "pt_BR": "Se deve remover espa\u00e7os extras no texto", "ja_JP": "Whether to remove extra spaces in the text"}, "form": "llm", "llm_description": "Whether to remove extra spaces in the text"}, {"name": "remove_urls_emails", "label": {"en_US": "Remove URLs and Emails", "zh_Hans": "\u79fb\u9664URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740", "pt_BR": "Remover URLs e E-mails", "ja_JP": "Remove URLs and Emails"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": 0, "min": null, "max": null, "precision": null, "options": [], "type": "boolean", "human_description": {"en_US": "Whether to remove URLs and emails in the text", "zh_Hans": "\u662f\u5426\u79fb\u9664\u6587\u672c\u4e2d\u7684URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740", "pt_BR": "Se deve remover URLs e e-mails no texto", "ja_JP": "Whether to remove URLs and emails in the text"}, "form": "llm", "llm_description": "Whether to remove URLs and emails in the text"}], "params": {"input_text": "", "max_length": "", "separator": "", "subchunk_max_length": "", "subchunk_separator": "", "parent_mode": "", "remove_extra_spaces": "", "remove_urls_emails": ""}, "selected": true}, "position": {"x": 1077.0240183162543, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": 
{"x": 1077.0240183162543, "y": 281.3910724383104}, "width": 242, "height": 52, "selected": true}], "edges": [{"id": "1752489759475-source-1752490343805-target", "type": "custom", "source": "1752489759475", "sourceHandle": "source", "target": "1752490343805", "targetHandle": "target", "data": {"sourceType": "datasource", "targetType": "tool", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752490343805-source-1752477924228-target", "type": "custom", "source": "1752490343805", "target": "1752477924228", "sourceHandle": "source", "targetHandle": "target", "data": {"sourceType": "tool", "targetType": "knowledge-index", "isInLoop": false}, "zIndex": 0}], "viewport": {"x": -487.2912544090391, "y": -54.7029301848807, "zoom": 0.9994011715768695}} \ No newline at end of file diff --git a/api/services/rag_pipeline/transform/notion_parent_child.json b/api/services/rag_pipeline/transform/notion_parent_child.json new file mode 100644 index 0000000000..82ac85ff41 --- /dev/null +++ b/api/services/rag_pipeline/transform/notion_parent_child.json @@ -0,0 +1 @@ +{"nodes": [{"id": "1752477924228", "type": "custom", "data": {"index_chunk_variable_selector": ["1752482151668", "result"], "keyword_number": 10, "retrieval_model": {"top_k": 3, "score_threshold_enabled": false, "score_threshold": 0.5, "search_method": "semantic_search", "vector_setting": {"embedding_provider_name": "langgenius/openai/openai", "embedding_model_name": "text-embedding-ada-002"}}, "type": "knowledge-index", "title": "\u77e5\u8bc6\u5e93", "selected": false, "chunk_structure": "text_model", "indexing_technique": "high_quality", "embedding_model": "text-embedding-ada-002", "embedding_model_provider": "langgenius/openai/openai"}, "position": {"x": 1076.4656678451215, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 1076.4656678451215, "y": 281.3910724383104}, "width": 242, "height": 114, "selected": false}, {"id": "1752479895761", "type": 
"custom", "data": {"datasource_parameters": {}, "datasource_configurations": {}, "type": "datasource", "title": "File", "plugin_id": "langgenius/file", "provider_type": "local_file", "provider_name": "file", "datasource_name": "upload-file", "datasource_label": "File", "selected": false, "fileExtensions": ["txt", "markdown", "mdx", "pdf", "html", "xlsx", "xls", "vtt", "properties", "doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub", "ppt", "md"]}, "position": {"x": -839.8603427660498, "y": 251.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": -839.8603427660498, "y": 251.3910724383104}, "width": 242, "height": 52, "selected": false}, {"id": "1752480460682", "type": "custom", "data": {"tool_parameters": {"file": {"type": "variable", "value": ["1752479895761", "file"]}}, "tool_configurations": {}, "type": "tool", "title": "Dify\u6587\u672c\u63d0\u53d6\u5668", "provider_id": "langgenius/dify_extractor/dify_extractor", "provider_type": "builtin", "provider_name": "langgenius/dify_extractor/dify_extractor", "tool_name": "dify_extractor", "tool_label": "Dify\u6587\u672c\u63d0\u53d6\u5668", "tool_description": "Dify Extractor", "is_team_authorization": true, "output_schema": {"properties": {"documents": {"description": "the documents extracted from the file", "items": {"type": "object"}, "type": "array"}, "images": {"description": "The images extracted from the file", "items": {"type": "object"}, "type": "array"}}, "type": "object"}, "paramSchemas": [{"name": "file", "label": {"en_US": "file", "zh_Hans": "file", "pt_BR": "file", "ja_JP": "file"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "file", "human_description": {"en_US": "the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg, jpeg)", "zh_Hans": "\u7528\u4e8e\u89e3\u6790\u7684\u6587\u4ef6(\u652f\u6301 pdf, ppt, pptx, 
doc, docx, png, jpg, jpeg)", "pt_BR": "o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png, jpg, jpeg)", "ja_JP": "the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg, jpeg)"}, "form": "llm", "llm_description": "the file to be parsed (support pdf, ppt, pptx, doc, docx, png, jpg, jpeg)"}], "params": {"file": ""}, "selected": false}, "position": {"x": -108.28652292656551, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": -108.28652292656551, "y": 281.3910724383104}, "width": 242, "height": 52, "selected": false}, {"id": "1752481112180", "type": "custom", "data": {"variable_selector": ["1752479895761", "file"], "is_array_file": false, "type": "document-extractor", "title": "\u6587\u6863\u63d0\u53d6\u5668", "selected": false}, "position": {"x": -108.28652292656551, "y": 390.6576481692478}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": -108.28652292656551, "y": 390.6576481692478}, "width": 242, "height": 90, "selected": false}, {"id": "1752481129417", "type": "custom", "data": {"cases": [{"id": "24e47cad-f1e2-4f74-9884-3f49d5bb37b7", "case_id": "24e47cad-f1e2-4f74-9884-3f49d5bb37b7", "logical_operator": "or", "conditions": [{"id": "9da88d93-3ff6-463f-abfd-6bcafbf2554d", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".xlsx"}, {"id": "d0e88f5e-dfe3-4bae-af0c-dbec267500de", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".xls"}, {"id": "a957e91e-1ed7-4c6b-9c80-2f0948858f1d", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".md"}, {"id": "870c3c39-8d3f-474a-ab8b-9c0ccf53db73", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".markdown"}, {"id": "f9541513-1e71-4dc1-9db5-35dc84a39e3c", 
"varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".mdx"}, {"id": "4c7f455b-ac20-40ca-9495-6cc44ffcb35d", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".html"}, {"id": "2e12d9c7-8057-4a09-8851-f9fd1d0718d1", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".htm"}, {"id": "73a995a9-d8b9-4aef-89f7-306e2ddcbce2", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".docx"}, {"id": "8a2e8772-0426-458b-a1f9-9eaaec0f27c8", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".csv"}, {"id": "aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602", "varType": "file", "variable_selector": ["1752479895761", "file", "extension"], "comparison_operator": "is", "value": ".txt"}]}], "type": "if-else", "title": "\u6761\u4ef6\u5206\u652f", "selected": false}, "position": {"x": -489.57009543377865, "y": 251.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": -489.57009543377865, "y": 251.3910724383104}, "width": 242, "height": 358, "selected": true}, {"id": "1752482022496", "type": "custom", "data": {"output_type": "string", "variables": [["1752481112180", "text"], ["1752480460682", "text"]], "type": "variable-aggregator", "title": "\u53d8\u91cf\u805a\u5408\u5668", "selected": false, "advanced_settings": {"group_enabled": false, "groups": [{"output_type": "string", "variables": [["1752481112180", "text"], ["1752480460682", "text"]], "group_name": "Group1", "groupId": "f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7"}]}}, "position": {"x": 319.441649575055, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 319.441649575055, "y": 281.3910724383104}, "width": 242, "height": 129, 
"selected": false}, {"id": "1752482151668", "type": "custom", "data": {"tool_parameters": {"input_variable": {"type": "mixed", "value": "{{#1752482022496.output#}}"}, "delimiter": {"type": "mixed", "value": "{{#rag.shared.delimiter#}}"}, "max_chunk_length": {"type": "variable", "value": ["rag", "shared", "max_chunk_length"]}, "chunk_overlap_length": {"type": "variable", "value": ["rag", "shared", "chunk_overlap"]}, "replace_consecutive_spaces_newlines_tabs": {"type": "mixed", "value": "{{#rag.shared.replace_consecutive_spaces#}}"}, "delete_all_urls_and_email_addresses": {"type": "mixed", "value": "{{#rag.shared.delete_urls_email#}}"}}, "tool_configurations": {}, "type": "tool", "title": "\u901a\u7528\u6587\u672c\u5206\u5757", "provider_id": "langgenius/general_chunk/general_chunk", "provider_type": "builtin", "provider_name": "langgenius/general_chunk/general_chunk", "tool_name": "general_chunk", "tool_label": "\u901a\u7528\u6587\u672c\u5206\u5757", "tool_description": "\u4e00\u4e2a\u7528\u4e8e\u901a\u7528\u6587\u672c\u5206\u5757\u6a21\u5f0f\u7684\u5de5\u5177\uff0c\u68c0\u7d22\u548c\u53ec\u56de\u7684\u5757\u662f\u76f8\u540c\u7684\u3002", "is_team_authorization": true, "output_schema": {"properties": {"result": {"description": "The result of the general chunk tool.", "properties": {"general_chunks": {"items": {"description": "The chunk of the text.", "type": "string"}, "type": "array"}}, "type": "object"}}, "type": "object"}, "paramSchemas": [{"name": "input_variable", "label": {"en_US": "Input Variable", "zh_Hans": "\u8f93\u5165\u53d8\u91cf", "pt_BR": "Input Variable", "ja_JP": "Input Variable"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "The text you want to chunk.", "zh_Hans": "\u4f60\u60f3\u8981\u5206\u5757\u7684\u6587\u672c\u3002", "pt_BR": "The text you want to chunk.", "ja_JP": "The 
text you want to chunk."}, "form": "llm", "llm_description": "The text you want to chunk."}, {"name": "delimiter", "label": {"en_US": "Delimiter", "zh_Hans": "\u5206\u9694\u7b26", "pt_BR": "Delimiter", "ja_JP": "Delimiter"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "The delimiter of the chunks.", "zh_Hans": "\u5757\u7684\u5206\u9694\u7b26\u3002", "pt_BR": "The delimiter of the chunks.", "ja_JP": "The delimiter of the chunks."}, "form": "llm", "llm_description": "The delimiter of the chunks, the format of the delimiter must be a string."}, {"name": "max_chunk_length", "label": {"en_US": "Maximum Chunk Length", "zh_Hans": "\u6700\u5927\u5757\u7684\u957f\u5ea6", "pt_BR": "Maximum Chunk Length", "ja_JP": "Maximum Chunk Length"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "number", "human_description": {"en_US": "The maximum chunk length.", "zh_Hans": "\u6700\u5927\u5757\u7684\u957f\u5ea6\u3002", "pt_BR": "The maximum chunk length.", "ja_JP": "The maximum chunk length."}, "form": "llm", "llm_description": "The maximum chunk length, the format of the chunk size must be an integer."}, {"name": "chunk_overlap_length", "label": {"en_US": "Chunk Overlap Length", "zh_Hans": "\u5757\u7684\u91cd\u53e0\u957f\u5ea6", "pt_BR": "Chunk Overlap Length", "ja_JP": "Chunk Overlap Length"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "number", "human_description": {"en_US": "The chunk overlap length.", "zh_Hans": "\u5757\u7684\u91cd\u53e0\u957f\u5ea6\u3002", "pt_BR": "The chunk overlap length.", "ja_JP": "The chunk overlap length."}, "form": 
"llm", "llm_description": "The chunk overlap length, the format of the chunk overlap length must be an integer."}, {"name": "replace_consecutive_spaces_newlines_tabs", "label": {"en_US": "Replace Consecutive Spaces, Newlines and Tabs", "zh_Hans": "\u66ff\u6362\u8fde\u7eed\u7684\u7a7a\u683c\u3001\u6362\u884c\u7b26\u548c\u5236\u8868\u7b26", "pt_BR": "Replace Consecutive Spaces, Newlines and Tabs", "ja_JP": "Replace Consecutive Spaces, Newlines and Tabs"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "boolean", "human_description": {"en_US": "Replace consecutive spaces, newlines and tabs", "zh_Hans": "\u66ff\u6362\u8fde\u7eed\u7684\u7a7a\u683c\u3001\u6362\u884c\u7b26\u548c\u5236\u8868\u7b26", "pt_BR": "Replace consecutive spaces, newlines and tabs", "ja_JP": "Replace consecutive spaces, newlines and tabs"}, "form": "llm", "llm_description": "Replace consecutive spaces, newlines and tabs, the format of the replace must be a boolean."}, {"name": "delete_all_urls_and_email_addresses", "label": {"en_US": "Delete All URLs and Email Addresses", "zh_Hans": "\u5220\u9664\u6240\u6709URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740", "pt_BR": "Delete All URLs and Email Addresses", "ja_JP": "Delete All URLs and Email Addresses"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "boolean", "human_description": {"en_US": "Delete all URLs and email addresses", "zh_Hans": "\u5220\u9664\u6240\u6709URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740", "pt_BR": "Delete all URLs and email addresses", "ja_JP": "Delete all URLs and email addresses"}, "form": "llm", "llm_description": "Delete all URLs and email addresses, the format of the delete must be a boolean."}], "params": {"input_variable": "", "delimiter": "", 
"max_chunk_length": "", "chunk_overlap_length": "", "replace_consecutive_spaces_newlines_tabs": "", "delete_all_urls_and_email_addresses": ""}, "selected": false}, "position": {"x": 693.5300771507484, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 693.5300771507484, "y": 281.3910724383104}, "width": 242, "height": 52, "selected": false}], "edges": [{"id": "1752479895761-source-1752481129417-target", "type": "custom", "source": "1752479895761", "sourceHandle": "source", "target": "1752481129417", "targetHandle": "target", "data": {"sourceType": "datasource", "targetType": "if-else", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target", "type": "custom", "source": "1752481129417", "target": "1752480460682", "sourceHandle": "24e47cad-f1e2-4f74-9884-3f49d5bb37b7", "targetHandle": "target", "data": {"sourceType": "if-else", "targetType": "tool", "isInLoop": false}, "zIndex": 0}, {"id": "1752481129417-false-1752481112180-target", "type": "custom", "source": "1752481129417", "target": "1752481112180", "sourceHandle": "false", "targetHandle": "target", "data": {"sourceType": "if-else", "targetType": "document-extractor", "isInLoop": false}, "zIndex": 0}, {"id": "1752480460682-source-1752482022496-target", "type": "custom", "source": "1752480460682", "sourceHandle": "source", "target": "1752482022496", "targetHandle": "target", "data": {"sourceType": "tool", "targetType": "variable-aggregator", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752481112180-source-1752482022496-target", "type": "custom", "source": "1752481112180", "target": "1752482022496", "sourceHandle": "source", "targetHandle": "target", "data": {"sourceType": "document-extractor", "targetType": "variable-aggregator", "isInLoop": false}, "zIndex": 0}, {"id": "1752482022496-source-1752482151668-target", "type": "custom", "source": "1752482022496", 
"sourceHandle": "source", "target": "1752482151668", "targetHandle": "target", "data": {"sourceType": "variable-aggregator", "targetType": "tool", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752482151668-source-1752477924228-target", "type": "custom", "source": "1752482151668", "sourceHandle": "source", "target": "1752477924228", "targetHandle": "target", "data": {"sourceType": "tool", "targetType": "knowledge-index", "isInIteration": false, "isInLoop": false}, "zIndex": 0}], "viewport": {"x": 701.4999626224237, "y": 128.33739021504016, "zoom": 0.48941689643726966}} \ No newline at end of file diff --git a/api/services/rag_pipeline/transform/web_crawl_general_economy.json b/api/services/rag_pipeline/transform/web_crawl_general_economy.json new file mode 100644 index 0000000000..7fa9ae95bd --- /dev/null +++ b/api/services/rag_pipeline/transform/web_crawl_general_economy.json @@ -0,0 +1 @@ +{"nodes": [{"data": {"chunk_structure": "text_model", "embedding_model": "text-embedding-ada-002", "embedding_model_provider": "langgenius/openai/openai", "index_chunk_variable_selector": ["1752569675978", "result"], "indexing_technique": "economy", "keyword_number": 10, "retrieval_model": {"score_threshold": 0.5, "score_threshold_enabled": false, "search_method": "keyword_search", "top_k": 3, "vector_setting": {"embedding_model_name": "text-embedding-ada-002", "embedding_provider_name": "langgenius/openai/openai"}}, "selected": false, "title": "\u77e5\u8bc6\u5e93", "type": "knowledge-index"}, "height": 114, "id": "1752477924228", "position": {"x": 2140.4053851189346, "y": 281.3910724383104}, "positionAbsolute": {"x": 2140.4053851189346, "y": 281.3910724383104}, "selected": true, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"data": {"datasource_configurations": {}, "datasource_label": "Jina Reader", "datasource_name": "jina_reader", "datasource_parameters": {"crawl_sub_pages": {"type": "mixed", "value": 
"{{#rag.1752491761974.jina_crawl_sub_pages#}}"}, "limit": {"type": "variable", "value": ["rag", "1752491761974", "jina_limit"]}, "url": {"type": "mixed", "value": "{{#rag.1752491761974.jina_url#}}"}, "use_sitemap": {"type": "mixed", "value": "{{#rag.1752491761974.jina_use_sitemap#}}"}}, "plugin_id": "langgenius/jina_datasource", "provider_name": "jina", "provider_type": "website_crawl", "selected": false, "title": "Jina Reader", "type": "datasource"}, "height": 52, "id": "1752491761974", "position": {"x": 1067.7526055798794, "y": 281.3910724383104}, "positionAbsolute": {"x": 1067.7526055798794, "y": 281.3910724383104}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"data": {"datasource_configurations": {}, "datasource_label": "Firecrawl", "datasource_name": "crawl", "datasource_parameters": {"crawl_subpages": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}"}, "exclude_paths": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_exclude_paths#}}"}, "include_paths": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_include_only_paths#}}"}, "limit": {"type": "variable", "value": ["rag", "1752565402678", "firecrawl_limit"]}, "max_depth": {"type": "variable", "value": ["rag", "1752565402678", "firecrawl_max_depth"]}, "only_main_content": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_extract_main_content#}}"}, "url": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_url#}}"}}, "plugin_id": "langgenius/firecrawl_datasource", "provider_name": "firecrawl", "provider_type": "website_crawl", "selected": false, "title": "Firecrawl", "type": "datasource"}, "height": 52, "id": "1752565402678", "position": {"x": 1067.7526055798794, "y": 417.32608398342404}, "positionAbsolute": {"x": 1067.7526055798794, "y": 417.32608398342404}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"data": {"output_type": 
"string", "selected": false, "title": "\u53d8\u91cf\u805a\u5408\u5668", "type": "variable-aggregator", "variables": [["1752491761974", "content"], ["1752565402678", "content"]]}, "height": 129, "id": "1752565435219", "position": {"x": 1505.4306671642219, "y": 281.3910724383104}, "positionAbsolute": {"x": 1505.4306671642219, "y": 281.3910724383104}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"id": "1752569675978", "type": "custom", "data": {"tool_parameters": {"input_variable": {"type": "mixed", "value": "{{#1752565435219.output#}}"}, "delimiter": {"type": "mixed", "value": "{{#rag.shared.delimiter#}}"}, "max_chunk_length": {"type": "variable", "value": ["rag", "shared", "max_chunk_length"]}, "chunk_overlap_length": {"type": "variable", "value": ["rag", "shared", "chunk_overlap"]}, "replace_consecutive_spaces_newlines_tabs": {"type": "mixed", "value": "{{#rag.shared.replace_consecutive_spaces#}}"}, "delete_all_urls_and_email_addresses": {"type": "mixed", "value": "{{#rag.shared.delete_urls_email#}}"}}, "tool_configurations": {}, "type": "tool", "title": "\u901a\u7528\u6587\u672c\u5206\u5757", "provider_id": "langgenius/general_chunk/general_chunk", "provider_type": "builtin", "provider_name": "langgenius/general_chunk/general_chunk", "tool_name": "general_chunk", "tool_label": "\u901a\u7528\u6587\u672c\u5206\u5757", "tool_description": "\u4e00\u4e2a\u7528\u4e8e\u901a\u7528\u6587\u672c\u5206\u5757\u6a21\u5f0f\u7684\u5de5\u5177\uff0c\u68c0\u7d22\u548c\u53ec\u56de\u7684\u5757\u662f\u76f8\u540c\u7684\u3002", "is_team_authorization": true, "output_schema": {"properties": {"result": {"description": "The result of the general chunk tool.", "properties": {"general_chunks": {"items": {"description": "The chunk of the text.", "type": "string"}, "type": "array"}}, "type": "object"}}, "type": "object"}, "paramSchemas": [{"name": "input_variable", "label": {"en_US": "Input Variable", "zh_Hans": 
"\u8f93\u5165\u53d8\u91cf", "pt_BR": "Input Variable", "ja_JP": "Input Variable"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "The text you want to chunk.", "zh_Hans": "\u4f60\u60f3\u8981\u5206\u5757\u7684\u6587\u672c\u3002", "pt_BR": "The text you want to chunk.", "ja_JP": "The text you want to chunk."}, "form": "llm", "llm_description": "The text you want to chunk."}, {"name": "delimiter", "label": {"en_US": "Delimiter", "zh_Hans": "\u5206\u9694\u7b26", "pt_BR": "Delimiter", "ja_JP": "Delimiter"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "The delimiter of the chunks.", "zh_Hans": "\u5757\u7684\u5206\u9694\u7b26\u3002", "pt_BR": "The delimiter of the chunks.", "ja_JP": "The delimiter of the chunks."}, "form": "llm", "llm_description": "The delimiter of the chunks, the format of the delimiter must be a string."}, {"name": "max_chunk_length", "label": {"en_US": "Maximum Chunk Length", "zh_Hans": "\u6700\u5927\u5757\u7684\u957f\u5ea6", "pt_BR": "Maximum Chunk Length", "ja_JP": "Maximum Chunk Length"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "number", "human_description": {"en_US": "The maximum chunk length.", "zh_Hans": "\u6700\u5927\u5757\u7684\u957f\u5ea6\u3002", "pt_BR": "The maximum chunk length.", "ja_JP": "The maximum chunk length."}, "form": "llm", "llm_description": "The maximum chunk length, the format of the chunk size must be an integer."}, {"name": "chunk_overlap_length", "label": {"en_US": "Chunk Overlap Length", "zh_Hans": "\u5757\u7684\u91cd\u53e0\u957f\u5ea6", 
"pt_BR": "Chunk Overlap Length", "ja_JP": "Chunk Overlap Length"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "number", "human_description": {"en_US": "The chunk overlap length.", "zh_Hans": "\u5757\u7684\u91cd\u53e0\u957f\u5ea6\u3002", "pt_BR": "The chunk overlap length.", "ja_JP": "The chunk overlap length."}, "form": "llm", "llm_description": "The chunk overlap length, the format of the chunk overlap length must be an integer."}, {"name": "replace_consecutive_spaces_newlines_tabs", "label": {"en_US": "Replace Consecutive Spaces, Newlines and Tabs", "zh_Hans": "\u66ff\u6362\u8fde\u7eed\u7684\u7a7a\u683c\u3001\u6362\u884c\u7b26\u548c\u5236\u8868\u7b26", "pt_BR": "Replace Consecutive Spaces, Newlines and Tabs", "ja_JP": "Replace Consecutive Spaces, Newlines and Tabs"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "boolean", "human_description": {"en_US": "Replace consecutive spaces, newlines and tabs", "zh_Hans": "\u66ff\u6362\u8fde\u7eed\u7684\u7a7a\u683c\u3001\u6362\u884c\u7b26\u548c\u5236\u8868\u7b26", "pt_BR": "Replace consecutive spaces, newlines and tabs", "ja_JP": "Replace consecutive spaces, newlines and tabs"}, "form": "llm", "llm_description": "Replace consecutive spaces, newlines and tabs, the format of the replace must be a boolean."}, {"name": "delete_all_urls_and_email_addresses", "label": {"en_US": "Delete All URLs and Email Addresses", "zh_Hans": "\u5220\u9664\u6240\u6709URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740", "pt_BR": "Delete All URLs and Email Addresses", "ja_JP": "Delete All URLs and Email Addresses"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": null, "min": null, "max": null, "precision": null, "options": 
[], "type": "boolean", "human_description": {"en_US": "Delete all URLs and email addresses", "zh_Hans": "\u5220\u9664\u6240\u6709URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740", "pt_BR": "Delete all URLs and email addresses", "ja_JP": "Delete all URLs and email addresses"}, "form": "llm", "llm_description": "Delete all URLs and email addresses, the format of the delete must be a boolean."}], "params": {"input_variable": "", "delimiter": "", "max_chunk_length": "", "chunk_overlap_length": "", "replace_consecutive_spaces_newlines_tabs": "", "delete_all_urls_and_email_addresses": ""}, "selected": false}, "position": {"x": 1807.4306671642219, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 1807.4306671642219, "y": 281.3910724383104}, "width": 242, "height": 52}], "edges": [{"data": {"isInIteration": false, "isInLoop": false, "sourceType": "datasource", "targetType": "variable-aggregator"}, "id": "1752491761974-source-1752565435219-target", "source": "1752491761974", "sourceHandle": "source", "target": "1752565435219", "targetHandle": "target", "type": "custom", "zIndex": 0}, {"data": {"isInLoop": false, "sourceType": "datasource", "targetType": "variable-aggregator"}, "id": "1752565402678-source-1752565435219-target", "source": "1752565402678", "sourceHandle": "source", "target": "1752565435219", "targetHandle": "target", "type": "custom", "zIndex": 0}, {"id": "1752565435219-source-1752569675978-target", "type": "custom", "source": "1752565435219", "sourceHandle": "source", "target": "1752569675978", "targetHandle": "target", "data": {"sourceType": "variable-aggregator", "targetType": "tool", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752569675978-source-1752477924228-target", "type": "custom", "source": "1752569675978", "target": "1752477924228", "sourceHandle": "source", "targetHandle": "target", "data": {"sourceType": "tool", "targetType": "knowledge-index", "isInLoop": false}, "zIndex": 
0}], "viewport": {"x": -707.721097109337, "y": -93.07807382100896, "zoom": 0.9350632198875476}} \ No newline at end of file diff --git a/api/services/rag_pipeline/transform/web_crawl_general_high_quality.json b/api/services/rag_pipeline/transform/web_crawl_general_high_quality.json new file mode 100644 index 0000000000..25dfad4873 --- /dev/null +++ b/api/services/rag_pipeline/transform/web_crawl_general_high_quality.json @@ -0,0 +1 @@ +{"nodes": [{"data": {"chunk_structure": "text_model", "embedding_model": "text-embedding-ada-002", "embedding_model_provider": "langgenius/openai/openai", "index_chunk_variable_selector": ["1752569675978", "result"], "indexing_technique": "high_quality", "keyword_number": 10, "retrieval_model": {"score_threshold": 0.5, "score_threshold_enabled": false, "search_method": "semantic_search", "top_k": 3, "vector_setting": {"embedding_model_name": "text-embedding-ada-002", "embedding_provider_name": "langgenius/openai/openai"}}, "selected": true, "title": "\u77e5\u8bc6\u5e93", "type": "knowledge-index"}, "height": 114, "id": "1752477924228", "position": {"x": 2140.4053851189346, "y": 281.3910724383104}, "positionAbsolute": {"x": 2140.4053851189346, "y": 281.3910724383104}, "selected": true, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"data": {"datasource_configurations": {}, "datasource_label": "Jina Reader", "datasource_name": "jina_reader", "datasource_parameters": {"crawl_sub_pages": {"type": "mixed", "value": "{{#rag.1752491761974.jina_crawl_sub_pages#}}"}, "limit": {"type": "variable", "value": ["rag", "1752491761974", "jina_limit"]}, "url": {"type": "mixed", "value": "{{#rag.1752491761974.jina_url#}}"}, "use_sitemap": {"type": "mixed", "value": "{{#rag.1752491761974.jina_use_sitemap#}}"}}, "plugin_id": "langgenius/jina_datasource", "provider_name": "jina", "provider_type": "website_crawl", "selected": false, "title": "Jina Reader", "type": "datasource"}, "height": 52, "id": "1752491761974", 
"position": {"x": 1067.7526055798794, "y": 281.3910724383104}, "positionAbsolute": {"x": 1067.7526055798794, "y": 281.3910724383104}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"data": {"datasource_configurations": {}, "datasource_label": "Firecrawl", "datasource_name": "crawl", "datasource_parameters": {"crawl_subpages": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}"}, "exclude_paths": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_exclude_paths#}}"}, "include_paths": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_include_only_paths#}}"}, "limit": {"type": "variable", "value": ["rag", "1752565402678", "firecrawl_limit"]}, "max_depth": {"type": "variable", "value": ["rag", "1752565402678", "firecrawl_max_depth"]}, "only_main_content": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_extract_main_content#}}"}, "url": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_url#}}"}}, "plugin_id": "langgenius/firecrawl_datasource", "provider_name": "firecrawl", "provider_type": "website_crawl", "selected": false, "title": "Firecrawl", "type": "datasource"}, "height": 52, "id": "1752565402678", "position": {"x": 1067.7526055798794, "y": 417.32608398342404}, "positionAbsolute": {"x": 1067.7526055798794, "y": 417.32608398342404}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"data": {"output_type": "string", "selected": false, "title": "\u53d8\u91cf\u805a\u5408\u5668", "type": "variable-aggregator", "variables": [["1752491761974", "content"], ["1752565402678", "content"]]}, "height": 129, "id": "1752565435219", "position": {"x": 1505.4306671642219, "y": 281.3910724383104}, "positionAbsolute": {"x": 1505.4306671642219, "y": 281.3910724383104}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"id": "1752569675978", "type": 
"custom", "data": {"tool_parameters": {"input_variable": {"type": "mixed", "value": "{{#1752565435219.output#}}"}, "delimiter": {"type": "mixed", "value": "{{#rag.shared.delimiter#}}"}, "max_chunk_length": {"type": "variable", "value": ["rag", "shared", "max_chunk_length"]}, "chunk_overlap_length": {"type": "variable", "value": ["rag", "shared", "chunk_overlap"]}, "replace_consecutive_spaces_newlines_tabs": {"type": "mixed", "value": "{{#rag.shared.replace_consecutive_spaces#}}"}, "delete_all_urls_and_email_addresses": {"type": "mixed", "value": "{{#rag.shared.delete_urls_email#}}"}}, "tool_configurations": {}, "type": "tool", "title": "\u901a\u7528\u6587\u672c\u5206\u5757", "provider_id": "langgenius/general_chunk/general_chunk", "provider_type": "builtin", "provider_name": "langgenius/general_chunk/general_chunk", "tool_name": "general_chunk", "tool_label": "\u901a\u7528\u6587\u672c\u5206\u5757", "tool_description": "\u4e00\u4e2a\u7528\u4e8e\u901a\u7528\u6587\u672c\u5206\u5757\u6a21\u5f0f\u7684\u5de5\u5177\uff0c\u68c0\u7d22\u548c\u53ec\u56de\u7684\u5757\u662f\u76f8\u540c\u7684\u3002", "is_team_authorization": true, "output_schema": {"properties": {"result": {"description": "The result of the general chunk tool.", "properties": {"general_chunks": {"items": {"description": "The chunk of the text.", "type": "string"}, "type": "array"}}, "type": "object"}}, "type": "object"}, "paramSchemas": [{"name": "input_variable", "label": {"en_US": "Input Variable", "zh_Hans": "\u8f93\u5165\u53d8\u91cf", "pt_BR": "Input Variable", "ja_JP": "Input Variable"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "The text you want to chunk.", "zh_Hans": "\u4f60\u60f3\u8981\u5206\u5757\u7684\u6587\u672c\u3002", "pt_BR": "The text you want to chunk.", "ja_JP": "The text you want to chunk."}, "form": "llm", 
"llm_description": "The text you want to chunk."}, {"name": "delimiter", "label": {"en_US": "Delimiter", "zh_Hans": "\u5206\u9694\u7b26", "pt_BR": "Delimiter", "ja_JP": "Delimiter"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "string", "human_description": {"en_US": "The delimiter of the chunks.", "zh_Hans": "\u5757\u7684\u5206\u9694\u7b26\u3002", "pt_BR": "The delimiter of the chunks.", "ja_JP": "The delimiter of the chunks."}, "form": "llm", "llm_description": "The delimiter of the chunks, the format of the delimiter must be a string."}, {"name": "max_chunk_length", "label": {"en_US": "Maximum Chunk Length", "zh_Hans": "\u6700\u5927\u5757\u7684\u957f\u5ea6", "pt_BR": "Maximum Chunk Length", "ja_JP": "Maximum Chunk Length"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": true, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "number", "human_description": {"en_US": "The maximum chunk length.", "zh_Hans": "\u6700\u5927\u5757\u7684\u957f\u5ea6\u3002", "pt_BR": "The maximum chunk length.", "ja_JP": "The maximum chunk length."}, "form": "llm", "llm_description": "The maximum chunk length, the format of the chunk size must be an integer."}, {"name": "chunk_overlap_length", "label": {"en_US": "Chunk Overlap Length", "zh_Hans": "\u5757\u7684\u91cd\u53e0\u957f\u5ea6", "pt_BR": "Chunk Overlap Length", "ja_JP": "Chunk Overlap Length"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "number", "human_description": {"en_US": "The chunk overlap length.", "zh_Hans": "\u5757\u7684\u91cd\u53e0\u957f\u5ea6\u3002", "pt_BR": "The chunk overlap length.", "ja_JP": "The chunk overlap length."}, "form": "llm", "llm_description": "The chunk 
overlap length, the format of the chunk overlap length must be an integer."}, {"name": "replace_consecutive_spaces_newlines_tabs", "label": {"en_US": "Replace Consecutive Spaces, Newlines and Tabs", "zh_Hans": "\u66ff\u6362\u8fde\u7eed\u7684\u7a7a\u683c\u3001\u6362\u884c\u7b26\u548c\u5236\u8868\u7b26", "pt_BR": "Replace Consecutive Spaces, Newlines and Tabs", "ja_JP": "Replace Consecutive Spaces, Newlines and Tabs"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "boolean", "human_description": {"en_US": "Replace consecutive spaces, newlines and tabs", "zh_Hans": "\u66ff\u6362\u8fde\u7eed\u7684\u7a7a\u683c\u3001\u6362\u884c\u7b26\u548c\u5236\u8868\u7b26", "pt_BR": "Replace consecutive spaces, newlines and tabs", "ja_JP": "Replace consecutive spaces, newlines and tabs"}, "form": "llm", "llm_description": "Replace consecutive spaces, newlines and tabs, the format of the replace must be a boolean."}, {"name": "delete_all_urls_and_email_addresses", "label": {"en_US": "Delete All URLs and Email Addresses", "zh_Hans": "\u5220\u9664\u6240\u6709URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740", "pt_BR": "Delete All URLs and Email Addresses", "ja_JP": "Delete All URLs and Email Addresses"}, "placeholder": null, "scope": null, "auto_generate": null, "template": null, "required": false, "default": null, "min": null, "max": null, "precision": null, "options": [], "type": "boolean", "human_description": {"en_US": "Delete all URLs and email addresses", "zh_Hans": "\u5220\u9664\u6240\u6709URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740", "pt_BR": "Delete all URLs and email addresses", "ja_JP": "Delete all URLs and email addresses"}, "form": "llm", "llm_description": "Delete all URLs and email addresses, the format of the delete must be a boolean."}], "params": {"input_variable": "", "delimiter": "", "max_chunk_length": "", "chunk_overlap_length": "", 
"replace_consecutive_spaces_newlines_tabs": "", "delete_all_urls_and_email_addresses": ""}, "selected": false}, "position": {"x": 1807.4306671642219, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 1807.4306671642219, "y": 281.3910724383104}, "width": 242, "height": 52}], "edges": [{"data": {"isInIteration": false, "isInLoop": false, "sourceType": "datasource", "targetType": "variable-aggregator"}, "id": "1752491761974-source-1752565435219-target", "source": "1752491761974", "sourceHandle": "source", "target": "1752565435219", "targetHandle": "target", "type": "custom", "zIndex": 0}, {"data": {"isInLoop": false, "sourceType": "datasource", "targetType": "variable-aggregator"}, "id": "1752565402678-source-1752565435219-target", "source": "1752565402678", "sourceHandle": "source", "target": "1752565435219", "targetHandle": "target", "type": "custom", "zIndex": 0}, {"id": "1752565435219-source-1752569675978-target", "type": "custom", "source": "1752565435219", "sourceHandle": "source", "target": "1752569675978", "targetHandle": "target", "data": {"sourceType": "variable-aggregator", "targetType": "tool", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752569675978-source-1752477924228-target", "type": "custom", "source": "1752569675978", "target": "1752477924228", "sourceHandle": "source", "targetHandle": "target", "data": {"sourceType": "tool", "targetType": "knowledge-index", "isInLoop": false}, "zIndex": 0}], "viewport": {"x": -707.721097109337, "y": -93.07807382100896, "zoom": 0.9350632198875476}} \ No newline at end of file diff --git a/api/services/rag_pipeline/transform/web_crawl_parent_child.json b/api/services/rag_pipeline/transform/web_crawl_parent_child.json new file mode 100644 index 0000000000..11a5a7cb48 --- /dev/null +++ b/api/services/rag_pipeline/transform/web_crawl_parent_child.json @@ -0,0 +1 @@ +{"nodes": [{"data": {"chunk_structure": "hierarchical_model", "embedding_model": 
"text-embedding-ada-002", "embedding_model_provider": "langgenius/openai/openai", "index_chunk_variable_selector": ["1752490343805", "result"], "indexing_technique": "high_quality", "keyword_number": 10, "retrieval_model": {"score_threshold": 0.5, "score_threshold_enabled": false, "search_method": "semantic_search", "top_k": 3, "vector_setting": {"embedding_model_name": "text-embedding-ada-002", "embedding_provider_name": "langgenius/openai/openai"}}, "selected": false, "title": "\u77e5\u8bc6\u5e93", "type": "knowledge-index"}, "height": 114, "id": "1752477924228", "position": {"x": 2215.5544306817387, "y": 281.3910724383104}, "positionAbsolute": {"x": 2215.5544306817387, "y": 281.3910724383104}, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"data": {"is_team_authorization": true, "output_schema": {"properties": {"result": {"description": "Parent child chunks result", "items": {"type": "object"}, "type": "array"}}, "type": "object"}, "paramSchemas": [{"auto_generate": null, "default": null, "form": "llm", "human_description": {"en_US": "The text you want to chunk.", "ja_JP": "The text you want to chunk.", "pt_BR": "The text you want to chunk.", "zh_Hans": "\u4f60\u60f3\u8981\u5206\u5757\u7684\u6587\u672c\u3002"}, "label": {"en_US": "Input text", "ja_JP": "Input text", "pt_BR": "Input text", "zh_Hans": "\u8f93\u5165\u6587\u672c"}, "llm_description": "The text you want to chunk.", "max": null, "min": null, "name": "input_text", "options": [], "placeholder": null, "precision": null, "required": true, "scope": null, "template": null, "type": "string"}, {"auto_generate": null, "default": 1024, "form": "llm", "human_description": {"en_US": "Maximum length for chunking", "ja_JP": "Maximum length for chunking", "pt_BR": "Comprimento m\u00e1ximo para divis\u00e3o", "zh_Hans": "\u7528\u4e8e\u5206\u5757\u7684\u6700\u5927\u957f\u5ea6"}, "label": {"en_US": "Maximum Length", "ja_JP": "Maximum Length", "pt_BR": 
"Comprimento M\u00e1ximo", "zh_Hans": "\u6700\u5927\u957f\u5ea6"}, "llm_description": "Maximum length allowed per chunk", "max": null, "min": null, "name": "max_length", "options": [], "placeholder": null, "precision": null, "required": false, "scope": null, "template": null, "type": "number"}, {"auto_generate": null, "default": "\n\n", "form": "llm", "human_description": {"en_US": "Separator used for chunking", "ja_JP": "Separator used for chunking", "pt_BR": "Separador usado para divis\u00e3o", "zh_Hans": "\u7528\u4e8e\u5206\u5757\u7684\u5206\u9694\u7b26"}, "label": {"en_US": "Chunk Separator", "ja_JP": "Chunk Separator", "pt_BR": "Separador de Divis\u00e3o", "zh_Hans": "\u5206\u5757\u5206\u9694\u7b26"}, "llm_description": "The separator used to split chunks", "max": null, "min": null, "name": "separator", "options": [], "placeholder": null, "precision": null, "required": false, "scope": null, "template": null, "type": "string"}, {"auto_generate": null, "default": 512, "form": "llm", "human_description": {"en_US": "Maximum length for subchunking", "ja_JP": "Maximum length for subchunking", "pt_BR": "Comprimento m\u00e1ximo para subdivis\u00e3o", "zh_Hans": "\u7528\u4e8e\u5b50\u5206\u5757\u7684\u6700\u5927\u957f\u5ea6"}, "label": {"en_US": "Subchunk Maximum Length", "ja_JP": "Subchunk Maximum Length", "pt_BR": "Comprimento M\u00e1ximo de Subdivis\u00e3o", "zh_Hans": "\u5b50\u5206\u5757\u6700\u5927\u957f\u5ea6"}, "llm_description": "Maximum length allowed per subchunk", "max": null, "min": null, "name": "subchunk_max_length", "options": [], "placeholder": null, "precision": null, "required": false, "scope": null, "template": null, "type": "number"}, {"auto_generate": null, "default": ". 
", "form": "llm", "human_description": {"en_US": "Separator used for subchunking", "ja_JP": "Separator used for subchunking", "pt_BR": "Separador usado para subdivis\u00e3o", "zh_Hans": "\u7528\u4e8e\u5b50\u5206\u5757\u7684\u5206\u9694\u7b26"}, "label": {"en_US": "Subchunk Separator", "ja_JP": "Subchunk Separator", "pt_BR": "Separador de Subdivis\u00e3o", "zh_Hans": "\u5b50\u5206\u5757\u5206\u9694\u7b26"}, "llm_description": "The separator used to split subchunks", "max": null, "min": null, "name": "subchunk_separator", "options": [], "placeholder": null, "precision": null, "required": false, "scope": null, "template": null, "type": "string"}, {"auto_generate": null, "default": "paragraph", "form": "llm", "human_description": {"en_US": "Split text into paragraphs based on separator and maximum chunk length, using split text as parent block or entire document as parent block and directly retrieve.", "ja_JP": "Split text into paragraphs based on separator and maximum chunk length, using split text as parent block or entire document as parent block and directly retrieve.", "pt_BR": "Dividir texto em par\u00e1grafos com base no separador e no comprimento m\u00e1ximo do bloco, usando o texto dividido como bloco pai ou documento completo como bloco pai e diretamente recuper\u00e1-lo.", "zh_Hans": "\u6839\u636e\u5206\u9694\u7b26\u548c\u6700\u5927\u5757\u957f\u5ea6\u5c06\u6587\u672c\u62c6\u5206\u4e3a\u6bb5\u843d\uff0c\u4f7f\u7528\u62c6\u5206\u6587\u672c\u4f5c\u4e3a\u68c0\u7d22\u7684\u7236\u5757\u6216\u6574\u4e2a\u6587\u6863\u7528\u4f5c\u7236\u5757\u5e76\u76f4\u63a5\u68c0\u7d22\u3002"}, "label": {"en_US": "Parent Mode", "ja_JP": "Parent Mode", "pt_BR": "Modo Pai", "zh_Hans": "\u7236\u5757\u6a21\u5f0f"}, "llm_description": "Split text into paragraphs based on separator and maximum chunk length, using split text as parent block or entire document as parent block and directly retrieve.", "max": null, "min": null, "name": "parent_mode", "options": [{"icon": "", "label": 
{"en_US": "Paragraph", "ja_JP": "Paragraph", "pt_BR": "Par\u00e1grafo", "zh_Hans": "\u6bb5\u843d"}, "value": "paragraph"}, {"icon": "", "label": {"en_US": "Full Document", "ja_JP": "Full Document", "pt_BR": "Documento Completo", "zh_Hans": "\u5168\u6587"}, "value": "full_doc"}], "placeholder": null, "precision": null, "required": true, "scope": null, "template": null, "type": "select"}, {"auto_generate": null, "default": 0, "form": "llm", "human_description": {"en_US": "Whether to remove extra spaces in the text", "ja_JP": "Whether to remove extra spaces in the text", "pt_BR": "Se deve remover espa\u00e7os extras no texto", "zh_Hans": "\u662f\u5426\u79fb\u9664\u6587\u672c\u4e2d\u7684\u591a\u4f59\u7a7a\u683c"}, "label": {"en_US": "Remove Extra Spaces", "ja_JP": "Remove Extra Spaces", "pt_BR": "Remover Espa\u00e7os Extras", "zh_Hans": "\u79fb\u9664\u591a\u4f59\u7a7a\u683c"}, "llm_description": "Whether to remove extra spaces in the text", "max": null, "min": null, "name": "remove_extra_spaces", "options": [], "placeholder": null, "precision": null, "required": false, "scope": null, "template": null, "type": "boolean"}, {"auto_generate": null, "default": 0, "form": "llm", "human_description": {"en_US": "Whether to remove URLs and emails in the text", "ja_JP": "Whether to remove URLs and emails in the text", "pt_BR": "Se deve remover URLs e e-mails no texto", "zh_Hans": "\u662f\u5426\u79fb\u9664\u6587\u672c\u4e2d\u7684URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740"}, "label": {"en_US": "Remove URLs and Emails", "ja_JP": "Remove URLs and Emails", "pt_BR": "Remover URLs e E-mails", "zh_Hans": "\u79fb\u9664URL\u548c\u7535\u5b50\u90ae\u4ef6\u5730\u5740"}, "llm_description": "Whether to remove URLs and emails in the text", "max": null, "min": null, "name": "remove_urls_emails", "options": [], "placeholder": null, "precision": null, "required": false, "scope": null, "template": null, "type": "boolean"}], "params": {"input_text": "", "max_length": "", "parent_mode": "", 
"remove_extra_spaces": "", "remove_urls_emails": "", "separator": "", "subchunk_max_length": "", "subchunk_separator": ""}, "provider_id": "langgenius/parent_child_chunk/parent_child_chunk", "provider_name": "langgenius/parent_child_chunk/parent_child_chunk", "provider_type": "builtin", "selected": true, "title": "\u7236\u5b50\u5206\u5757\u5904\u7406\u5668", "tool_configurations": {}, "tool_description": "\u5c06\u6587\u6863\u5904\u7406\u4e3a\u7236\u5b50\u5206\u5757\u7ed3\u6784", "tool_label": "\u7236\u5b50\u5206\u5757\u5904\u7406\u5668", "tool_name": "parent_child_chunk", "tool_parameters": {"input_text": {"type": "mixed", "value": "{{#1752565435219.output#}}"}, "max_length": {"type": "variable", "value": ["rag", "shared", "max_chunk_length"]}, "parent_mode": {"type": "variable", "value": ["rag", "shared", "parent_mode"]}, "separator": {"type": "mixed", "value": "{{#rag.shared.delimiter#}}"}, "subchunk_max_length": {"type": "variable", "value": ["rag", "shared", "child_max_chunk_length"]}, "subchunk_separator": {"type": "mixed", "value": "{{#rag.shared.child_delimiter#}}"}, "remove_extra_spaces": {"type": "mixed", "value": "{{#rag.shared.replace_consecutive_spaces#}}"}, "remove_urls_emails": {"type": "mixed", "value": "{{#rag.shared.delete_urls_email#}}"}}, "type": "tool"}, "height": 52, "id": "1752490343805", "position": {"x": 1853.5260563244174, "y": 281.3910724383104}, "positionAbsolute": {"x": 1853.5260563244174, "y": 281.3910724383104}, "selected": true, "sourcePosition": "right", "targetPosition": "left", "type": "custom", "width": 242}, {"id": "1752491761974", "type": "custom", "data": {"datasource_parameters": {"url": {"type": "mixed", "value": "{{#rag.1752491761974.jina_url#}}"}, "crawl_sub_pages": {"type": "mixed", "value": "{{#rag.1752491761974.jina_crawl_sub_pages#}}"}, "limit": {"type": "variable", "value": ["rag", "1752491761974", "jina_limit"]}, "use_sitemap": {"type": "mixed", "value": "{{#rag.1752491761974.jina_use_sitemap#}}"}}, 
"datasource_configurations": {}, "type": "datasource", "title": "Jina Reader", "plugin_id": "langgenius/jina_datasource", "provider_type": "website_crawl", "provider_name": "jina", "datasource_name": "jina_reader", "datasource_label": "Jina Reader", "selected": false}, "position": {"x": 1067.7526055798794, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 1067.7526055798794, "y": 281.3910724383104}, "width": 242, "height": 52, "selected": false}, {"id": "1752565402678", "type": "custom", "data": {"datasource_parameters": {"url": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_url#}}"}, "crawl_subpages": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}"}, "exclude_paths": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_exclude_paths#}}"}, "include_paths": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_include_only_paths#}}"}, "max_depth": {"type": "variable", "value": ["rag", "1752565402678", "firecrawl_max_depth"]}, "limit": {"type": "variable", "value": ["rag", "1752565402678", "firecrawl_limit"]}, "only_main_content": {"type": "mixed", "value": "{{#rag.1752565402678.firecrawl_extract_main_content#}}"}}, "datasource_configurations": {}, "type": "datasource", "title": "Firecrawl", "plugin_id": "langgenius/firecrawl_datasource", "provider_type": "website_crawl", "provider_name": "firecrawl", "datasource_name": "crawl", "datasource_label": "Firecrawl", "selected": false}, "position": {"x": 1067.7526055798794, "y": 417.32608398342404}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 1067.7526055798794, "y": 417.32608398342404}, "width": 242, "height": 52, "selected": false}, {"id": "1752565435219", "type": "custom", "data": {"output_type": "string", "variables": [["1752491761974", "content"], ["1752565402678", "content"]], "type": "variable-aggregator", "title": "\u53d8\u91cf\u805a\u5408\u5668", "selected": false}, 
"position": {"x": 1505.4306671642219, "y": 281.3910724383104}, "targetPosition": "left", "sourcePosition": "right", "positionAbsolute": {"x": 1505.4306671642219, "y": 281.3910724383104}, "width": 242, "height": 129, "selected": false}], "edges": [{"data": {"isInLoop": false, "sourceType": "tool", "targetType": "knowledge-index"}, "id": "1752490343805-source-1752477924228-target", "source": "1752490343805", "sourceHandle": "source", "target": "1752477924228", "targetHandle": "target", "type": "custom", "zIndex": 0}, {"id": "1752491761974-source-1752565435219-target", "type": "custom", "source": "1752491761974", "sourceHandle": "source", "target": "1752565435219", "targetHandle": "target", "data": {"sourceType": "datasource", "targetType": "variable-aggregator", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752565435219-source-1752490343805-target", "type": "custom", "source": "1752565435219", "sourceHandle": "source", "target": "1752490343805", "targetHandle": "target", "data": {"sourceType": "variable-aggregator", "targetType": "tool", "isInIteration": false, "isInLoop": false}, "zIndex": 0}, {"id": "1752565402678-source-1752565435219-target", "type": "custom", "source": "1752565402678", "target": "1752565435219", "sourceHandle": "source", "targetHandle": "target", "data": {"sourceType": "datasource", "targetType": "variable-aggregator", "isInLoop": false}, "zIndex": 0}], "viewport": {"x": -826.1791044466438, "y": -71.91725474841303, "zoom": 0.9980166672552107}} \ No newline at end of file diff --git a/api/services/rag_pipeline/transform/website-crawl-general-economy.yml b/api/services/rag_pipeline/transform/website-crawl-general-economy.yml new file mode 100644 index 0000000000..acdc0fe52a --- /dev/null +++ b/api/services/rag_pipeline/transform/website-crawl-general-economy.yml @@ -0,0 +1,666 @@ +dependencies: +- current_identifier: null + type: package + value: + plugin_unique_identifier: 
langgenius/general_chunk:0.0.2@0856fa64f7b0dc937c982f12d45b3a1ad91ba8aacc0d28a1b436e6c94a77e298 +kind: rag_pipeline +rag_pipeline: + description: '' + icon: 📙 + icon_background: '' + icon_type: emoji + name: website-crawl-general-economy +version: 0.1.0 +workflow: + conversation_variables: [] + environment_variables: [] + features: {} + graph: + edges: + - data: + isInIteration: false + isInLoop: false + sourceType: datasource + targetType: variable-aggregator + id: 1752491761974-source-1752565435219-target + source: '1752491761974' + sourceHandle: source + target: '1752565435219' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: datasource + targetType: variable-aggregator + id: 1752565402678-source-1752565435219-target + source: '1752565402678' + sourceHandle: source + target: '1752565435219' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: variable-aggregator + targetType: tool + id: 1752565435219-source-1752569675978-target + source: '1752565435219' + sourceHandle: source + target: '1752569675978' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: tool + targetType: knowledge-index + id: 1752569675978-source-1752477924228-target + source: '1752569675978' + sourceHandle: source + target: '1752477924228' + targetHandle: target + type: custom + zIndex: 0 + nodes: + - data: + chunk_structure: text_model + embedding_model: text-embedding-ada-002 + embedding_model_provider: langgenius/openai/openai + index_chunk_variable_selector: + - '1752569675978' + - result + indexing_technique: economy + keyword_number: 10 + retrieval_model: + score_threshold: 0.5 + score_threshold_enabled: false + search_method: keyword_search + top_k: 3 + vector_setting: + embedding_model_name: text-embedding-ada-002 + embedding_provider_name: langgenius/openai/openai + selected: true + title: 知识库 + type: knowledge-index + height: 114 + id: 
'1752477924228' + position: + x: 2140.4053851189346 + y: 281.3910724383104 + positionAbsolute: + x: 2140.4053851189346 + y: 281.3910724383104 + selected: true + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + datasource_configurations: {} + datasource_label: Jina Reader + datasource_name: jina_reader + datasource_parameters: + crawl_sub_pages: + type: mixed + value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}' + limit: + type: variable + value: + - rag + - '1752491761974' + - jina_limit + url: + type: mixed + value: '{{#rag.1752491761974.jina_url#}}' + use_sitemap: + type: mixed + value: '{{#rag.1752491761974.jina_use_sitemap#}}' + plugin_id: langgenius/jina_datasource + provider_name: jina + provider_type: website_crawl + selected: false + title: Jina Reader + type: datasource + height: 52 + id: '1752491761974' + position: + x: 1067.7526055798794 + y: 281.3910724383104 + positionAbsolute: + x: 1067.7526055798794 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + datasource_configurations: {} + datasource_label: Firecrawl + datasource_name: crawl + datasource_parameters: + crawl_subpages: + type: mixed + value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}' + exclude_paths: + type: mixed + value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}' + include_paths: + type: mixed + value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}' + limit: + type: variable + value: + - rag + - '1752565402678' + - firecrawl_limit + max_depth: + type: variable + value: + - rag + - '1752565402678' + - firecrawl_max_depth + only_main_content: + type: mixed + value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}' + url: + type: mixed + value: '{{#rag.1752565402678.firecrawl_url#}}' + plugin_id: langgenius/firecrawl_datasource + provider_name: firecrawl + provider_type: website_crawl + selected: false + title: Firecrawl + type: datasource + height: 52 
+ id: '1752565402678' + position: + x: 1067.7526055798794 + y: 417.32608398342404 + positionAbsolute: + x: 1067.7526055798794 + y: 417.32608398342404 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + output_type: string + selected: false + title: 变量聚合器 + type: variable-aggregator + variables: + - - '1752491761974' + - content + - - '1752565402678' + - content + height: 129 + id: '1752565435219' + position: + x: 1505.4306671642219 + y: 281.3910724383104 + positionAbsolute: + x: 1505.4306671642219 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + output_schema: + properties: + result: + description: The result of the general chunk tool. + properties: + general_chunks: + items: + description: The chunk of the text. + type: string + type: array + type: object + type: object + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: The text you want to chunk. + ja_JP: The text you want to chunk. + pt_BR: The text you want to chunk. + zh_Hans: 你想要分块的文本。 + label: + en_US: Input Variable + ja_JP: Input Variable + pt_BR: Input Variable + zh_Hans: 输入变量 + llm_description: The text you want to chunk. + max: null + min: null + name: input_variable + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: null + form: llm + human_description: + en_US: The delimiter of the chunks. + ja_JP: The delimiter of the chunks. + pt_BR: The delimiter of the chunks. + zh_Hans: 块的分隔符。 + label: + en_US: Delimiter + ja_JP: Delimiter + pt_BR: Delimiter + zh_Hans: 分隔符 + llm_description: The delimiter of the chunks, the format of the delimiter + must be a string. 
+ max: null + min: null + name: delimiter + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: null + form: llm + human_description: + en_US: The maximum chunk length. + ja_JP: The maximum chunk length. + pt_BR: The maximum chunk length. + zh_Hans: 最大块的长度。 + label: + en_US: Maximum Chunk Length + ja_JP: Maximum Chunk Length + pt_BR: Maximum Chunk Length + zh_Hans: 最大块的长度 + llm_description: The maximum chunk length, the format of the chunk size + must be an integer. + max: null + min: null + name: max_chunk_length + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: number + - auto_generate: null + default: null + form: llm + human_description: + en_US: The chunk overlap length. + ja_JP: The chunk overlap length. + pt_BR: The chunk overlap length. + zh_Hans: 块的重叠长度。 + label: + en_US: Chunk Overlap Length + ja_JP: Chunk Overlap Length + pt_BR: Chunk Overlap Length + zh_Hans: 块的重叠长度 + llm_description: The chunk overlap length, the format of the chunk overlap + length must be an integer. + max: null + min: null + name: chunk_overlap_length + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: number + - auto_generate: null + default: null + form: llm + human_description: + en_US: Replace consecutive spaces, newlines and tabs + ja_JP: Replace consecutive spaces, newlines and tabs + pt_BR: Replace consecutive spaces, newlines and tabs + zh_Hans: 替换连续的空格、换行符和制表符 + label: + en_US: Replace Consecutive Spaces, Newlines and Tabs + ja_JP: Replace Consecutive Spaces, Newlines and Tabs + pt_BR: Replace Consecutive Spaces, Newlines and Tabs + zh_Hans: 替换连续的空格、换行符和制表符 + llm_description: Replace consecutive spaces, newlines and tabs, the format + of the replace must be a boolean. 
+ max: null + min: null + name: replace_consecutive_spaces_newlines_tabs + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + - auto_generate: null + default: null + form: llm + human_description: + en_US: Delete all URLs and email addresses + ja_JP: Delete all URLs and email addresses + pt_BR: Delete all URLs and email addresses + zh_Hans: 删除所有URL和电子邮件地址 + label: + en_US: Delete All URLs and Email Addresses + ja_JP: Delete All URLs and Email Addresses + pt_BR: Delete All URLs and Email Addresses + zh_Hans: 删除所有URL和电子邮件地址 + llm_description: Delete all URLs and email addresses, the format of the + delete must be a boolean. + max: null + min: null + name: delete_all_urls_and_email_addresses + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + params: + chunk_overlap_length: '' + delete_all_urls_and_email_addresses: '' + delimiter: '' + input_variable: '' + max_chunk_length: '' + replace_consecutive_spaces_newlines_tabs: '' + provider_id: langgenius/general_chunk/general_chunk + provider_name: langgenius/general_chunk/general_chunk + provider_type: builtin + selected: false + title: 通用文本分块 + tool_configurations: {} + tool_description: 一个用于通用文本分块模式的工具,检索和召回的块是相同的。 + tool_label: 通用文本分块 + tool_name: general_chunk + tool_parameters: + chunk_overlap_length: + type: variable + value: + - rag + - shared + - chunk_overlap + delete_all_urls_and_email_addresses: + type: mixed + value: '{{#rag.shared.delete_urls_email#}}' + delimiter: + type: mixed + value: '{{#rag.shared.delimiter#}}' + input_variable: + type: mixed + value: '{{#1752565435219.output#}}' + max_chunk_length: + type: variable + value: + - rag + - shared + - max_chunk_length + replace_consecutive_spaces_newlines_tabs: + type: mixed + value: '{{#rag.shared.replace_consecutive_spaces#}}' + type: tool + height: 52 + id: '1752569675978' + position: + x: 1807.4306671642219 + y: 
281.3910724383104 + positionAbsolute: + x: 1807.4306671642219 + y: 281.3910724383104 + sourcePosition: right + targetPosition: left + type: custom + width: 242 + viewport: + x: -707.721097109337 + y: -93.07807382100896 + zoom: 0.9350632198875476 + rag_pipeline_variables: + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752491761974' + default_value: null + label: URL + max_length: 256 + options: [] + placeholder: https://docs.dify.ai/en/ + required: true + tooltips: null + type: text-input + unit: null + variable: jina_url + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752491761974' + default_value: 10 + label: Limit + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: null + variable: jina_limit + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752491761974' + default_value: null + label: Crawl sub-pages + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: jina_crawl_sub_pages + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752491761974' + default_value: null + label: Use sitemap + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl + iteratively based on page relevance, yielding fewer but higher-quality pages. 
+ type: checkbox + unit: null + variable: jina_use_sitemap + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: URL + max_length: 256 + options: [] + placeholder: https://docs.dify.ai/en/ + required: true + tooltips: null + type: text-input + unit: null + variable: firecrawl_url + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: true + label: Crawl sub-pages + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: firecrawl_crawl_sub_pages + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: 10 + label: Limit + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: null + variable: firecrawl_limit + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: Max depth + max_length: 48 + options: [] + placeholder: '' + required: false + tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes + the page of the entered url, depth 1 scrapes the url and everything after enteredURL + + one /, and so on. 
+ type: number + unit: null + variable: firecrawl_max_depth + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: Exclude paths + max_length: 256 + options: [] + placeholder: blog/*, /about/* + required: false + tooltips: null + type: text-input + unit: null + variable: firecrawl_exclude_paths + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: Include only paths + max_length: 256 + options: [] + placeholder: articles/* + required: false + tooltips: null + type: text-input + unit: null + variable: firecrawl_include_only_paths + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: firecrawl_extract_main_content + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: firecrawl_extract_main_content + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: \n\n + label: delimiter + max_length: 100 + options: [] + placeholder: null + required: true + tooltips: A delimiter is the character used to separate text. \n\n is recommended + for splitting the original document into large parent chunks. You can also use + special delimiters defined by yourself. 
+ type: text-input + unit: null + variable: delimiter + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: 1024 + label: Maximum chunk length + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: characters + variable: max_chunk_length + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: 50 + label: chunk_overlap + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: Setting the chunk overlap can maintain the semantic relevance between + them, enhancing the retrieve effect. It is recommended to set 10%–25% of the + maximum chunk size. + type: number + unit: characters + variable: chunk_overlap + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: replace_consecutive_spaces + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: replace_consecutive_spaces + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Delete all URLs and email addresses + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: delete_urls_email diff --git a/api/services/rag_pipeline/transform/website-crawl-general-high-quality.yml b/api/services/rag_pipeline/transform/website-crawl-general-high-quality.yml new file mode 100644 index 0000000000..35e6fa5e8f --- /dev/null +++ b/api/services/rag_pipeline/transform/website-crawl-general-high-quality.yml @@ -0,0 +1,666 @@ +dependencies: +- current_identifier: null + type: package + value: + plugin_unique_identifier: 
langgenius/general_chunk:0.0.2@0856fa64f7b0dc937c982f12d45b3a1ad91ba8aacc0d28a1b436e6c94a77e298 +kind: rag_pipeline +rag_pipeline: + description: '' + icon: 📙 + icon_background: '#FFF4ED' + icon_type: emoji + name: website-crawl-general-high-quality +version: 0.1.0 +workflow: + conversation_variables: [] + environment_variables: [] + features: {} + graph: + edges: + - data: + isInIteration: false + isInLoop: false + sourceType: datasource + targetType: variable-aggregator + id: 1752491761974-source-1752565435219-target + source: '1752491761974' + sourceHandle: source + target: '1752565435219' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: datasource + targetType: variable-aggregator + id: 1752565402678-source-1752565435219-target + source: '1752565402678' + sourceHandle: source + target: '1752565435219' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: variable-aggregator + targetType: tool + id: 1752565435219-source-1752569675978-target + source: '1752565435219' + sourceHandle: source + target: '1752569675978' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: tool + targetType: knowledge-index + id: 1752569675978-source-1752477924228-target + source: '1752569675978' + sourceHandle: source + target: '1752477924228' + targetHandle: target + type: custom + zIndex: 0 + nodes: + - data: + chunk_structure: text_model + embedding_model: text-embedding-ada-002 + embedding_model_provider: langgenius/openai/openai + index_chunk_variable_selector: + - '1752569675978' + - result + indexing_technique: high_quality + keyword_number: 10 + retrieval_model: + score_threshold: 0.5 + score_threshold_enabled: false + search_method: semantic_search + top_k: 3 + vector_setting: + embedding_model_name: text-embedding-ada-002 + embedding_provider_name: langgenius/openai/openai + selected: false + title: 知识库 + type: knowledge-index + 
height: 114 + id: '1752477924228' + position: + x: 2140.4053851189346 + y: 281.3910724383104 + positionAbsolute: + x: 2140.4053851189346 + y: 281.3910724383104 + selected: true + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + datasource_configurations: {} + datasource_label: Jina Reader + datasource_name: jina_reader + datasource_parameters: + crawl_sub_pages: + type: mixed + value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}' + limit: + type: variable + value: + - rag + - '1752491761974' + - jina_limit + url: + type: mixed + value: '{{#rag.1752491761974.jina_url#}}' + use_sitemap: + type: mixed + value: '{{#rag.1752491761974.jina_use_sitemap#}}' + plugin_id: langgenius/jina_datasource + provider_name: jina + provider_type: website_crawl + selected: false + title: Jina Reader + type: datasource + height: 52 + id: '1752491761974' + position: + x: 1067.7526055798794 + y: 281.3910724383104 + positionAbsolute: + x: 1067.7526055798794 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + datasource_configurations: {} + datasource_label: Firecrawl + datasource_name: crawl + datasource_parameters: + crawl_subpages: + type: mixed + value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}' + exclude_paths: + type: mixed + value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}' + include_paths: + type: mixed + value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}' + limit: + type: variable + value: + - rag + - '1752565402678' + - firecrawl_limit + max_depth: + type: variable + value: + - rag + - '1752565402678' + - firecrawl_max_depth + only_main_content: + type: mixed + value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}' + url: + type: mixed + value: '{{#rag.1752565402678.firecrawl_url#}}' + plugin_id: langgenius/firecrawl_datasource + provider_name: firecrawl + provider_type: website_crawl + selected: false + title: Firecrawl + type: 
datasource + height: 52 + id: '1752565402678' + position: + x: 1067.7526055798794 + y: 417.32608398342404 + positionAbsolute: + x: 1067.7526055798794 + y: 417.32608398342404 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + output_type: string + selected: false + title: 变量聚合器 + type: variable-aggregator + variables: + - - '1752491761974' + - content + - - '1752565402678' + - content + height: 129 + id: '1752565435219' + position: + x: 1505.4306671642219 + y: 281.3910724383104 + positionAbsolute: + x: 1505.4306671642219 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + output_schema: + properties: + result: + description: The result of the general chunk tool. + properties: + general_chunks: + items: + description: The chunk of the text. + type: string + type: array + type: object + type: object + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: The text you want to chunk. + ja_JP: The text you want to chunk. + pt_BR: The text you want to chunk. + zh_Hans: 你想要分块的文本。 + label: + en_US: Input Variable + ja_JP: Input Variable + pt_BR: Input Variable + zh_Hans: 输入变量 + llm_description: The text you want to chunk. + max: null + min: null + name: input_variable + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: null + form: llm + human_description: + en_US: The delimiter of the chunks. + ja_JP: The delimiter of the chunks. + pt_BR: The delimiter of the chunks. + zh_Hans: 块的分隔符。 + label: + en_US: Delimiter + ja_JP: Delimiter + pt_BR: Delimiter + zh_Hans: 分隔符 + llm_description: The delimiter of the chunks, the format of the delimiter + must be a string. 
+ max: null + min: null + name: delimiter + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: null + form: llm + human_description: + en_US: The maximum chunk length. + ja_JP: The maximum chunk length. + pt_BR: The maximum chunk length. + zh_Hans: 最大块的长度。 + label: + en_US: Maximum Chunk Length + ja_JP: Maximum Chunk Length + pt_BR: Maximum Chunk Length + zh_Hans: 最大块的长度 + llm_description: The maximum chunk length, the format of the chunk size + must be an integer. + max: null + min: null + name: max_chunk_length + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: number + - auto_generate: null + default: null + form: llm + human_description: + en_US: The chunk overlap length. + ja_JP: The chunk overlap length. + pt_BR: The chunk overlap length. + zh_Hans: 块的重叠长度。 + label: + en_US: Chunk Overlap Length + ja_JP: Chunk Overlap Length + pt_BR: Chunk Overlap Length + zh_Hans: 块的重叠长度 + llm_description: The chunk overlap length, the format of the chunk overlap + length must be an integer. + max: null + min: null + name: chunk_overlap_length + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: number + - auto_generate: null + default: null + form: llm + human_description: + en_US: Replace consecutive spaces, newlines and tabs + ja_JP: Replace consecutive spaces, newlines and tabs + pt_BR: Replace consecutive spaces, newlines and tabs + zh_Hans: 替换连续的空格、换行符和制表符 + label: + en_US: Replace Consecutive Spaces, Newlines and Tabs + ja_JP: Replace Consecutive Spaces, Newlines and Tabs + pt_BR: Replace Consecutive Spaces, Newlines and Tabs + zh_Hans: 替换连续的空格、换行符和制表符 + llm_description: Replace consecutive spaces, newlines and tabs, the format + of the replace must be a boolean. 
+ max: null + min: null + name: replace_consecutive_spaces_newlines_tabs + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + - auto_generate: null + default: null + form: llm + human_description: + en_US: Delete all URLs and email addresses + ja_JP: Delete all URLs and email addresses + pt_BR: Delete all URLs and email addresses + zh_Hans: 删除所有URL和电子邮件地址 + label: + en_US: Delete All URLs and Email Addresses + ja_JP: Delete All URLs and Email Addresses + pt_BR: Delete All URLs and Email Addresses + zh_Hans: 删除所有URL和电子邮件地址 + llm_description: Delete all URLs and email addresses, the format of the + delete must be a boolean. + max: null + min: null + name: delete_all_urls_and_email_addresses + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + params: + chunk_overlap_length: '' + delete_all_urls_and_email_addresses: '' + delimiter: '' + input_variable: '' + max_chunk_length: '' + replace_consecutive_spaces_newlines_tabs: '' + provider_id: langgenius/general_chunk/general_chunk + provider_name: langgenius/general_chunk/general_chunk + provider_type: builtin + selected: false + title: 通用文本分块 + tool_configurations: {} + tool_description: 一个用于通用文本分块模式的工具,检索和召回的块是相同的。 + tool_label: 通用文本分块 + tool_name: general_chunk + tool_parameters: + chunk_overlap_length: + type: variable + value: + - rag + - shared + - chunk_overlap + delete_all_urls_and_email_addresses: + type: mixed + value: '{{#rag.shared.delete_urls_email#}}' + delimiter: + type: mixed + value: '{{#rag.shared.delimiter#}}' + input_variable: + type: mixed + value: '{{#1752565435219.output#}}' + max_chunk_length: + type: variable + value: + - rag + - shared + - max_chunk_length + replace_consecutive_spaces_newlines_tabs: + type: mixed + value: '{{#rag.shared.replace_consecutive_spaces#}}' + type: tool + height: 52 + id: '1752569675978' + position: + x: 1807.4306671642219 + y: 
281.3910724383104 + positionAbsolute: + x: 1807.4306671642219 + y: 281.3910724383104 + sourcePosition: right + targetPosition: left + type: custom + width: 242 + viewport: + x: -707.721097109337 + y: -93.07807382100896 + zoom: 0.9350632198875476 + rag_pipeline_variables: + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752491761974' + default_value: null + label: URL + max_length: 256 + options: [] + placeholder: https://docs.dify.ai/en/ + required: true + tooltips: null + type: text-input + unit: null + variable: jina_url + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752491761974' + default_value: 10 + label: Limit + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: null + variable: jina_limit + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752491761974' + default_value: null + label: Crawl sub-pages + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: jina_crawl_sub_pages + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752491761974' + default_value: null + label: Use sitemap + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl + iteratively based on page relevance, yielding fewer but higher-quality pages. 
+ type: checkbox + unit: null + variable: jina_use_sitemap + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: URL + max_length: 256 + options: [] + placeholder: https://docs.dify.ai/en/ + required: true + tooltips: null + type: text-input + unit: null + variable: firecrawl_url + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: true + label: Crawl sub-pages + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: firecrawl_crawl_sub_pages + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: 10 + label: Limit + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: null + variable: firecrawl_limit + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: Max depth + max_length: 48 + options: [] + placeholder: '' + required: false + tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes + the page of the entered url, depth 1 scrapes the url and everything after enteredURL + + one /, and so on. 
+ type: number + unit: null + variable: firecrawl_max_depth + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: Exclude paths + max_length: 256 + options: [] + placeholder: blog/*, /about/* + required: false + tooltips: null + type: text-input + unit: null + variable: firecrawl_exclude_paths + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: Include only paths + max_length: 256 + options: [] + placeholder: articles/* + required: false + tooltips: null + type: text-input + unit: null + variable: firecrawl_include_only_paths + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: firecrawl_extract_main_content + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: firecrawl_extract_main_content + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: \n\n + label: delimiter + max_length: 100 + options: [] + placeholder: null + required: true + tooltips: A delimiter is the character used to separate text. \n\n is recommended + for splitting the original document into large parent chunks. You can also use + special delimiters defined by yourself. 
+ type: text-input + unit: null + variable: delimiter + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: 1024 + label: Maximum chunk length + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: characters + variable: max_chunk_length + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: 50 + label: chunk_overlap + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: Setting the chunk overlap can maintain the semantic relevance between + them, enhancing the retrieve effect. It is recommended to set 10%–25% of the + maximum chunk size. + type: number + unit: characters + variable: chunk_overlap + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: replace_consecutive_spaces + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: replace_consecutive_spaces + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Delete all URLs and email addresses + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: delete_urls_email diff --git a/api/services/rag_pipeline/transform/website-crawl-parentchild.yml b/api/services/rag_pipeline/transform/website-crawl-parentchild.yml new file mode 100644 index 0000000000..521bb67c38 --- /dev/null +++ b/api/services/rag_pipeline/transform/website-crawl-parentchild.yml @@ -0,0 +1,772 @@ +dependencies: +- current_identifier: null + type: package + value: + plugin_unique_identifier: 
langgenius/parent_child_chunk:0.0.1@f8f9ba1f3bcda159ebc0168baa755c2181b923da8157ebb439b8046019f5b510 +kind: rag_pipeline +rag_pipeline: + description: '' + icon: 📙 + icon_background: '' + icon_type: emoji + name: website-crawl-parentchild +version: 0.1.0 +workflow: + conversation_variables: [] + environment_variables: [] + features: {} + graph: + edges: + - data: + isInLoop: false + sourceType: tool + targetType: knowledge-index + id: 1752490343805-source-1752477924228-target + source: '1752490343805' + sourceHandle: source + target: '1752477924228' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: datasource + targetType: variable-aggregator + id: 1752491761974-source-1752565435219-target + source: '1752491761974' + sourceHandle: source + target: '1752565435219' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInIteration: false + isInLoop: false + sourceType: variable-aggregator + targetType: tool + id: 1752565435219-source-1752490343805-target + source: '1752565435219' + sourceHandle: source + target: '1752490343805' + targetHandle: target + type: custom + zIndex: 0 + - data: + isInLoop: false + sourceType: datasource + targetType: variable-aggregator + id: 1752565402678-source-1752565435219-target + source: '1752565402678' + sourceHandle: source + target: '1752565435219' + targetHandle: target + type: custom + zIndex: 0 + nodes: + - data: + chunk_structure: hierarchical_model + embedding_model: text-embedding-ada-002 + embedding_model_provider: langgenius/openai/openai + index_chunk_variable_selector: + - '1752490343805' + - result + indexing_technique: high_quality + keyword_number: 10 + retrieval_model: + score_threshold: 0.5 + score_threshold_enabled: false + search_method: semantic_search + top_k: 3 + vector_setting: + embedding_model_name: text-embedding-ada-002 + embedding_provider_name: langgenius/openai/openai + selected: false + title: 知识库 + type: knowledge-index + height: 
114 + id: '1752477924228' + position: + x: 2215.5544306817387 + y: 281.3910724383104 + positionAbsolute: + x: 2215.5544306817387 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + is_team_authorization: true + output_schema: + properties: + result: + description: Parent child chunks result + items: + type: object + type: array + type: object + paramSchemas: + - auto_generate: null + default: null + form: llm + human_description: + en_US: The text you want to chunk. + ja_JP: The text you want to chunk. + pt_BR: The text you want to chunk. + zh_Hans: 你想要分块的文本。 + label: + en_US: Input text + ja_JP: Input text + pt_BR: Input text + zh_Hans: 输入文本 + llm_description: The text you want to chunk. + max: null + min: null + name: input_text + options: [] + placeholder: null + precision: null + required: true + scope: null + template: null + type: string + - auto_generate: null + default: 1024 + form: llm + human_description: + en_US: Maximum length for chunking + ja_JP: Maximum length for chunking + pt_BR: Comprimento máximo para divisão + zh_Hans: 用于分块的最大长度 + label: + en_US: Maximum Length + ja_JP: Maximum Length + pt_BR: Comprimento Máximo + zh_Hans: 最大长度 + llm_description: Maximum length allowed per chunk + max: null + min: null + name: max_length + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: number + - auto_generate: null + default: ' + + + ' + form: llm + human_description: + en_US: Separator used for chunking + ja_JP: Separator used for chunking + pt_BR: Separador usado para divisão + zh_Hans: 用于分块的分隔符 + label: + en_US: Chunk Separator + ja_JP: Chunk Separator + pt_BR: Separador de Divisão + zh_Hans: 分块分隔符 + llm_description: The separator used to split chunks + max: null + min: null + name: separator + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: string + - auto_generate: 
null + default: 512 + form: llm + human_description: + en_US: Maximum length for subchunking + ja_JP: Maximum length for subchunking + pt_BR: Comprimento máximo para subdivisão + zh_Hans: 用于子分块的最大长度 + label: + en_US: Subchunk Maximum Length + ja_JP: Subchunk Maximum Length + pt_BR: Comprimento Máximo de Subdivisão + zh_Hans: 子分块最大长度 + llm_description: Maximum length allowed per subchunk + max: null + min: null + name: subchunk_max_length + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: number + - auto_generate: null + default: '. ' + form: llm + human_description: + en_US: Separator used for subchunking + ja_JP: Separator used for subchunking + pt_BR: Separador usado para subdivisão + zh_Hans: 用于子分块的分隔符 + label: + en_US: Subchunk Separator + ja_JP: Subchunk Separator + pt_BR: Separador de Subdivisão + zh_Hans: 子分块分隔符 + llm_description: The separator used to split subchunks + max: null + min: null + name: subchunk_separator + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: string + - auto_generate: null + default: paragraph + form: llm + human_description: + en_US: Split text into paragraphs based on separator and maximum chunk + length, using split text as parent block or entire document as parent + block and directly retrieve. + ja_JP: Split text into paragraphs based on separator and maximum chunk + length, using split text as parent block or entire document as parent + block and directly retrieve. + pt_BR: Dividir texto em parágrafos com base no separador e no comprimento + máximo do bloco, usando o texto dividido como bloco pai ou documento + completo como bloco pai e diretamente recuperá-lo. 
+ zh_Hans: 根据分隔符和最大块长度将文本拆分为段落,使用拆分文本作为检索的父块或整个文档用作父块并直接检索。 + label: + en_US: Parent Mode + ja_JP: Parent Mode + pt_BR: Modo Pai + zh_Hans: 父块模式 + llm_description: Split text into paragraphs based on separator and maximum + chunk length, using split text as parent block or entire document as parent + block and directly retrieve. + max: null + min: null + name: parent_mode + options: + - icon: '' + label: + en_US: Paragraph + ja_JP: Paragraph + pt_BR: Parágrafo + zh_Hans: 段落 + value: paragraph + - icon: '' + label: + en_US: Full Document + ja_JP: Full Document + pt_BR: Documento Completo + zh_Hans: 全文 + value: full_doc + placeholder: null + precision: null + required: true + scope: null + template: null + type: select + - auto_generate: null + default: 0 + form: llm + human_description: + en_US: Whether to remove extra spaces in the text + ja_JP: Whether to remove extra spaces in the text + pt_BR: Se deve remover espaços extras no texto + zh_Hans: 是否移除文本中的多余空格 + label: + en_US: Remove Extra Spaces + ja_JP: Remove Extra Spaces + pt_BR: Remover Espaços Extras + zh_Hans: 移除多余空格 + llm_description: Whether to remove extra spaces in the text + max: null + min: null + name: remove_extra_spaces + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + - auto_generate: null + default: 0 + form: llm + human_description: + en_US: Whether to remove URLs and emails in the text + ja_JP: Whether to remove URLs and emails in the text + pt_BR: Se deve remover URLs e e-mails no texto + zh_Hans: 是否移除文本中的URL和电子邮件地址 + label: + en_US: Remove URLs and Emails + ja_JP: Remove URLs and Emails + pt_BR: Remover URLs e E-mails + zh_Hans: 移除URL和电子邮件地址 + llm_description: Whether to remove URLs and emails in the text + max: null + min: null + name: remove_urls_emails + options: [] + placeholder: null + precision: null + required: false + scope: null + template: null + type: boolean + params: + input_text: '' + max_length: '' + 
parent_mode: '' + remove_extra_spaces: '' + remove_urls_emails: '' + separator: '' + subchunk_max_length: '' + subchunk_separator: '' + provider_id: langgenius/parent_child_chunk/parent_child_chunk + provider_name: langgenius/parent_child_chunk/parent_child_chunk + provider_type: builtin + selected: true + title: 父子分块处理器 + tool_configurations: {} + tool_description: 将文档处理为父子分块结构 + tool_label: 父子分块处理器 + tool_name: parent_child_chunk + tool_parameters: + input_text: + type: mixed + value: '{{#1752565435219.output#}}' + max_length: + type: variable + value: + - rag + - shared + - max_chunk_length + parent_mode: + type: variable + value: + - rag + - shared + - parent_mode + remove_extra_spaces: + type: mixed + value: '{{#rag.shared.replace_consecutive_spaces#}}' + remove_urls_emails: + type: mixed + value: '{{#rag.shared.delete_urls_email#}}' + separator: + type: mixed + value: '{{#rag.shared.delimiter#}}' + subchunk_max_length: + type: variable + value: + - rag + - shared + - child_max_chunk_length + subchunk_separator: + type: mixed + value: '{{#rag.shared.child_delimiter#}}' + type: tool + height: 52 + id: '1752490343805' + position: + x: 1853.5260563244174 + y: 281.3910724383104 + positionAbsolute: + x: 1853.5260563244174 + y: 281.3910724383104 + selected: true + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + datasource_configurations: {} + datasource_label: Jina Reader + datasource_name: jina_reader + datasource_parameters: + crawl_sub_pages: + type: mixed + value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}' + limit: + type: variable + value: + - rag + - '1752491761974' + - jina_limit + url: + type: mixed + value: '{{#rag.1752491761974.jina_url#}}' + use_sitemap: + type: mixed + value: '{{#rag.1752491761974.jina_use_sitemap#}}' + plugin_id: langgenius/jina_datasource + provider_name: jina + provider_type: website_crawl + selected: false + title: Jina Reader + type: datasource + height: 52 + id: '1752491761974' + position: + 
x: 1067.7526055798794 + y: 281.3910724383104 + positionAbsolute: + x: 1067.7526055798794 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + datasource_configurations: {} + datasource_label: Firecrawl + datasource_name: crawl + datasource_parameters: + crawl_subpages: + type: mixed + value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}' + exclude_paths: + type: mixed + value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}' + include_paths: + type: mixed + value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}' + limit: + type: variable + value: + - rag + - '1752565402678' + - firecrawl_limit + max_depth: + type: variable + value: + - rag + - '1752565402678' + - firecrawl_max_depth + only_main_content: + type: mixed + value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}' + url: + type: mixed + value: '{{#rag.1752565402678.firecrawl_url#}}' + plugin_id: langgenius/firecrawl_datasource + provider_name: firecrawl + provider_type: website_crawl + selected: false + title: Firecrawl + type: datasource + height: 52 + id: '1752565402678' + position: + x: 1067.7526055798794 + y: 417.32608398342404 + positionAbsolute: + x: 1067.7526055798794 + y: 417.32608398342404 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + - data: + output_type: string + selected: false + title: 变量聚合器 + type: variable-aggregator + variables: + - - '1752491761974' + - content + - - '1752565402678' + - content + height: 129 + id: '1752565435219' + position: + x: 1505.4306671642219 + y: 281.3910724383104 + positionAbsolute: + x: 1505.4306671642219 + y: 281.3910724383104 + selected: false + sourcePosition: right + targetPosition: left + type: custom + width: 242 + viewport: + x: -826.1791044466438 + y: -71.91725474841303 + zoom: 0.9980166672552107 + rag_pipeline_variables: + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null 
+ belong_to_node_id: '1752491761974' + default_value: null + label: URL + max_length: 256 + options: [] + placeholder: https://docs.dify.ai/en/ + required: true + tooltips: null + type: text-input + unit: null + variable: jina_url + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752491761974' + default_value: 10 + label: Limit + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: null + variable: jina_limit + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752491761974' + default_value: null + label: Crawl sub-pages + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: jina_crawl_sub_pages + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752491761974' + default_value: null + label: Use sitemap + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl + iteratively based on page relevance, yielding fewer but higher-quality pages. 
+ type: checkbox + unit: null + variable: jina_use_sitemap + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: URL + max_length: 256 + options: [] + placeholder: https://docs.dify.ai/en/ + required: true + tooltips: null + type: text-input + unit: null + variable: firecrawl_url + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: true + label: Crawl sub-pages + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: firecrawl_crawl_sub_pages + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: 10 + label: Limit + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: null + variable: firecrawl_limit + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: Max depth + max_length: 48 + options: [] + placeholder: '' + required: false + tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes + the page of the entered url, depth 1 scrapes the url and everything after enteredURL + + one /, and so on. 
+ type: number + unit: null + variable: firecrawl_max_depth + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: Exclude paths + max_length: 256 + options: [] + placeholder: blog/*, /about/* + required: false + tooltips: null + type: text-input + unit: null + variable: firecrawl_exclude_paths + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: Include only paths + max_length: 256 + options: [] + placeholder: articles/* + required: false + tooltips: null + type: text-input + unit: null + variable: firecrawl_include_only_paths + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: '1752565402678' + default_value: null + label: firecrawl_extract_main_content + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: firecrawl_extract_main_content + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: \n\n + label: delimiter + max_length: 100 + options: [] + placeholder: null + required: true + tooltips: A delimiter is the character used to separate text. \n\n is recommended + for splitting the original document into large parent chunks. You can also use + special delimiters defined by yourself. 
+ type: text-input + unit: null + variable: delimiter + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: 1024 + label: Maximum chunk length + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: characters + variable: max_chunk_length + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: \n + label: Child delimiter + max_length: 199 + options: [] + placeholder: null + required: true + tooltips: A delimiter is the character used to separate text. \n\n is recommended + for splitting the original document into large parent chunks. You can also use + special delimiters defined by yourself. + type: text-input + unit: null + variable: child_delimiter + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: 512 + label: Child max chunk length + max_length: 48 + options: [] + placeholder: null + required: true + tooltips: null + type: number + unit: characters + variable: child_max_chunk_length + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: paragraph + label: Parent mode + max_length: 48 + options: + - full_doc + - paragraph + placeholder: null + required: true + tooltips: null + type: select + unit: null + variable: parent_mode + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: shared + default_value: null + label: Replace consecutive spaces, newlines and tabs + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: replace_consecutive_spaces + - allow_file_extension: null + allow_file_upload_methods: null + allowed_file_types: null + belong_to_node_id: 
shared + default_value: null + label: Delete all URLs and email addresses + max_length: 48 + options: [] + placeholder: null + required: false + tooltips: null + type: checkbox + unit: null + variable: delete_urls_email