r2 transform

2025-07-31 05:18:40 +00:00 · 2025-07-16 01:50:37 +08:00 · 2025-07-16 01:50:37 +08:00 · 2012ea3213
commit 2012ea3213
parent 1ad73ccdc8
23 changed files with 5867 additions and 5 deletions
--- a/api/controllers/console/datasets/rag_pipeline/rag_pipeline_workflow.py
+++ b/api/controllers/console/datasets/rag_pipeline/rag_pipeline_workflow.py
@ -46,6 +46,7 @@ from services.errors.llm import InvokeRateLimitError
 from services.rag_pipeline.pipeline_generate_service import PipelineGenerateService
 from services.rag_pipeline.rag_pipeline import RagPipelineService
 from services.rag_pipeline.rag_pipeline_manage_service import RagPipelineManageService
+from services.rag_pipeline.rag_pipeline_transform_service import RagPipelineTransformService

 logger = logging.getLogger(__name__)

@ -946,6 +947,16 @@ class RagPipelineWorkflowLastRunApi(Resource):
        if node_exec is None:
            raise NotFound("last run not found")
        return node_exec
+    
+class RagPipelineTransformApi(Resource):
+    """Console endpoint that hands a dataset to RagPipelineTransformService."""
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def post(self, dataset_id):
+        """Transform the dataset identified by the URL and report success.
+
+        The route declares ``dataset_id`` with the ``uuid`` converter; it is
+        stringified before being passed to the service.
+        """
+        RagPipelineTransformService().transform_dataset(str(dataset_id))
+        return {"message": "success"}


 api.add_resource(
@ -1056,3 +1067,7 @@ api.add_resource(
    RagPipelineWorkflowLastRunApi,
    "/rag/pipelines/<uuid:pipeline_id>/workflows/draft/nodes/<string:node_id>/last-run",
 )
+api.add_resource(
+    RagPipelineTransformApi,
+    "/rag/pipelines/transform/datasets/<uuid:dataset_id>",
+)
--- a/api/services/entities/knowledge_entities/rag_pipeline_entities.py
+++ b/api/services/entities/knowledge_entities/rag_pipeline_entities.py
@ -87,8 +87,8 @@ class RetrievalSetting(BaseModel):
    top_k: int
    score_threshold: Optional[float] = 0.5
    score_threshold_enabled: bool = False
-    reranking_mode: str = "reranking_model"
-    reranking_enable: bool = True
+    reranking_mode: Optional[str] = "reranking_model"
+    reranking_enable: Optional[bool] = True
    reranking_model: Optional[RerankingModelConfig] = None
    weights: Optional[WeightedScoreConfig] = None

--- a/api/services/rag_pipeline/rag_pipeline.py
+++ b/api/services/rag_pipeline/rag_pipeline.py
@ -54,7 +54,7 @@ from core.workflow.workflow_entry import WorkflowEntry
 from extensions.ext_database import db
 from libs.infinite_scroll_pagination import InfiniteScrollPagination
 from models.account import Account
-from models.dataset import Document, Pipeline, PipelineCustomizedTemplate  # type: ignore
+from models.dataset import Dataset, Document, Pipeline, PipelineCustomizedTemplate  # type: ignore
 from models.enums import WorkflowRunTriggeredFrom
 from models.model import EndUser
 from models.workflow import (
@ -72,7 +72,7 @@ from services.entities.knowledge_entities.rag_pipeline_entities import (
 )
 from services.errors.app import WorkflowHashNotEqualError
 from services.rag_pipeline.pipeline_template.pipeline_template_factory import PipelineTemplateRetrievalFactory
-from services.workflow_draft_variable_service import DraftVarLoader, DraftVariableSaver
+from services.workflow_draft_variable_service import DraftVariableSaver, DraftVarLoader

 logger = logging.getLogger(__name__)

--- a/api/services/rag_pipeline/rag_pipeline_dsl_service.py
+++ b/api/services/rag_pipeline/rag_pipeline_dsl_service.py
@ -233,6 +233,7 @@ class RagPipelineDslService:
                    )
                dataset = pipeline.dataset
                if dataset:
+                    self._session.merge(dataset)
                    dataset_name = dataset.name

            # If major version mismatch, store import info in Redis
@ -291,7 +292,7 @@ class RagPipelineDslService:
                    if not dataset:
                        dataset = Dataset(
                            tenant_id=account.current_tenant_id,
-                            name=name,
+                            name=name + datetime.now(UTC).strftime("%Y%m%d%H%M%S%f"),
                            description=description,
                            icon_info={
                                "type": icon_type,
--- a/api/services/rag_pipeline/rag_pipeline_transform_service.py
+++ b/api/services/rag_pipeline/rag_pipeline_transform_service.py
@ -0,0 +1,210 @@
+import json
+from datetime import UTC, datetime
+from pathlib import Path
+from uuid import uuid4
+
+import yaml
+from flask_login import current_user
+
+from constants import DOCUMENT_EXTENSIONS
+from extensions.ext_database import db
+from factories import variable_factory
+from models.dataset import Dataset, Pipeline
+from models.workflow import Workflow, WorkflowType
+from services.entities.knowledge_entities.rag_pipeline_entities import KnowledgeConfiguration, RetrievalSetting
+
+
+class RagPipelineTransformService:
+
+
+    def transform_dataset(self, dataset_id: str):
+        """Attach a newly created RAG pipeline to an existing dataset.
+
+        The bundled template matching the dataset's doc form, data source type
+        and indexing technique is loaded, its local-file datasource and
+        knowledge-index nodes are patched, a pipeline is created from it, and
+        the dataset is switched to ``rag_pipeline`` runtime mode.
+
+        Returns ``None`` without changing anything when the dataset already
+        has a pipeline in ``rag_pipeline`` mode, when it has neither a data
+        source type nor an indexing technique, or when it has no doc form.
+
+        :param dataset_id: ID of the dataset to transform.
+        :raises ValueError: if the dataset does not exist, is not a ``vendor``
+            dataset, has an unsupported doc form, or the template has no
+            workflow section.
+        """
+        # NOTE(review): the lookup is by ID only and is not scoped to the
+        # caller's tenant - confirm that access is checked elsewhere.
+        dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
+        if not dataset:
+            raise ValueError("Dataset not found")
+        if dataset.pipeline_id and dataset.runtime_mode == "rag_pipeline":
+            return
+        if dataset.provider != "vendor":
+            raise ValueError("External dataset is not supported")
+        datasource_type = dataset.data_source_type
+        indexing_technique = dataset.indexing_technique
+
+        if not datasource_type and not indexing_technique:
+            return
+        doc_form = dataset.doc_form
+        if not doc_form:
+            return
+        # Validate before anything is added to the session, so an unsupported
+        # doc form can never leave a half-created pipeline pending.
+        if doc_form not in ("text_model", "hierarchical_model"):
+            raise ValueError("Unsupported doc form")
+        retrieval_model = dataset.retrieval_model
+        pipeline_yaml = self._get_transform_yaml(doc_form, datasource_type, indexing_technique)
+
+        workflow_data = pipeline_yaml.get("workflow")
+        if not isinstance(workflow_data, dict):
+            raise ValueError("Missing workflow data for rag pipeline")
+        graph = workflow_data.get("graph", {})
+        nodes = graph.get("nodes", [])
+        new_nodes = []
+
+        for node in nodes:
+            node_data = node.get("data", {})
+            if node_data.get("type") == "datasource" and node_data.get("provider_type") == "local_file":
+                node = self._deal_file_extensions(node)
+            if node.get("data", {}).get("type") == "knowledge-index":
+                node = self._deal_knowledge_index(dataset, doc_form, indexing_technique, retrieval_model, node)
+            new_nodes.append(node)
+        if new_nodes:
+            # Re-attach every level: "graph" may have been a default dict that
+            # is not yet referenced by the template.
+            graph["nodes"] = new_nodes
+            workflow_data["graph"] = graph
+            pipeline_yaml["workflow"] = workflow_data
+
+        pipeline = self._create_pipeline(pipeline_yaml)
+
+        # The chunk structure uses the same identifiers as the doc form.
+        dataset.chunk_structure = doc_form
+        dataset.runtime_mode = "rag_pipeline"
+        dataset.pipeline_id = pipeline.id
+
+        db.session.commit()
+
+    def _get_transform_yaml(self, doc_form: str, datasource_type: str, indexing_technique: str):
+        """Load the bundled pipeline template for a dataset configuration.
+
+        Templates live in the ``transform`` directory next to this module and
+        are named ``<datasource>-<variant>.yml``. ``hierarchical_model`` always
+        uses the ``parent-child`` variant; ``text_model`` picks the variant
+        from the indexing technique.
+
+        :param doc_form: ``"text_model"`` or ``"hierarchical_model"``.
+        :param datasource_type: ``"upload_file"``, ``"notion_import"`` or
+            ``"website_crawl"``.
+        :param indexing_technique: ``"high_quality"`` or ``"economy"``; only
+            consulted for ``text_model``.
+        :return: the parsed YAML document.
+        :raises ValueError: if the doc form, datasource type or (for
+            ``text_model``) indexing technique is not supported.
+        """
+        datasource_prefixes = {
+            "upload_file": "file",
+            "notion_import": "notion",
+            "website_crawl": "website-crawl",
+        }
+        general_variants = {
+            "high_quality": "general-high-quality",
+            "economy": "general-economy",
+        }
+
+        if doc_form not in ("text_model", "hierarchical_model"):
+            raise ValueError("Unsupported doc form")
+        prefix = datasource_prefixes.get(datasource_type)
+        if prefix is None:
+            raise ValueError("Unsupported datasource type")
+        if doc_form == "hierarchical_model":
+            variant = "parent-child"
+        else:
+            variant = general_variants.get(indexing_technique)
+            if variant is None:
+                # Previously this fell through and failed with UnboundLocalError.
+                raise ValueError("Unsupported indexing technique")
+
+        template_path = Path(__file__).parent / "transform" / f"{prefix}-{variant}.yml"
+        # The templates contain non-ASCII text, so do not rely on the platform
+        # default encoding.
+        with open(template_path, encoding="utf-8") as f:
+            return yaml.safe_load(f)
+
+    def _deal_file_extensions(self, node: dict):
+        """Replace a datasource node's ``fileExtensions`` with DOCUMENT_EXTENSIONS.
+
+        Nodes without a non-empty ``fileExtensions`` list are returned
+        untouched.
+
+        :param node: graph node; modified in place.
+        :return: the same ``node`` object.
+        """
+        if not node.get("data", {}).get("fileExtensions", []):
+            return node
+        # Copy so later edits to the graph cannot mutate the shared constant.
+        node["data"]["fileExtensions"] = list(DOCUMENT_EXTENSIONS)
+        return node
+
+    def _deal_knowledge_index(
+        self, dataset: Dataset, doc_form: str, indexing_technique: str, retrieval_model: dict, node: dict
+    ):
+        """Apply the dataset's embedding and retrieval settings to a knowledge-index node.
+
+        The node's ``data`` is validated through ``KnowledgeConfiguration``,
+        overridden with the dataset's settings and merged back into the node,
+        so the node keeps its ``id``, layout fields and any ``data`` keys that
+        the model dump does not contain.
+
+        :param dataset: source of the embedding model settings.
+        :param doc_form: unused here; kept for the existing call signature.
+        :param indexing_technique: ``"high_quality"`` copies the dataset's
+            embedding model; ``"economy"`` forces ``keyword_search``.
+        :param retrieval_model: the dataset's stored retrieval settings; when
+            empty or ``None`` the template's retrieval model is kept.
+        :param node: the knowledge-index graph node; modified in place.
+        :return: the same ``node`` object with updated ``data``.
+        """
+        node_data = node.get("data", {})
+        knowledge_configuration = KnowledgeConfiguration(**node_data)
+
+        if indexing_technique == "high_quality":
+            knowledge_configuration.embedding_model = dataset.embedding_model
+            knowledge_configuration.embedding_model_provider = dataset.embedding_model_provider
+        if retrieval_model:
+            retrieval_setting = RetrievalSetting(**retrieval_model)
+            if indexing_technique == "economy":
+                retrieval_setting.search_method = "keyword_search"
+            knowledge_configuration.retrieval_model = retrieval_setting
+
+        # Returning the bare model dump would replace the whole node and drop
+        # its id, so edges targeting it would dangle; merge into "data" instead.
+        node_data.update(knowledge_configuration.model_dump())
+        node["data"] = node_data
+        return node
+
+    def _create_pipeline(
+        self,
+        data: dict,
+    ) -> Pipeline:
+        """Create a published pipeline plus its draft and published workflows.
+
+        :param data: parsed pipeline template; ``workflow`` must be a
+            non-empty dict, ``rag_pipeline`` supplies the name and description.
+        :return: the new pipeline, added to the session and flushed but not
+            committed.
+        :raises ValueError: if ``workflow`` is missing, empty or not a dict.
+        """
+        pipeline_meta = data.get("rag_pipeline", {})
+        workflow_section = data.get("workflow")
+        if not (workflow_section and isinstance(workflow_section, dict)):
+            raise ValueError("Missing workflow data for rag pipeline")
+
+        env_vars = [
+            variable_factory.build_environment_variable_from_mapping(item)
+            for item in workflow_section.get("environment_variables", [])
+        ]
+        conv_vars = [
+            variable_factory.build_conversation_variable_from_mapping(item)
+            for item in workflow_section.get("conversation_variables", [])
+        ]
+        rag_vars = workflow_section.get("rag_pipeline_variables", [])
+        graph_json = json.dumps(workflow_section.get("graph", {}))
+
+        # The pipeline row is flushed first so that both workflows can refer
+        # to it through app_id.
+        pipeline = Pipeline()
+        pipeline.id = str(uuid4())
+        pipeline.tenant_id = current_user.current_tenant_id
+        pipeline.name = pipeline_meta.get("name", "")
+        pipeline.description = pipeline_meta.get("description", "")
+        pipeline.created_by = current_user.id
+        pipeline.updated_by = current_user.id
+        pipeline.is_published = True
+        pipeline.is_public = True
+
+        db.session.add(pipeline)
+        db.session.flush()
+
+        def build_workflow(version: str) -> Workflow:
+            # Draft and published workflows differ only in their version label.
+            return Workflow(
+                tenant_id=pipeline.tenant_id,
+                app_id=pipeline.id,
+                features="{}",
+                type=WorkflowType.RAG_PIPELINE.value,
+                version=version,
+                graph=graph_json,
+                created_by=current_user.id,
+                environment_variables=env_vars,
+                conversation_variables=conv_vars,
+                rag_pipeline_variables=rag_vars,
+            )
+
+        draft_workflow = build_workflow("draft")
+        # The published version label is the current naive UTC timestamp.
+        published_workflow = build_workflow(str(datetime.now(UTC).replace(tzinfo=None)))
+        db.session.add(draft_workflow)
+        db.session.add(published_workflow)
+        # Flush before reading published_workflow.id for the pipeline link.
+        db.session.flush()
+
+        pipeline.workflow_id = published_workflow.id
+        db.session.add(pipeline)
+        return pipeline
--- a/api/services/rag_pipeline/transform/file-general-economy.yml
+++ b/api/services/rag_pipeline/transform/file-general-economy.yml
@ -0,0 +1,708 @@
+dependencies:
+- current_identifier: null
+  type: package
+  value:
+    plugin_unique_identifier: langgenius/general_chunk:0.0.2@0856fa64f7b0dc937c982f12d45b3a1ad91ba8aacc0d28a1b436e6c94a77e298
+- current_identifier: null
+  type: package
+  value:
+    plugin_unique_identifier: langgenius/dify_extractor:0.0.4@0cb3f06230a377c4c037fa7b5e21f4f4e362e5f24a59ed7bf4950ff75e6f1e61
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: file-general-economy
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: if-else
+      id: 1752479895761-source-1752481129417-target
+      source: '1752479895761'
+      sourceHandle: source
+      target: '1752481129417'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: tool
+      id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
+      source: '1752481129417'
+      sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+      target: '1752480460682'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: document-extractor
+      id: 1752481129417-false-1752481112180-target
+      source: '1752481129417'
+      sourceHandle: 'false'
+      target: '1752481112180'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: variable-aggregator
+      id: 1752480460682-source-1752482022496-target
+      source: '1752480460682'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: document-extractor
+        targetType: variable-aggregator
+      id: 1752481112180-source-1752482022496-target
+      source: '1752481112180'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752482022496-source-1752482151668-target
+      source: '1752482022496'
+      sourceHandle: source
+      target: '1752482151668'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752482151668-source-1752477924228-target
+      source: '1752482151668'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752482151668'
+        - result
+        indexing_technique: economy
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: keyword_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: true
+        title: 知识库
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1076.4656678451215
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1076.4656678451215
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: File
+        datasource_name: upload-file
+        datasource_parameters: {}
+        fileExtensions:
+        - txt
+        - markdown
+        - mdx
+        - pdf
+        - html
+        - xlsx
+        - xls
+        - vtt
+        - properties
+        - doc
+        - docx
+        - csv
+        - eml
+        - msg
+        - pptx
+        - xml
+        - epub
+        - ppt
+        - md
+        plugin_id: langgenius/file
+        provider_name: file
+        provider_type: local_file
+        selected: false
+        title: File
+        type: datasource
+      height: 52
+      id: '1752479895761'
+      position:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            documents:
+              description: the documents extracted from the file
+              items:
+                type: object
+              type: array
+            images:
+              description: The images extracted from the file
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg,
+              jpeg)
+            ja_JP: the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg,
+              jpeg)
+            pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
+              jpg, jpeg)
+            zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
+          label:
+            en_US: file
+            ja_JP: file
+            pt_BR: file
+            zh_Hans: file
+          llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
+            png, jpg, jpeg)
+          max: null
+          min: null
+          name: file
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: file
+        params:
+          file: ''
+        provider_id: langgenius/dify_extractor/dify_extractor
+        provider_name: langgenius/dify_extractor/dify_extractor
+        provider_type: builtin
+        selected: false
+        title: Dify文本提取器
+        tool_configurations: {}
+        tool_description: Dify Extractor
+        tool_label: Dify文本提取器
+        tool_name: dify_extractor
+        tool_parameters:
+          file:
+            type: variable
+            value:
+            - '1752479895761'
+            - file
+        type: tool
+      height: 52
+      id: '1752480460682'
+      position:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_array_file: false
+        selected: false
+        title: 文档提取器
+        type: document-extractor
+        variable_selector:
+        - '1752479895761'
+        - file
+      height: 90
+      id: '1752481112180'
+      position:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        cases:
+        - case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          conditions:
+          - comparison_operator: is
+            id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
+            value: .xlsx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
+            value: .xls
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
+            value: .md
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
+            value: .markdown
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
+            value: .mdx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
+            value: .html
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
+            value: .htm
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
+            value: .docx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
+            value: .csv
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
+            value: .txt
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          logical_operator: or
+        selected: false
+        title: 条件分支
+        type: if-else
+      height: 358
+      id: '1752481129417'
+      position:
+        x: -489.57009543377865
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -489.57009543377865
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        advanced_settings:
+          group_enabled: false
+          groups:
+          - groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
+            group_name: Group1
+            output_type: string
+            variables:
+            - - '1752481112180'
+              - text
+            - - '1752480460682'
+              - text
+        output_type: string
+        selected: false
+        title: 变量聚合器
+        type: variable-aggregator
+        variables:
+        - - '1752481112180'
+          - text
+        - - '1752480460682'
+          - text
+      height: 129
+      id: '1752482022496'
+      position:
+        x: 319.441649575055
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 319.441649575055
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: The text you want to chunk.
+            pt_BR: The text you want to chunk.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: Input Variable
+            pt_BR: Input Variable
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: The delimiter of the chunks.
+            pt_BR: The delimiter of the chunks.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: Delimiter
+            pt_BR: Delimiter
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: The maximum chunk length.
+            pt_BR: The maximum chunk length.
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: Maximum Chunk Length
+            pt_BR: Maximum Chunk Length
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: The chunk overlap length.
+            pt_BR: The chunk overlap length.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: Chunk Overlap Length
+            pt_BR: Chunk Overlap Length
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: Replace consecutive spaces, newlines and tabs
+            pt_BR: Replace consecutive spaces, newlines and tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: Replace Consecutive Spaces, Newlines and Tabs
+            pt_BR: Replace Consecutive Spaces, Newlines and Tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: Delete all URLs and email addresses
+            pt_BR: Delete all URLs and email addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: Delete All URLs and Email Addresses
+            pt_BR: Delete All URLs and Email Addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunk/general_chunk
+        provider_name: langgenius/general_chunk/general_chunk
+        provider_type: builtin
+        selected: false
+        title: 通用文本分块
+        tool_configurations: {}
+        tool_description: 一个用于通用文本分块模式的工具，检索和召回的块是相同的。
+        tool_label: 通用文本分块
+        tool_name: general_chunk
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752482022496.output#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752482151668'
+      position:
+        x: 693.5300771507484
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 693.5300771507484
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: 701.4999626224237
+      y: 128.33739021504016
+      zoom: 0.48941689643726966
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: number
+    unit: null
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/file-general-high-quality.yml
+++ b/api/services/rag_pipeline/transform/file-general-high-quality.yml
@ -0,0 +1,708 @@
+dependencies:
+- current_identifier: null
+  type: package
+  value:
+    plugin_unique_identifier: langgenius/general_chunk:0.0.2@0856fa64f7b0dc937c982f12d45b3a1ad91ba8aacc0d28a1b436e6c94a77e298
+- current_identifier: null
+  type: package
+  value:
+    plugin_unique_identifier: langgenius/dify_extractor:0.0.4@0cb3f06230a377c4c037fa7b5e21f4f4e362e5f24a59ed7bf4950ff75e6f1e61
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: '#FFF4ED'
+  icon_type: emoji
+  name: file-general-high-quality
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: if-else
+      id: 1752479895761-source-1752481129417-target
+      source: '1752479895761'
+      sourceHandle: source
+      target: '1752481129417'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: tool
+      id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
+      source: '1752481129417'
+      sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+      target: '1752480460682'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: document-extractor
+      id: 1752481129417-false-1752481112180-target
+      source: '1752481129417'
+      sourceHandle: 'false'
+      target: '1752481112180'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: variable-aggregator
+      id: 1752480460682-source-1752482022496-target
+      source: '1752480460682'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: document-extractor
+        targetType: variable-aggregator
+      id: 1752481112180-source-1752482022496-target
+      source: '1752481112180'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752482022496-source-1752482151668-target
+      source: '1752482022496'
+      sourceHandle: source
+      target: '1752482151668'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752482151668-source-1752477924228-target
+      source: '1752482151668'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752482151668'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: 知识库
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1076.4656678451215
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1076.4656678451215
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: File
+        datasource_name: upload-file
+        datasource_parameters: {}
+        fileExtensions:
+        - txt
+        - markdown
+        - mdx
+        - pdf
+        - html
+        - xlsx
+        - xls
+        - vtt
+        - properties
+        - doc
+        - docx
+        - csv
+        - eml
+        - msg
+        - pptx
+        - xml
+        - epub
+        - ppt
+        - md
+        plugin_id: langgenius/file
+        provider_name: file
+        provider_type: local_file
+        selected: false
+        title: File
+        type: datasource
+      height: 52
+      id: '1752479895761'
+      position:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            documents:
+              description: the documents extracted from the file
+              items:
+                type: object
+              type: array
+            images:
+              description: The images extracted from the file
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: the file to be parsed (supports pdf, ppt, pptx, doc, docx, png, jpg,
+              jpeg)
+            ja_JP: the file to be parsed (supports pdf, ppt, pptx, doc, docx, png, jpg,
+              jpeg)
+            pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
+              jpg, jpeg)
+            zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
+          label:
+            en_US: file
+            ja_JP: file
+            pt_BR: file
+            zh_Hans: file
+          llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
+            png, jpg, jpeg)
+          max: null
+          min: null
+          name: file
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: file
+        params:
+          file: ''
+        provider_id: langgenius/dify_extractor/dify_extractor
+        provider_name: langgenius/dify_extractor/dify_extractor
+        provider_type: builtin
+        selected: false
+        title: Dify文本提取器
+        tool_configurations: {}
+        tool_description: Dify Extractor
+        tool_label: Dify文本提取器
+        tool_name: dify_extractor
+        tool_parameters:
+          file:
+            type: variable
+            value:
+            - '1752479895761'
+            - file
+        type: tool
+      height: 52
+      id: '1752480460682'
+      position:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_array_file: false
+        selected: false
+        title: 文档提取器
+        type: document-extractor
+        variable_selector:
+        - '1752479895761'
+        - file
+      height: 90
+      id: '1752481112180'
+      position:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        cases:
+        - case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          conditions:
+          - comparison_operator: is
+            id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
+            value: .xlsx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
+            value: .xls
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
+            value: .md
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
+            value: .markdown
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
+            value: .mdx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
+            value: .html
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
+            value: .htm
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
+            value: .docx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
+            value: .csv
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
+            value: .txt
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          logical_operator: or
+        selected: false
+        title: 条件分支
+        type: if-else
+      height: 358
+      id: '1752481129417'
+      position:
+        x: -489.57009543377865
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -489.57009543377865
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        advanced_settings:
+          group_enabled: false
+          groups:
+          - groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
+            group_name: Group1
+            output_type: string
+            variables:
+            - - '1752481112180'
+              - text
+            - - '1752480460682'
+              - text
+        output_type: string
+        selected: false
+        title: 变量聚合器
+        type: variable-aggregator
+        variables:
+        - - '1752481112180'
+          - text
+        - - '1752480460682'
+          - text
+      height: 129
+      id: '1752482022496'
+      position:
+        x: 319.441649575055
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 319.441649575055
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: The text you want to chunk.
+            pt_BR: The text you want to chunk.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: Input Variable
+            pt_BR: Input Variable
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: The delimiter of the chunks.
+            pt_BR: The delimiter of the chunks.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: Delimiter
+            pt_BR: Delimiter
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: The maximum chunk length.
+            pt_BR: The maximum chunk length.
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: Maximum Chunk Length
+            pt_BR: Maximum Chunk Length
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: The chunk overlap length.
+            pt_BR: The chunk overlap length.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: Chunk Overlap Length
+            pt_BR: Chunk Overlap Length
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: Replace consecutive spaces, newlines and tabs
+            pt_BR: Replace consecutive spaces, newlines and tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: Replace Consecutive Spaces, Newlines and Tabs
+            pt_BR: Replace Consecutive Spaces, Newlines and Tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: Delete all URLs and email addresses
+            pt_BR: Delete all URLs and email addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: Delete All URLs and Email Addresses
+            pt_BR: Delete All URLs and Email Addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunk/general_chunk
+        provider_name: langgenius/general_chunk/general_chunk
+        provider_type: builtin
+        selected: false
+        title: 通用文本分块
+        tool_configurations: {}
+        tool_description: 一个用于通用文本分块模式的工具，检索和召回的块是相同的。
+        tool_label: 通用文本分块
+        tool_name: general_chunk
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752482022496.output#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752482151668'
+      position:
+        x: 693.5300771507484
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 693.5300771507484
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: 701.4999626224237
+      y: 128.33739021504016
+      zoom: 0.48941689643726966
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: number
+    unit: null
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/file-parentchild.yml
+++ b/api/services/rag_pipeline/transform/file-parentchild.yml
@ -0,0 +1,816 @@
+dependencies:
+- current_identifier: null
+  type: package
+  value:
+    plugin_unique_identifier: langgenius/parent_child_chunk:0.0.1@f8f9ba1f3bcda159ebc0168baa755c2181b923da8157ebb439b8046019f5b510
+- current_identifier: null
+  type: package
+  value:
+    plugin_unique_identifier: langgenius/dify_extractor:0.0.4@0cb3f06230a377c4c037fa7b5e21f4f4e362e5f24a59ed7bf4950ff75e6f1e61
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: '#FFF4ED'
+  icon_type: emoji
+  name: file-parentchild
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: if-else
+      id: 1752479895761-source-1752481129417-target
+      source: '1752479895761'
+      sourceHandle: source
+      target: '1752481129417'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: tool
+      id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
+      source: '1752481129417'
+      sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+      target: '1752480460682'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: document-extractor
+      id: 1752481129417-false-1752481112180-target
+      source: '1752481129417'
+      sourceHandle: 'false'
+      target: '1752481112180'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: variable-aggregator
+      id: 1752480460682-source-1752482022496-target
+      source: '1752480460682'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: document-extractor
+        targetType: variable-aggregator
+      id: 1752481112180-source-1752482022496-target
+      source: '1752481112180'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752482022496-source-1752575473519-target
+      source: '1752482022496'
+      sourceHandle: source
+      target: '1752575473519'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752575473519-source-1752477924228-target
+      source: '1752575473519'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: hierarchical_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752575473519'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: 知识库
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 994.3774545394483
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 994.3774545394483
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: File
+        datasource_name: upload-file
+        datasource_parameters: {}
+        fileExtensions:
+        - txt
+        - markdown
+        - mdx
+        - pdf
+        - html
+        - xlsx
+        - xls
+        - vtt
+        - properties
+        - doc
+        - docx
+        - csv
+        - eml
+        - msg
+        - pptx
+        - xml
+        - epub
+        - ppt
+        - md
+        plugin_id: langgenius/file
+        provider_name: file
+        provider_type: local_file
+        selected: false
+        title: File
+        type: datasource
+      height: 52
+      id: '1752479895761'
+      position:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            documents:
+              description: the documents extracted from the file
+              items:
+                type: object
+              type: array
+            images:
+              description: The images extracted from the file
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: the file to be parsed (supports pdf, ppt, pptx, doc, docx, png, jpg,
+              jpeg)
+            ja_JP: the file to be parsed (supports pdf, ppt, pptx, doc, docx, png, jpg,
+              jpeg)
+            pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
+              jpg, jpeg)
+            zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
+          label:
+            en_US: file
+            ja_JP: file
+            pt_BR: file
+            zh_Hans: file
+          llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
+            png, jpg, jpeg)
+          max: null
+          min: null
+          name: file
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: file
+        params:
+          file: ''
+        provider_id: langgenius/dify_extractor/dify_extractor
+        provider_name: langgenius/dify_extractor/dify_extractor
+        provider_type: builtin
+        selected: false
+        title: Dify文本提取器
+        tool_configurations: {}
+        tool_description: Dify Extractor
+        tool_label: Dify文本提取器
+        tool_name: dify_extractor
+        tool_parameters:
+          file:
+            type: variable
+            value:
+            - '1752479895761'
+            - file
+        type: tool
+      height: 52
+      id: '1752480460682'
+      position:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_array_file: false
+        selected: false
+        title: 文档提取器
+        type: document-extractor
+        variable_selector:
+        - '1752479895761'
+        - file
+      height: 90
+      id: '1752481112180'
+      position:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        cases:
+        - case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          conditions:
+          - comparison_operator: is
+            id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
+            value: .xlsx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
+            value: .xls
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
+            value: .md
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
+            value: .markdown
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
+            value: .mdx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
+            value: .html
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
+            value: .htm
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
+            value: .docx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
+            value: .csv
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
+            value: .txt
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          logical_operator: or
+        selected: false
+        title: 条件分支
+        type: if-else
+      height: 358
+      id: '1752481129417'
+      position:
+        x: -512.2335487893622
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -512.2335487893622
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        advanced_settings:
+          group_enabled: false
+          groups:
+          - groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
+            group_name: Group1
+            output_type: string
+            variables:
+            - - '1752481112180'
+              - text
+            - - '1752480460682'
+              - text
+        output_type: string
+        selected: false
+        title: 变量聚合器
+        type: variable-aggregator
+        variables:
+        - - '1752481112180'
+          - text
+        - - '1752480460682'
+          - text
+      height: 129
+      id: '1752482022496'
+      position:
+        x: 319.441649575055
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 319.441649575055
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: Parent child chunks result
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: The text you want to chunk.
+            pt_BR: The text you want to chunk.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input text
+            ja_JP: Input text
+            pt_BR: Input text
+            zh_Hans: 输入文本
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_text
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 1024
+          form: llm
+          human_description:
+            en_US: Maximum length for chunking
+            ja_JP: Maximum length for chunking
+            pt_BR: Comprimento máximo para divisão
+            zh_Hans: 用于分块的最大长度
+          label:
+            en_US: Maximum Length
+            ja_JP: Maximum Length
+            pt_BR: Comprimento Máximo
+            zh_Hans: 最大长度
+          llm_description: Maximum length allowed per chunk
+          max: null
+          min: null
+          name: max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '
+
+
+            '
+          form: llm
+          human_description:
+            en_US: Separator used for chunking
+            ja_JP: Separator used for chunking
+            pt_BR: Separador usado para divisão
+            zh_Hans: 用于分块的分隔符
+          label:
+            en_US: Chunk Separator
+            ja_JP: Chunk Separator
+            pt_BR: Separador de Divisão
+            zh_Hans: 分块分隔符
+          llm_description: The separator used to split chunks
+          max: null
+          min: null
+          name: separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 512
+          form: llm
+          human_description:
+            en_US: Maximum length for subchunking
+            ja_JP: Maximum length for subchunking
+            pt_BR: Comprimento máximo para subdivisão
+            zh_Hans: 用于子分块的最大长度
+          label:
+            en_US: Subchunk Maximum Length
+            ja_JP: Subchunk Maximum Length
+            pt_BR: Comprimento Máximo de Subdivisão
+            zh_Hans: 子分块最大长度
+          llm_description: Maximum length allowed per subchunk
+          max: null
+          min: null
+          name: subchunk_max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '. '
+          form: llm
+          human_description:
+            en_US: Separator used for subchunking
+            ja_JP: Separator used for subchunking
+            pt_BR: Separador usado para subdivisão
+            zh_Hans: 用于子分块的分隔符
+          label:
+            en_US: Subchunk Separator
+            ja_JP: Subchunk Separator
+            pt_BR: Separador de Subdivisão
+            zh_Hans: 子分块分隔符
+          llm_description: The separator used to split subchunks
+          max: null
+          min: null
+          name: subchunk_separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: paragraph
+          form: llm
+          human_description:
+            en_US: Split text into paragraphs based on separator and maximum chunk
+              length, using split text as parent block or entire document as parent
+              block and directly retrieve.
+            ja_JP: Split text into paragraphs based on separator and maximum chunk
+              length, using split text as parent block or entire document as parent
+              block and directly retrieve.
+            pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
+              máximo do bloco, usando o texto dividido como bloco pai ou documento
+              completo como bloco pai e diretamente recuperá-lo.
+            zh_Hans: 根据分隔符和最大块长度将文本拆分为段落，使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
+          label:
+            en_US: Parent Mode
+            ja_JP: Parent Mode
+            pt_BR: Modo Pai
+            zh_Hans: 父块模式
+          llm_description: Split text into paragraphs based on separator and maximum
+            chunk length, using split text as parent block or entire document as parent
+            block and directly retrieve.
+          max: null
+          min: null
+          name: parent_mode
+          options:
+          - icon: ''
+            label:
+              en_US: Paragraph
+              ja_JP: Paragraph
+              pt_BR: Parágrafo
+              zh_Hans: 段落
+            value: paragraph
+          - icon: ''
+            label:
+              en_US: Full Document
+              ja_JP: Full Document
+              pt_BR: Documento Completo
+              zh_Hans: 全文
+            value: full_doc
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: select
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove extra spaces in the text
+            ja_JP: Whether to remove extra spaces in the text
+            pt_BR: Se deve remover espaços extras no texto
+            zh_Hans: 是否移除文本中的多余空格
+          label:
+            en_US: Remove Extra Spaces
+            ja_JP: Remove Extra Spaces
+            pt_BR: Remover Espaços Extras
+            zh_Hans: 移除多余空格
+          llm_description: Whether to remove extra spaces in the text
+          max: null
+          min: null
+          name: remove_extra_spaces
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove URLs and emails in the text
+            ja_JP: Whether to remove URLs and emails in the text
+            pt_BR: Se deve remover URLs e e-mails no texto
+            zh_Hans: 是否移除文本中的URL和电子邮件地址
+          label:
+            en_US: Remove URLs and Emails
+            ja_JP: Remove URLs and Emails
+            pt_BR: Remover URLs e E-mails
+            zh_Hans: 移除URL和电子邮件地址
+          llm_description: Whether to remove URLs and emails in the text
+          max: null
+          min: null
+          name: remove_urls_emails
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          input_text: ''
+          max_length: ''
+          parent_mode: ''
+          remove_extra_spaces: ''
+          remove_urls_emails: ''
+          separator: ''
+          subchunk_max_length: ''
+          subchunk_separator: ''
+        provider_id: langgenius/parent_child_chunk/parent_child_chunk
+        provider_name: langgenius/parent_child_chunk/parent_child_chunk
+        provider_type: builtin
+        selected: false
+        title: 父子分块处理器
+        tool_configurations: {}
+        tool_description: 将文档处理为父子分块结构
+        tool_label: 父子分块处理器
+        tool_name: parent_child_chunk
+        tool_parameters:
+          input_text:
+            type: mixed
+            value: '{{#1752482022496.output#}}'
+          max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          parent_mode:
+            type: variable
+            value:
+            - rag
+            - shared
+            - parent_mode
+          remove_extra_spaces:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+          remove_urls_emails:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          separator:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          subchunk_max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - child_max_chunk_length
+          subchunk_separator:
+            type: mixed
+            value: '{{#rag.shared.child_delimiter#}}'
+        type: tool
+      height: 52
+      id: '1752575473519'
+      position:
+        x: 637.9241611063885
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 637.9241611063885
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: 948.6766333808323
+      y: -102.06757184183238
+      zoom: 0.8375774577380971
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 256
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n
+    label: Child delimiter
+    max_length: 256
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n is recommended
+      for splitting parent chunks into small child chunks. You can also use special
+      delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: child_delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 512
+    label: Child max chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: child_max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: paragraph
+    label: Parent mode
+    max_length: 48
+    options:
+    - full_doc
+    - paragraph
+    placeholder: null
+    required: true
+    tooltips: null
+    type: select
+    unit: null
+    variable: parent_mode
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/file_general_economy.json
+++ b/api/services/rag_pipeline/transform/file_general_economy.json
--- a/api/services/rag_pipeline/transform/file_general_high_quality.json
+++ b/api/services/rag_pipeline/transform/file_general_high_quality.json
--- a/api/services/rag_pipeline/transform/file_parent_child.json
+++ b/api/services/rag_pipeline/transform/file_parent_child.json
--- a/api/services/rag_pipeline/transform/notion-general-economy.yml
+++ b/api/services/rag_pipeline/transform/notion-general-economy.yml
@ -0,0 +1,394 @@
+dependencies:
+- current_identifier: null
+  type: package
+  value:
+    plugin_unique_identifier: langgenius/general_chunk:0.0.2@0856fa64f7b0dc937c982f12d45b3a1ad91ba8aacc0d28a1b436e6c94a77e298
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: notion-general-economy
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752482151668-source-1752477924228-target
+      source: '1752482151668'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: tool
+      id: 1752489759475-source-1752482151668-target
+      source: '1752489759475'
+      sourceHandle: source
+      target: '1752482151668'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752482151668'
+        - result
+        indexing_technique: economy
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: keyword_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: true
+        title: 知识库
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1444.5503479271906
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1444.5503479271906
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: The text you want to chunk.
+            pt_BR: The text you want to chunk.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: Input Variable
+            pt_BR: Input Variable
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: The delimiter of the chunks.
+            pt_BR: The delimiter of the chunks.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: Delimiter
+            pt_BR: Delimiter
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: The maximum chunk length.
+            pt_BR: The maximum chunk length.
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: Maximum Chunk Length
+            pt_BR: Maximum Chunk Length
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: The chunk overlap length.
+            pt_BR: The chunk overlap length.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: Chunk Overlap Length
+            pt_BR: Chunk Overlap Length
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: Replace consecutive spaces, newlines and tabs
+            pt_BR: Replace consecutive spaces, newlines and tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: Replace Consecutive Spaces, Newlines and Tabs
+            pt_BR: Replace Consecutive Spaces, Newlines and Tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: Delete all URLs and email addresses
+            pt_BR: Delete all URLs and email addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: Delete All URLs and Email Addresses
+            pt_BR: Delete All URLs and Email Addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunk/general_chunk
+        provider_name: langgenius/general_chunk/general_chunk
+        provider_type: builtin
+        selected: false
+        title: 通用文本分块
+        tool_configurations: {}
+        tool_description: 一个用于通用文本分块模式的工具，检索和召回的块是相同的。
+        tool_label: 通用文本分块
+        tool_name: general_chunk
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752489759475.content#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752482151668'
+      position:
+        x: 1063.6922916384628
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1063.6922916384628
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Notion数据源
+        datasource_name: notion_datasource
+        datasource_parameters: {}
+        plugin_id: langgenius/notion_datasource
+        provider_name: notion
+        provider_type: online_document
+        selected: false
+        title: Notion数据源
+        type: datasource
+      height: 52
+      id: '1752489759475'
+      position:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -838.569649323166
+      y: -168.94656489167426
+      zoom: 1.286925643857699
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: number
+    unit: null
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/notion-general-high-quality.yml
+++ b/api/services/rag_pipeline/transform/notion-general-high-quality.yml
@ -0,0 +1,394 @@
+dependencies:
+- current_identifier: null
+  type: package
+  value:
+    plugin_unique_identifier: langgenius/general_chunk:0.0.2@0856fa64f7b0dc937c982f12d45b3a1ad91ba8aacc0d28a1b436e6c94a77e298
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: '#FFF4ED'
+  icon_type: emoji
+  name: notion-general-high-quality
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752482151668-source-1752477924228-target
+      source: '1752482151668'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: tool
+      id: 1752489759475-source-1752482151668-target
+      source: '1752489759475'
+      sourceHandle: source
+      target: '1752482151668'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752482151668'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: true
+        title: 知识库
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1444.5503479271906
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1444.5503479271906
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: The text you want to chunk.
+            pt_BR: The text you want to chunk.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: Input Variable
+            pt_BR: Input Variable
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: The delimiter of the chunks.
+            pt_BR: The delimiter of the chunks.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: Delimiter
+            pt_BR: Delimiter
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: The maximum chunk length.
+            pt_BR: The maximum chunk length.
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: Maximum Chunk Length
+            pt_BR: Maximum Chunk Length
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: The chunk overlap length.
+            pt_BR: The chunk overlap length.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: Chunk Overlap Length
+            pt_BR: Chunk Overlap Length
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: Replace consecutive spaces, newlines and tabs
+            pt_BR: Replace consecutive spaces, newlines and tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: Replace Consecutive Spaces, Newlines and Tabs
+            pt_BR: Replace Consecutive Spaces, Newlines and Tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: Delete all URLs and email addresses
+            pt_BR: Delete all URLs and email addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: Delete All URLs and Email Addresses
+            pt_BR: Delete All URLs and Email Addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunk/general_chunk
+        provider_name: langgenius/general_chunk/general_chunk
+        provider_type: builtin
+        selected: false
+        title: 通用文本分块
+        tool_configurations: {}
+        tool_description: 一个用于通用文本分块模式的工具，检索和召回的块是相同的。
+        tool_label: 通用文本分块
+        tool_name: general_chunk
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752489759475.content#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752482151668'
+      position:
+        x: 1063.6922916384628
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1063.6922916384628
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Notion数据源
+        datasource_name: notion_datasource
+        datasource_parameters: {}
+        plugin_id: langgenius/notion_datasource
+        provider_name: notion
+        provider_type: online_document
+        selected: false
+        title: Notion数据源
+        type: datasource
+      height: 52
+      id: '1752489759475'
+      position:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -838.569649323166
+      y: -168.94656489167426
+      zoom: 1.286925643857699
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: number
+    unit: null
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/notion-parentchild.yml
+++ b/api/services/rag_pipeline/transform/notion-parentchild.yml
@ -0,0 +1,503 @@
+dependencies:
+- current_identifier: null
+  type: package
+  value:
+    plugin_unique_identifier: langgenius/parent_child_chunk:0.0.1@f8f9ba1f3bcda159ebc0168baa755c2181b923da8157ebb439b8046019f5b510
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: notion-parentchild
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: tool
+      id: 1752489759475-source-1752490343805-target
+      source: '1752489759475'
+      sourceHandle: source
+      target: '1752490343805'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752490343805-source-1752477924228-target
+      source: '1752490343805'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: hierarchical_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752490343805'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: 知识库
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1486.2052698032674
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1486.2052698032674
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Notion数据源
+        datasource_name: notion_datasource
+        datasource_parameters: {}
+        plugin_id: langgenius/notion_datasource
+        provider_name: notion
+        provider_type: online_document
+        selected: false
+        title: Notion数据源
+        type: datasource
+      height: 52
+      id: '1752489759475'
+      position:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: Parent child chunks result
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: The text you want to chunk.
+            pt_BR: The text you want to chunk.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input text
+            ja_JP: Input text
+            pt_BR: Input text
+            zh_Hans: 输入文本
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_text
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 1024
+          form: llm
+          human_description:
+            en_US: Maximum length for chunking
+            ja_JP: Maximum length for chunking
+            pt_BR: Comprimento máximo para divisão
+            zh_Hans: 用于分块的最大长度
+          label:
+            en_US: Maximum Length
+            ja_JP: Maximum Length
+            pt_BR: Comprimento Máximo
+            zh_Hans: 最大长度
+          llm_description: Maximum length allowed per chunk
+          max: null
+          min: null
+          name: max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '
+
+
+            '
+          form: llm
+          human_description:
+            en_US: Separator used for chunking
+            ja_JP: Separator used for chunking
+            pt_BR: Separador usado para divisão
+            zh_Hans: 用于分块的分隔符
+          label:
+            en_US: Chunk Separator
+            ja_JP: Chunk Separator
+            pt_BR: Separador de Divisão
+            zh_Hans: 分块分隔符
+          llm_description: The separator used to split chunks
+          max: null
+          min: null
+          name: separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 512
+          form: llm
+          human_description:
+            en_US: Maximum length for subchunking
+            ja_JP: Maximum length for subchunking
+            pt_BR: Comprimento máximo para subdivisão
+            zh_Hans: 用于子分块的最大长度
+          label:
+            en_US: Subchunk Maximum Length
+            ja_JP: Subchunk Maximum Length
+            pt_BR: Comprimento Máximo de Subdivisão
+            zh_Hans: 子分块最大长度
+          llm_description: Maximum length allowed per subchunk
+          max: null
+          min: null
+          name: subchunk_max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '. '
+          form: llm
+          human_description:
+            en_US: Separator used for subchunking
+            ja_JP: Separator used for subchunking
+            pt_BR: Separador usado para subdivisão
+            zh_Hans: 用于子分块的分隔符
+          label:
+            en_US: Subchunk Separator
+            ja_JP: Subchunk Separator
+            pt_BR: Separador de Subdivisão
+            zh_Hans: 子分块分隔符
+          llm_description: The separator used to split subchunks
+          max: null
+          min: null
+          name: subchunk_separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: paragraph
+          form: llm
+          human_description:
+            en_US: Split text into paragraphs based on separator and maximum chunk
+              length, using split text as parent block or entire document as parent
+              block and directly retrieve.
+            ja_JP: Split text into paragraphs based on separator and maximum chunk
+              length, using split text as parent block or entire document as parent
+              block and directly retrieve.
+            pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
+              máximo do bloco, usando o texto dividido como bloco pai ou documento
+              completo como bloco pai e diretamente recuperá-lo.
+            zh_Hans: 根据分隔符和最大块长度将文本拆分为段落，使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
+          label:
+            en_US: Parent Mode
+            ja_JP: Parent Mode
+            pt_BR: Modo Pai
+            zh_Hans: 父块模式
+          llm_description: Split text into paragraphs based on separator and maximum
+            chunk length, using split text as parent block or entire document as parent
+            block and directly retrieve.
+          max: null
+          min: null
+          name: parent_mode
+          options:
+          - icon: ''
+            label:
+              en_US: Paragraph
+              ja_JP: Paragraph
+              pt_BR: Parágrafo
+              zh_Hans: 段落
+            value: paragraph
+          - icon: ''
+            label:
+              en_US: Full Document
+              ja_JP: Full Document
+              pt_BR: Documento Completo
+              zh_Hans: 全文
+            value: full_doc
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: select
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove extra spaces in the text
+            ja_JP: Whether to remove extra spaces in the text
+            pt_BR: Se deve remover espaços extras no texto
+            zh_Hans: 是否移除文本中的多余空格
+          label:
+            en_US: Remove Extra Spaces
+            ja_JP: Remove Extra Spaces
+            pt_BR: Remover Espaços Extras
+            zh_Hans: 移除多余空格
+          llm_description: Whether to remove extra spaces in the text
+          max: null
+          min: null
+          name: remove_extra_spaces
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove URLs and emails in the text
+            ja_JP: Whether to remove URLs and emails in the text
+            pt_BR: Se deve remover URLs e e-mails no texto
+            zh_Hans: 是否移除文本中的URL和电子邮件地址
+          label:
+            en_US: Remove URLs and Emails
+            ja_JP: Remove URLs and Emails
+            pt_BR: Remover URLs e E-mails
+            zh_Hans: 移除URL和电子邮件地址
+          llm_description: Whether to remove URLs and emails in the text
+          max: null
+          min: null
+          name: remove_urls_emails
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          input_text: ''
+          max_length: ''
+          parent_mode: ''
+          remove_extra_spaces: ''
+          remove_urls_emails: ''
+          separator: ''
+          subchunk_max_length: ''
+          subchunk_separator: ''
+        provider_id: langgenius/parent_child_chunk/parent_child_chunk
+        provider_name: langgenius/parent_child_chunk/parent_child_chunk
+        provider_type: builtin
+        selected: true
+        title: 父子分块处理器
+        tool_configurations: {}
+        tool_description: 将文档处理为父子分块结构
+        tool_label: 父子分块处理器
+        tool_name: parent_child_chunk
+        tool_parameters:
+          input_text:
+            type: mixed
+            value: '{{#1752489759475.content#}}'
+          max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          parent_mode:
+            type: variable
+            value:
+            - rag
+            - shared
+            - parent_mode
+          remove_extra_spaces:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+          remove_urls_emails:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          separator:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          subchunk_max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - child_max_chunk_length
+          subchunk_separator:
+            type: mixed
+            value: '{{#rag.shared.child_delimiter#}}'
+        type: tool
+      height: 52
+      id: '1752490343805'
+      position:
+        x: 1077.0240183162543
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1077.0240183162543
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -487.2912544090391
+      y: -54.7029301848807
+      zoom: 0.9994011715768695
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n
+    label: Child delimiter
+    max_length: 199
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n is recommended
+      for splitting parent chunks into small child chunks. You can also use special
+      delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: child_delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 512
+    label: Child max chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: child_max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: paragraph
+    label: Parent mode
+    max_length: 48
+    options:
+    - full_doc
+    - paragraph
+    placeholder: null
+    required: true
+    tooltips: null
+    type: select
+    unit: null
+    variable: parent_mode
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/notion_general_economy.json
+++ b/api/services/rag_pipeline/transform/notion_general_economy.json
--- a/api/services/rag_pipeline/transform/notion_general_high_quality.json
+++ b/api/services/rag_pipeline/transform/notion_general_high_quality.json
--- a/api/services/rag_pipeline/transform/notion_parent_child.json
+++ b/api/services/rag_pipeline/transform/notion_parent_child.json
--- a/api/services/rag_pipeline/transform/web_crawl_general_economy.json
+++ b/api/services/rag_pipeline/transform/web_crawl_general_economy.json
--- a/api/services/rag_pipeline/transform/web_crawl_general_high_quality.json
+++ b/api/services/rag_pipeline/transform/web_crawl_general_high_quality.json
--- a/api/services/rag_pipeline/transform/web_crawl_parent_child.json
+++ b/api/services/rag_pipeline/transform/web_crawl_parent_child.json
--- a/api/services/rag_pipeline/transform/website-crawl-general-economy.yml
+++ b/api/services/rag_pipeline/transform/website-crawl-general-economy.yml
@ -0,0 +1,666 @@
+dependencies:
+- current_identifier: null
+  type: package
+  value:
+    plugin_unique_identifier: langgenius/general_chunk:0.0.2@0856fa64f7b0dc937c982f12d45b3a1ad91ba8aacc0d28a1b436e6c94a77e298
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: website-crawl-general-economy
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752491761974-source-1752565435219-target
+      source: '1752491761974'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752565402678-source-1752565435219-target
+      source: '1752565402678'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752565435219-source-1752569675978-target
+      source: '1752565435219'
+      sourceHandle: source
+      target: '1752569675978'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752569675978-source-1752477924228-target
+      source: '1752569675978'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752569675978'
+        - result
+        indexing_technique: economy
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: keyword_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: true
+        title: 知识库
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 2140.4053851189346
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 2140.4053851189346
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Jina Reader
+        datasource_name: jina_reader
+        datasource_parameters:
+          crawl_sub_pages:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752491761974'
+            - jina_limit
+          url:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_url#}}'
+          use_sitemap:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_use_sitemap#}}'
+        plugin_id: langgenius/jina_datasource
+        provider_name: jina
+        provider_type: website_crawl
+        selected: false
+        title: Jina Reader
+        type: datasource
+      height: 52
+      id: '1752491761974'
+      position:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Firecrawl
+        datasource_name: crawl
+        datasource_parameters:
+          crawl_subpages:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
+          exclude_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
+          include_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_limit
+          max_depth:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_max_depth
+          only_main_content:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
+          url:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_url#}}'
+        plugin_id: langgenius/firecrawl_datasource
+        provider_name: firecrawl
+        provider_type: website_crawl
+        selected: false
+        title: Firecrawl
+        type: datasource
+      height: 52
+      id: '1752565402678'
+      position:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        output_type: string
+        selected: false
+        title: 变量聚合器
+        type: variable-aggregator
+        variables:
+        - - '1752491761974'
+          - content
+        - - '1752565402678'
+          - content
+      height: 129
+      id: '1752565435219'
+      position:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: The text you want to chunk.
+            pt_BR: The text you want to chunk.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: Input Variable
+            pt_BR: Input Variable
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: The delimiter of the chunks.
+            pt_BR: The delimiter of the chunks.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: Delimiter
+            pt_BR: Delimiter
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: The maximum chunk length.
+            pt_BR: The maximum chunk length.
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: Maximum Chunk Length
+            pt_BR: Maximum Chunk Length
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: The chunk overlap length.
+            pt_BR: The chunk overlap length.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: Chunk Overlap Length
+            pt_BR: Chunk Overlap Length
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: Replace consecutive spaces, newlines and tabs
+            pt_BR: Replace consecutive spaces, newlines and tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: Replace Consecutive Spaces, Newlines and Tabs
+            pt_BR: Replace Consecutive Spaces, Newlines and Tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: Delete all URLs and email addresses
+            pt_BR: Delete all URLs and email addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: Delete All URLs and Email Addresses
+            pt_BR: Delete All URLs and Email Addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunk/general_chunk
+        provider_name: langgenius/general_chunk/general_chunk
+        provider_type: builtin
+        selected: false
+        title: 通用文本分块
+        tool_configurations: {}
+        tool_description: 一个用于通用文本分块模式的工具，检索和召回的块是相同的。
+        tool_label: 通用文本分块
+        tool_name: general_chunk
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752565435219.output#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752569675978'
+      position:
+        x: 1807.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1807.4306671642219
+        y: 281.3910724383104
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -707.721097109337
+      y: -93.07807382100896
+      zoom: 0.9350632198875476
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: jina_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: jina_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: jina_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Use sitemap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
+      iteratively based on page relevance, yielding fewer but higher-quality pages.
+    type: checkbox
+    unit: null
+    variable: jina_use_sitemap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: true
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: firecrawl_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Max depth
+    max_length: 48
+    options: []
+    placeholder: ''
+    required: false
+    tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes
+      the page of the entered URL, depth 1 scrapes the URL and everything after the
+      entered URL + one /, and so on.
+    type: number
+    unit: null
+    variable: firecrawl_max_depth
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Exclude paths
+    max_length: 256
+    options: []
+    placeholder: blog/*, /about/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_exclude_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Include only paths
+    max_length: 256
+    options: []
+    placeholder: articles/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_include_only_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Extract only main content
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_extract_main_content
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into chunks. You can also define your own
+      special delimiters.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 50
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Setting a chunk overlap maintains the semantic relevance between adjacent
+      chunks, improving retrieval quality. It is recommended to set it to 10%–25% of
+      the maximum chunk size.
+    type: number
+    unit: characters
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/website-crawl-general-high-quality.yml
+++ b/api/services/rag_pipeline/transform/website-crawl-general-high-quality.yml
@ -0,0 +1,666 @@
+dependencies:
+- current_identifier: null
+  type: package
+  value:
+    plugin_unique_identifier: langgenius/general_chunk:0.0.2@0856fa64f7b0dc937c982f12d45b3a1ad91ba8aacc0d28a1b436e6c94a77e298
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: '#FFF4ED'
+  icon_type: emoji
+  name: website-crawl-general-high-quality
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752491761974-source-1752565435219-target
+      source: '1752491761974'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752565402678-source-1752565435219-target
+      source: '1752565402678'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752565435219-source-1752569675978-target
+      source: '1752565435219'
+      sourceHandle: source
+      target: '1752569675978'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752569675978-source-1752477924228-target
+      source: '1752569675978'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752569675978'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: 知识库
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 2140.4053851189346
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 2140.4053851189346
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Jina Reader
+        datasource_name: jina_reader
+        datasource_parameters:
+          crawl_sub_pages:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752491761974'
+            - jina_limit
+          url:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_url#}}'
+          use_sitemap:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_use_sitemap#}}'
+        plugin_id: langgenius/jina_datasource
+        provider_name: jina
+        provider_type: website_crawl
+        selected: false
+        title: Jina Reader
+        type: datasource
+      height: 52
+      id: '1752491761974'
+      position:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Firecrawl
+        datasource_name: crawl
+        datasource_parameters:
+          crawl_subpages:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
+          exclude_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
+          include_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_limit
+          max_depth:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_max_depth
+          only_main_content:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
+          url:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_url#}}'
+        plugin_id: langgenius/firecrawl_datasource
+        provider_name: firecrawl
+        provider_type: website_crawl
+        selected: false
+        title: Firecrawl
+        type: datasource
+      height: 52
+      id: '1752565402678'
+      position:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        output_type: string
+        selected: false
+        title: 变量聚合器
+        type: variable-aggregator
+        variables:
+        - - '1752491761974'
+          - content
+        - - '1752565402678'
+          - content
+      height: 129
+      id: '1752565435219'
+      position:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: The text you want to chunk.
+            pt_BR: The text you want to chunk.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: Input Variable
+            pt_BR: Input Variable
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: The delimiter of the chunks.
+            pt_BR: The delimiter of the chunks.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: Delimiter
+            pt_BR: Delimiter
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: The maximum chunk length.
+            pt_BR: The maximum chunk length.
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: Maximum Chunk Length
+            pt_BR: Maximum Chunk Length
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: The chunk overlap length.
+            pt_BR: The chunk overlap length.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: Chunk Overlap Length
+            pt_BR: Chunk Overlap Length
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: Replace consecutive spaces, newlines and tabs
+            pt_BR: Replace consecutive spaces, newlines and tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: Replace Consecutive Spaces, Newlines and Tabs
+            pt_BR: Replace Consecutive Spaces, Newlines and Tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: Delete all URLs and email addresses
+            pt_BR: Delete all URLs and email addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: Delete All URLs and Email Addresses
+            pt_BR: Delete All URLs and Email Addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunk/general_chunk
+        provider_name: langgenius/general_chunk/general_chunk
+        provider_type: builtin
+        selected: false
+        title: 通用文本分块
+        tool_configurations: {}
+        tool_description: 一个用于通用文本分块模式的工具，检索和召回的块是相同的。
+        tool_label: 通用文本分块
+        tool_name: general_chunk
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752565435219.output#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752569675978'
+      position:
+        x: 1807.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1807.4306671642219
+        y: 281.3910724383104
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -707.721097109337
+      y: -93.07807382100896
+      zoom: 0.9350632198875476
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: jina_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: jina_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: jina_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Use sitemap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
+      iteratively based on page relevance, yielding fewer but higher-quality pages.
+    type: checkbox
+    unit: null
+    variable: jina_use_sitemap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: true
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: firecrawl_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Max depth
+    max_length: 48
+    options: []
+    placeholder: ''
+    required: false
+    tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes
+      the page of the entered URL, depth 1 scrapes the URL and everything after the
+      entered URL + one /, and so on.
+    type: number
+    unit: null
+    variable: firecrawl_max_depth
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Exclude paths
+    max_length: 256
+    options: []
+    placeholder: blog/*, /about/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_exclude_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Include only paths
+    max_length: 256
+    options: []
+    placeholder: articles/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_include_only_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Extract main content
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_extract_main_content
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 50
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Setting the chunk overlap can maintain the semantic relevance between
+      adjacent chunks, enhancing retrieval quality. It is recommended to set it to
+      10%–25% of the maximum chunk size.
+    type: number
+    unit: characters
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/website-crawl-parentchild.yml
+++ b/api/services/rag_pipeline/transform/website-crawl-parentchild.yml
@ -0,0 +1,772 @@
+dependencies:
+- current_identifier: null
+  type: package
+  value:
+    plugin_unique_identifier: langgenius/parent_child_chunk:0.0.1@f8f9ba1f3bcda159ebc0168baa755c2181b923da8157ebb439b8046019f5b510
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: website-crawl-parentchild
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752490343805-source-1752477924228-target
+      source: '1752490343805'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752491761974-source-1752565435219-target
+      source: '1752491761974'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752565435219-source-1752490343805-target
+      source: '1752565435219'
+      sourceHandle: source
+      target: '1752490343805'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752565402678-source-1752565435219-target
+      source: '1752565402678'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: hierarchical_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752490343805'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: 知识库
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 2215.5544306817387
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 2215.5544306817387
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: Parent child chunks result
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: The text you want to chunk.
+            pt_BR: The text you want to chunk.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input text
+            ja_JP: Input text
+            pt_BR: Input text
+            zh_Hans: 输入文本
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_text
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 1024
+          form: llm
+          human_description:
+            en_US: Maximum length for chunking
+            ja_JP: Maximum length for chunking
+            pt_BR: Comprimento máximo para divisão
+            zh_Hans: 用于分块的最大长度
+          label:
+            en_US: Maximum Length
+            ja_JP: Maximum Length
+            pt_BR: Comprimento Máximo
+            zh_Hans: 最大长度
+          llm_description: Maximum length allowed per chunk
+          max: null
+          min: null
+          name: max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '
+
+
+            '
+          form: llm
+          human_description:
+            en_US: Separator used for chunking
+            ja_JP: Separator used for chunking
+            pt_BR: Separador usado para divisão
+            zh_Hans: 用于分块的分隔符
+          label:
+            en_US: Chunk Separator
+            ja_JP: Chunk Separator
+            pt_BR: Separador de Divisão
+            zh_Hans: 分块分隔符
+          llm_description: The separator used to split chunks
+          max: null
+          min: null
+          name: separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 512
+          form: llm
+          human_description:
+            en_US: Maximum length for subchunking
+            ja_JP: Maximum length for subchunking
+            pt_BR: Comprimento máximo para subdivisão
+            zh_Hans: 用于子分块的最大长度
+          label:
+            en_US: Subchunk Maximum Length
+            ja_JP: Subchunk Maximum Length
+            pt_BR: Comprimento Máximo de Subdivisão
+            zh_Hans: 子分块最大长度
+          llm_description: Maximum length allowed per subchunk
+          max: null
+          min: null
+          name: subchunk_max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '. '
+          form: llm
+          human_description:
+            en_US: Separator used for subchunking
+            ja_JP: Separator used for subchunking
+            pt_BR: Separador usado para subdivisão
+            zh_Hans: 用于子分块的分隔符
+          label:
+            en_US: Subchunk Separator
+            ja_JP: Subchunk Separator
+            pt_BR: Separador de Subdivisão
+            zh_Hans: 子分块分隔符
+          llm_description: The separator used to split subchunks
+          max: null
+          min: null
+          name: subchunk_separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: paragraph
+          form: llm
+          human_description:
+            en_US: Split text into paragraphs based on separator and maximum chunk
+              length, using split text as parent block or entire document as parent
+              block and directly retrieve.
+            ja_JP: Split text into paragraphs based on separator and maximum chunk
+              length, using split text as parent block or entire document as parent
+              block and directly retrieve.
+            pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
+              máximo do bloco, usando o texto dividido como bloco pai ou documento
+              completo como bloco pai e diretamente recuperá-lo.
+            zh_Hans: 根据分隔符和最大块长度将文本拆分为段落，使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
+          label:
+            en_US: Parent Mode
+            ja_JP: Parent Mode
+            pt_BR: Modo Pai
+            zh_Hans: 父块模式
+          llm_description: Split text into paragraphs based on separator and maximum
+            chunk length, using split text as parent block or entire document as parent
+            block and directly retrieve.
+          max: null
+          min: null
+          name: parent_mode
+          options:
+          - icon: ''
+            label:
+              en_US: Paragraph
+              ja_JP: Paragraph
+              pt_BR: Parágrafo
+              zh_Hans: 段落
+            value: paragraph
+          - icon: ''
+            label:
+              en_US: Full Document
+              ja_JP: Full Document
+              pt_BR: Documento Completo
+              zh_Hans: 全文
+            value: full_doc
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: select
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove extra spaces in the text
+            ja_JP: Whether to remove extra spaces in the text
+            pt_BR: Se deve remover espaços extras no texto
+            zh_Hans: 是否移除文本中的多余空格
+          label:
+            en_US: Remove Extra Spaces
+            ja_JP: Remove Extra Spaces
+            pt_BR: Remover Espaços Extras
+            zh_Hans: 移除多余空格
+          llm_description: Whether to remove extra spaces in the text
+          max: null
+          min: null
+          name: remove_extra_spaces
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove URLs and emails in the text
+            ja_JP: Whether to remove URLs and emails in the text
+            pt_BR: Se deve remover URLs e e-mails no texto
+            zh_Hans: 是否移除文本中的URL和电子邮件地址
+          label:
+            en_US: Remove URLs and Emails
+            ja_JP: Remove URLs and Emails
+            pt_BR: Remover URLs e E-mails
+            zh_Hans: 移除URL和电子邮件地址
+          llm_description: Whether to remove URLs and emails in the text
+          max: null
+          min: null
+          name: remove_urls_emails
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          input_text: ''
+          max_length: ''
+          parent_mode: ''
+          remove_extra_spaces: ''
+          remove_urls_emails: ''
+          separator: ''
+          subchunk_max_length: ''
+          subchunk_separator: ''
+        provider_id: langgenius/parent_child_chunk/parent_child_chunk
+        provider_name: langgenius/parent_child_chunk/parent_child_chunk
+        provider_type: builtin
+        selected: true
+        title: 父子分块处理器
+        tool_configurations: {}
+        tool_description: 将文档处理为父子分块结构
+        tool_label: 父子分块处理器
+        tool_name: parent_child_chunk
+        tool_parameters:
+          input_text:
+            type: mixed
+            value: '{{#1752565435219.output#}}'
+          max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          parent_mode:
+            type: variable
+            value:
+            - rag
+            - shared
+            - parent_mode
+          remove_extra_spaces:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+          remove_urls_emails:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          separator:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          subchunk_max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - child_max_chunk_length
+          subchunk_separator:
+            type: mixed
+            value: '{{#rag.shared.child_delimiter#}}'
+        type: tool
+      height: 52
+      id: '1752490343805'
+      position:
+        x: 1853.5260563244174
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1853.5260563244174
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Jina Reader
+        datasource_name: jina_reader
+        datasource_parameters:
+          crawl_sub_pages:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752491761974'
+            - jina_limit
+          url:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_url#}}'
+          use_sitemap:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_use_sitemap#}}'
+        plugin_id: langgenius/jina_datasource
+        provider_name: jina
+        provider_type: website_crawl
+        selected: false
+        title: Jina Reader
+        type: datasource
+      height: 52
+      id: '1752491761974'
+      position:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Firecrawl
+        datasource_name: crawl
+        datasource_parameters:
+          crawl_subpages:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
+          exclude_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
+          include_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_limit
+          max_depth:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_max_depth
+          only_main_content:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
+          url:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_url#}}'
+        plugin_id: langgenius/firecrawl_datasource
+        provider_name: firecrawl
+        provider_type: website_crawl
+        selected: false
+        title: Firecrawl
+        type: datasource
+      height: 52
+      id: '1752565402678'
+      position:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        output_type: string
+        selected: false
+        title: 变量聚合器
+        type: variable-aggregator
+        variables:
+        - - '1752491761974'
+          - content
+        - - '1752565402678'
+          - content
+      height: 129
+      id: '1752565435219'
+      position:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -826.1791044466438
+      y: -71.91725474841303
+      zoom: 0.9980166672552107
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: jina_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: jina_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: jina_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Use sitemap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
+      iteratively based on page relevance, yielding fewer but higher-quality pages.
+    type: checkbox
+    unit: null
+    variable: jina_use_sitemap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: true
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: firecrawl_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Max depth
+    max_length: 48
+    options: []
+    placeholder: ''
+    required: false
+    tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes
+      the page of the entered URL, depth 1 scrapes the URL and everything after the
+      entered URL + one /, and so on.
+    type: number
+    unit: null
+    variable: firecrawl_max_depth
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Exclude paths
+    max_length: 256
+    options: []
+    placeholder: blog/*, /about/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_exclude_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Include only paths
+    max_length: 256
+    options: []
+    placeholder: articles/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_include_only_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Extract main content
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_extract_main_content
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n
+    label: Child delimiter
+    max_length: 199
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n is recommended
+      for splitting parent chunks into small child chunks. You can also use special
+      delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: child_delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 512
+    label: Child max chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: child_max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: paragraph
+    label: Parent mode
+    max_length: 48
+    options:
+    - full_doc
+    - paragraph
+    placeholder: null
+    required: true
+    tooltips: null
+    type: select
+    unit: null
+    variable: parent_mode
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email