dify/api/tasks/batch_create_segment_to_index_task.py

import datetime
import logging
import time
import uuid

import click
from celery import shared_task  # type: ignore
from sqlalchemy import func

from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from libs import helper
from models.dataset import Dataset, Document, DocumentSegment
from services.vector_service import VectorService


@shared_task(queue="dataset")
def batch_create_segment_to_index_task(
    job_id: str, content: list, dataset_id: str, document_id: str, tenant_id: str, user_id: str
):
    """
    Async batch create segments for a document and add them to the index.

    The outcome is reported through the Redis key ``segment_batch_import_<job_id>``,
    which is set (with a 600 second expiry) to "completed" on success or to
    "error" when anything inside the task body raises. Such failures are logged
    and not re-raised.

    :param job_id: identifier used to build the Redis status key
    :param content: list of mappings; each must have a "content" entry and, when
        the document's doc_form is "qa_model", an "answer" entry
    :param dataset_id: id of the Dataset the new segments belong to
    :param document_id: id of the Document the new segments are attached to
    :param tenant_id: tenant id stored on every created segment
    :param user_id: stored as ``created_by`` on every created segment

    Usage: batch_create_segment_to_index_task.delay(
        job_id, content, dataset_id, document_id, tenant_id, user_id
    )
    """
    logging.info(click.style("Start batch create segment jobId: {}".format(job_id), fg="green"))
    start_at = time.perf_counter()

    indexing_cache_key = "segment_batch_import_{}".format(job_id)

    try:
        dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
        if not dataset:
            raise ValueError("Dataset not exist.")

        dataset_document = db.session.query(Document).filter(Document.id == document_id).first()
        if not dataset_document:
            raise ValueError("Document not exist.")

        # Only an enabled, non-archived, fully indexed document may receive new segments.
        if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed":
            raise ValueError("Document is not available.")

        document_segments: list[DocumentSegment] = []

        # An embedding model is only resolved for "high_quality" datasets; for any
        # other indexing technique the token count of each segment is stored as 0.
        embedding_model = None
        if dataset.indexing_technique == "high_quality":
            model_manager = ModelManager()
            embedding_model = model_manager.get_model_instance(
                tenant_id=dataset.tenant_id,
                provider=dataset.embedding_model_provider,
                model_type=ModelType.TEXT_EMBEDDING,
                model=dataset.embedding_model,
            )

        word_count_change = 0
        for segment in content:
            content_str = segment["content"]
            doc_id = str(uuid.uuid4())
            segment_hash = helper.generate_text_hash(content_str)
            # calc embedding use tokens
            tokens = embedding_model.get_text_embedding_num_tokens(texts=[content_str]) if embedding_model else 0
            # NOTE(review): re-queried on every iteration; presumably relies on the
            # session autoflushing the segment added in the previous iteration so
            # that positions keep increasing -- confirm before hoisting this query.
            max_position = (
                db.session.query(func.max(DocumentSegment.position))
                .filter(DocumentSegment.document_id == dataset_document.id)
                .scalar()
            )
            # Naive UTC timestamp, read once so indexing_at and completed_at agree.
            now = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
            segment_document = DocumentSegment(
                tenant_id=tenant_id,
                dataset_id=dataset_id,
                document_id=document_id,
                index_node_id=doc_id,
                index_node_hash=segment_hash,
                # Positions start at 1 when the document has no segment yet.
                position=max_position + 1 if max_position else 1,
                content=content_str,
                word_count=len(content_str),
                tokens=tokens,
                created_by=user_id,
                indexing_at=now,
                status="completed",
                completed_at=now,
            )
            if dataset_document.doc_form == "qa_model":
                # Q&A segments also carry an answer, which counts towards the word count.
                segment_document.answer = segment["answer"]
                segment_document.word_count += len(segment["answer"])
            word_count_change += segment_document.word_count
            db.session.add(segment_document)
            document_segments.append(segment_document)

        # update document word count
        dataset_document.word_count += word_count_change
        db.session.add(dataset_document)
        # add index to db
        VectorService.create_segments_vector(None, document_segments, dataset, dataset_document.doc_form)
        db.session.commit()
        redis_client.setex(indexing_cache_key, 600, "completed")
        end_at = time.perf_counter()
        logging.info(
            click.style("Segment batch created job: {} latency: {}".format(job_id, end_at - start_at), fg="green")
        )
    except Exception:
        logging.exception("Segments batch created index failed")
        # Publish the failure first so the job status is updated even if the
        # rollback below raises.
        redis_client.setex(indexing_cache_key, 600, "error")
        # Drop the segments / word-count change that were added but never committed.
        db.session.rollback()
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`import datetime`
			`import logging`
			`import time`
			`import uuid`

			`import click`
feat: mypy for all type check (#10921) 2024-12-24 18:38:51 +08:00			`from celery import shared_task # type: ignore`
enhancement: introduce Ruff for Python linter for reordering and removing unused imports with automated pre-commit and sytle check (#2366) 2024-02-06 13:21:13 +08:00			`from sqlalchemy import func`

Model Runtime (#1858) Co-authored-by: StyleZhang <jasonapring2015@outlook.com> Co-authored-by: Garfield Dai <dai.hai@foxmail.com> Co-authored-by: chenhe <guchenhe@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Yeuoly <admin@srmxy.cn> 2024-01-02 23:42:00 +08:00			`from core.model_manager import ModelManager`
			`from core.model_runtime.entities.model_entities import ModelType`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`from extensions.ext_database import db`
			`from extensions.ext_redis import redis_client`
			`from libs import helper`
improve: introduce isort for linting Python imports (#1983) 2024-01-12 12:34:01 +08:00			`from models.dataset import Dataset, Document, DocumentSegment`
Feat/support parent child chunk (#12092) 2024-12-25 19:49:07 +08:00			`from services.vector_service import VectorService`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00

chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`@shared_task(queue="dataset")`
			`def batch_create_segment_to_index_task(`
			`job_id: str, content: list, dataset_id: str, document_id: str, tenant_id: str, user_id: str`
			`):`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`"""`
			`Async batch create segment to index`
			`:param job_id:`
			`:param content:`
			`:param dataset_id:`
			`:param document_id:`
			`:param tenant_id:`
			`:param user_id:`

			`Usage: batch_create_segment_to_index_task.delay(segment_id)`
			`"""`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`logging.info(click.style("Start batch create segment jobId: {}".format(job_id), fg="green"))`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`start_at = time.perf_counter()`

chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`indexing_cache_key = "segment_batch_import_{}".format(job_id)`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00
			`try:`
			`dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()`
			`if not dataset:`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`raise ValueError("Dataset not exist.")`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00
			`dataset_document = db.session.query(Document).filter(Document.id == document_id).first()`
			`if not dataset_document:`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`raise ValueError("Document not exist.")`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed":`
			`raise ValueError("Document is not available.")`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`document_segments = []`
Fix/ignore economy dataset (#1043) Co-authored-by: jyong <jyong@dify.ai> 2023-08-29 03:37:45 +08:00			`embedding_model = None`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`if dataset.indexing_technique == "high_quality":`
Model Runtime (#1858) Co-authored-by: StyleZhang <jasonapring2015@outlook.com> Co-authored-by: Garfield Dai <dai.hai@foxmail.com> Co-authored-by: chenhe <guchenhe@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Yeuoly <admin@srmxy.cn> 2024-01-02 23:42:00 +08:00			`model_manager = ModelManager()`
			`embedding_model = model_manager.get_model_instance(`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`tenant_id=dataset.tenant_id,`
Model Runtime (#1858) Co-authored-by: StyleZhang <jasonapring2015@outlook.com> Co-authored-by: Garfield Dai <dai.hai@foxmail.com> Co-authored-by: chenhe <guchenhe@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Yeuoly <admin@srmxy.cn> 2024-01-02 23:42:00 +08:00			`provider=dataset.embedding_model_provider,`
			`model_type=ModelType.TEXT_EMBEDDING,`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`model=dataset.embedding_model,`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`)`
update document and segment word count (#10449) 2024-11-08 17:32:27 +08:00			`word_count_change = 0`
feat: mypy for all type check (#10921) 2024-12-24 18:38:51 +08:00			`segments_to_insert: list[str] = [] # Explicitly type hint the list as List[str]`
Fix/ignore economy dataset (#1043) Co-authored-by: jyong <jyong@dify.ai> 2023-08-29 03:37:45 +08:00			`for segment in content:`
feat: mypy for all type check (#10921) 2024-12-24 18:38:51 +08:00			`content_str = segment["content"]`
Fix/ignore economy dataset (#1043) Co-authored-by: jyong <jyong@dify.ai> 2023-08-29 03:37:45 +08:00			`doc_id = str(uuid.uuid4())`
feat: mypy for all type check (#10921) 2024-12-24 18:38:51 +08:00			`segment_hash = helper.generate_text_hash(content_str)`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`# calc embedding use tokens`
feat: mypy for all type check (#10921) 2024-12-24 18:38:51 +08:00			`tokens = embedding_model.get_text_embedding_num_tokens(texts=[content_str]) if embedding_model else 0`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`max_position = (`
			`db.session.query(func.max(DocumentSegment.position))`
			`.filter(DocumentSegment.document_id == dataset_document.id)`
			`.scalar()`
			`)`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`segment_document = DocumentSegment(`
			`tenant_id=tenant_id,`
			`dataset_id=dataset_id,`
			`document_id=document_id,`
			`index_node_id=doc_id,`
			`index_node_hash=segment_hash,`
			`position=max_position + 1 if max_position else 1,`
Fix pandas indexing method for knowledge base imports (#12637) (#12638) Co-authored-by: CN-P5 <heibai2006@qq.com> 2025-01-13 09:06:59 +08:00			`content=content_str,`
			`word_count=len(content_str),`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`tokens=tokens,`
			`created_by=user_id,`
chore: bump minimum supported Python version to 3.11 (#10386) 2024-11-24 13:28:46 +08:00			`indexing_at=datetime.datetime.now(datetime.UTC).replace(tzinfo=None),`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`status="completed",`
chore: bump minimum supported Python version to 3.11 (#10386) 2024-11-24 13:28:46 +08:00			`completed_at=datetime.datetime.now(datetime.UTC).replace(tzinfo=None),`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`)`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`if dataset_document.doc_form == "qa_model":`
			`segment_document.answer = segment["answer"]`
update document and segment word count (#10449) 2024-11-08 17:32:27 +08:00			`segment_document.word_count += len(segment["answer"])`
			`word_count_change += segment_document.word_count`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`db.session.add(segment_document)`
			`document_segments.append(segment_document)`
feat: mypy for all type check (#10921) 2024-12-24 18:38:51 +08:00			`segments_to_insert.append(str(segment)) # Cast to string if needed`
update document and segment word count (#10449) 2024-11-08 17:32:27 +08:00			`# update document word count`
			`dataset_document.word_count += word_count_change`
			`db.session.add(dataset_document)`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`# add index to db`
Feat/support parent child chunk (#12092) 2024-12-25 19:49:07 +08:00			`VectorService.create_segments_vector(None, document_segments, dataset, dataset_document.doc_form)`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`db.session.commit()`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`redis_client.setex(indexing_cache_key, 600, "completed")`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`end_at = time.perf_counter()`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`logging.info(`
			`click.style("Segment batch created job: {} latency: {}".format(job_id, end_at - start_at), fg="green")`
			`)`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 17:37:31 +08:00			`except Exception as e:`
chore(lint): cleanup repeated cause exception in logging.exception replaced by helpful message (#10425) 2024-11-15 15:41:40 +08:00			`logging.exception("Segments batch created index failed")`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`redis_client.setex(indexing_cache_key, 600, "error")`