dify/api/tasks/duplicate_document_indexing_task.py

import datetime
import logging
import time

import click
from celery import shared_task  # type: ignore

from configs import dify_config
from core.indexing_runner import DocumentIsPausedError, IndexingRunner
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from models.dataset import Dataset, Document, DocumentSegment
from services.feature_service import FeatureService


@shared_task(queue="dataset")
def duplicate_document_indexing_task(dataset_id: str, document_ids: list):
    """
    Async process document
    :param dataset_id:
    :param document_ids:

    Usage: duplicate_document_indexing_task.delay(dataset_id, document_ids)
    """
    documents = []
    start_at = time.perf_counter()

    dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
    if dataset is None:
        logging.info(click.style("Dataset not found: {}".format(dataset_id), fg="red"))
        db.session.close()
        return

    # check document limit
    features = FeatureService.get_features(dataset.tenant_id)
    try:
        if features.billing.enabled:
            vector_space = features.vector_space
            count = len(document_ids)
            if features.billing.subscription.plan == "sandbox" and count > 1:
                raise ValueError("Your current plan does not support batch upload, please upgrade your plan.")
            batch_upload_limit = int(dify_config.BATCH_UPLOAD_LIMIT)
            if count > batch_upload_limit:
                raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
            if 0 < vector_space.limit <= vector_space.size:
                raise ValueError(
                    "Your total number of documents plus the number of uploads have over the limit of "
                    "your subscription."
                )
    except Exception as e:
        for document_id in document_ids:
            document = (
                db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()
            )
            if document:
                document.indexing_status = "error"
                document.error = str(e)
                document.stopped_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
                db.session.add(document)
        db.session.commit()
        return
    finally:
        db.session.close()

    for document_id in document_ids:
        logging.info(click.style("Start process document: {}".format(document_id), fg="green"))

        document = (
            db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()
        )

        if document:
            # clean old data
            index_type = document.doc_form
            index_processor = IndexProcessorFactory(index_type).init_index_processor()

            segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
            if segments:
                index_node_ids = [segment.index_node_id for segment in segments]

                # delete from vector index
                index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)

                for segment in segments:
                    db.session.delete(segment)
                db.session.commit()

            document.indexing_status = "parsing"
            document.processing_started_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
            documents.append(document)
            db.session.add(document)
    db.session.commit()

    try:
        indexing_runner = IndexingRunner()
        indexing_runner.run(documents)
        end_at = time.perf_counter()
        logging.info(click.style("Processed dataset: {} latency: {}".format(dataset_id, end_at - start_at), fg="green"))
    except DocumentIsPausedError as ex:
        logging.info(click.style(str(ex), fg="yellow"))
    except Exception:
        pass
    finally:
        db.session.close()
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00			`import datetime`
			`import logging`
			`import time`

			`import click`
feat: mypy for all type check (#10921) 2024-12-24 18:38:51 +08:00			`from celery import shared_task # type: ignore`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00
refactor(services/tasks): Swtich to dify_config witch Pydantic (#6203) 2024-07-12 12:25:38 +08:00			`from configs import dify_config`
chore: apply pep8-naming rules for naming convention (#8261) 2024-09-11 16:40:52 +08:00			`from core.indexing_runner import DocumentIsPausedError, IndexingRunner`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00			`from core.rag.index_processor.index_processor_factory import IndexProcessorFactory`
			`from extensions.ext_database import db`
			`from models.dataset import Dataset, Document, DocumentSegment`
			`from services.feature_service import FeatureService`


chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`@shared_task(queue="dataset")`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00			`def duplicate_document_indexing_task(dataset_id: str, document_ids: list):`
			`"""`
			`Async process document`
			`:param dataset_id:`
			`:param document_ids:`

Chore: fix wrong annotations (#15871) 2025-03-16 11:16:28 +08:00			`Usage: duplicate_document_indexing_task.delay(dataset_id, document_ids)`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00			`"""`
			`documents = []`
			`start_at = time.perf_counter()`

			`dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()`
feat: mypy for all type check (#10921) 2024-12-24 18:38:51 +08:00			`if dataset is None:`
deal db session in celery worker (#17549) 2025-04-07 20:31:26 +08:00			`logging.info(click.style("Dataset not found: {}".format(dataset_id), fg="red"))`
			`db.session.close()`
			`return`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00
			`# check document limit`
			`features = FeatureService.get_features(dataset.tenant_id)`
			`try:`
			`if features.billing.enabled:`
			`vector_space = features.vector_space`
			`count = len(document_ids)`
SaaS: batch upload limit check for sandbox plan (#17264) 2025-04-01 16:45:31 +08:00			`if features.billing.subscription.plan == "sandbox" and count > 1:`
			`raise ValueError("Your current plan does not support batch upload, please upgrade your plan.")`
refactor(services/tasks): Swtich to dify_config witch Pydantic (#6203) 2024-07-12 12:25:38 +08:00			`batch_upload_limit = int(dify_config.BATCH_UPLOAD_LIMIT)`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00			`if count > batch_upload_limit:`
			`raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")`
			`if 0 < vector_space.limit <= vector_space.size:`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`raise ValueError(`
			`"Your total number of documents plus the number of uploads have over the limit of "`
			`"your subscription."`
			`)`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00			`except Exception as e:`
			`for document_id in document_ids:`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`document = (`
			`db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()`
			`)`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00			`if document:`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`document.indexing_status = "error"`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00			`document.error = str(e)`
chore: bump ruff to 0.11.0 and fix linting violations (#15953) 2025-03-17 16:13:11 +08:00			`document.stopped_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00			`db.session.add(document)`
			`db.session.commit()`
			`return`
deal db session in celery worker (#17549) 2025-04-07 20:31:26 +08:00			`finally:`
			`db.session.close()`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00
			`for document_id in document_ids:`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`logging.info(click.style("Start process document: {}".format(document_id), fg="green"))`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`document = (`
			`db.session.query(Document).filter(Document.id == document_id, Document.dataset_id == dataset_id).first()`
			`)`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00
			`if document:`
			`# clean old data`
			`index_type = document.doc_form`
			`index_processor = IndexProcessorFactory(index_type).init_index_processor()`

			`segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()`
			`if segments:`
			`index_node_ids = [segment.index_node_id for segment in segments]`

			`# delete from vector index`
Feat/support parent child chunk (#12092) 2024-12-25 19:49:07 +08:00			`index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00
			`for segment in segments:`
			`db.session.delete(segment)`
			`db.session.commit()`

chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`document.indexing_status = "parsing"`
chore: bump ruff to 0.11.0 and fix linting violations (#15953) 2025-03-17 16:13:11 +08:00			`document.processing_started_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00			`documents.append(document)`
			`db.session.add(document)`
			`db.session.commit()`

			`try:`
			`indexing_runner = IndexingRunner()`
			`indexing_runner.run(documents)`
			`end_at = time.perf_counter()`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`logging.info(click.style("Processed dataset: {} latency: {}".format(dataset_id, end_at - start_at), fg="green"))`
chore: apply pep8-naming rules for naming convention (#8261) 2024-09-11 16:40:52 +08:00			`except DocumentIsPausedError as ex:`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 13:38:37 +08:00			`logging.info(click.style(str(ex), fg="yellow"))`
Knowledge optimization (#3755) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: JzoNg <jzongcode@gmail.com> 2024-04-24 15:02:29 +08:00			`except Exception:`
			`pass`
deal db session in celery worker (#17549) 2025-04-07 20:31:26 +08:00			`finally:`
			`db.session.close()`