diff --git a/api/configs/middleware/vdb/oceanbase_config.py b/api/configs/middleware/vdb/oceanbase_config.py index 8437328e76..99f4c49407 100644 --- a/api/configs/middleware/vdb/oceanbase_config.py +++ b/api/configs/middleware/vdb/oceanbase_config.py @@ -37,3 +37,11 @@ class OceanBaseVectorConfig(BaseSettings): "with older versions", default=False, ) + + OCEANBASE_FULLTEXT_PARSER: str | None = Field( + description=( + "Fulltext parser to use for text indexing. Options: 'japanese_ftparser' (Japanese), " + "'thai_ftparser' (Thai), 'ik' (Chinese). Default is 'ik'" + ), + default="ik", + ) diff --git a/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py b/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py index 8c5d972a79..49cf900126 100644 --- a/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py +++ b/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py @@ -4,7 +4,7 @@ import math from typing import Any from pydantic import BaseModel, model_validator -from pyobvector import VECTOR, FtsIndexParam, FtsParser, ObVecClient, l2_distance # type: ignore +from pyobvector import VECTOR, ObVecClient, l2_distance # type: ignore from sqlalchemy import JSON, Column, String from sqlalchemy.dialects.mysql import LONGTEXT @@ -117,22 +117,39 @@ class OceanBaseVector(BaseVector): columns=cols, vidxs=vidx_params, ) - try: - if self._hybrid_search_enabled: - self._client.create_fts_idx_with_fts_index_param( - table_name=self._collection_name, - fts_idx_param=FtsIndexParam( - index_name="fulltext_index_for_col_text", - field_names=["text"], - parser_type=FtsParser.IK, - ), + logger.debug("DEBUG: Table '%s' created successfully", self._collection_name) + + if self._hybrid_search_enabled: + # Get parser from config or use default ik parser + parser_name = dify_config.OCEANBASE_FULLTEXT_PARSER or "ik" + + allowed_parsers = ["ik", "japanese_ftparser", "thai_ftparser"] + if parser_name not in allowed_parsers: + raise ValueError( + f"Invalid OceanBase full-text parser: {parser_name}. " + f"Allowed values are: {', '.join(allowed_parsers)}" ) - except Exception as e: - raise Exception( - "Failed to add fulltext index to the target table, your OceanBase version must be 4.3.5.1 or above " - + "to support fulltext index and vector index in the same table", - e, + logger.debug("Hybrid search is enabled, parser_name='%s'", parser_name) + logger.debug( + "About to create fulltext index for collection '%s' using parser '%s'", + self._collection_name, + parser_name, ) + try: + sql_command = f"""ALTER TABLE {self._collection_name} + ADD FULLTEXT INDEX fulltext_index_for_col_text (text) WITH PARSER {parser_name}""" + logger.debug("DEBUG: Executing SQL: %s", sql_command) + self._client.perform_raw_text_sql(sql_command) + logger.debug("DEBUG: Fulltext index created successfully for '%s'", self._collection_name) + except Exception as e: + logger.exception("Exception occurred while creating fulltext index") + raise Exception( + "Failed to add fulltext index to the target table, your OceanBase version must be " + "4.3.5.1 or above to support fulltext index and vector index in the same table" + ) from e + else: + logger.debug("DEBUG: Hybrid search is NOT enabled for '%s'", self._collection_name) + self._client.refresh_metadata([self._collection_name]) redis_client.set(collection_exist_cache_key, 1, ex=3600) diff --git a/docker/.env.example b/docker/.env.example index 4575e11b99..af72ce8213 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -661,6 +661,7 @@ OCEANBASE_VECTOR_DATABASE=test OCEANBASE_CLUSTER_NAME=difyai OCEANBASE_MEMORY_LIMIT=6G OCEANBASE_ENABLE_HYBRID_SEARCH=false +OCEANBASE_FULLTEXT_PARSER=ik # opengauss configurations, only available when VECTOR_STORE is `opengauss` OPENGAUSS_HOST=opengauss diff --git a/docker/docker-compose-template.yaml b/docker/docker-compose-template.yaml index 096bddae0b..93159b056f 100644 --- a/docker/docker-compose-template.yaml +++ b/docker/docker-compose-template.yaml @@ -504,6 +504,7 @@ services: OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai} OB_SERVER_IP: 127.0.0.1 MODE: mini + LANG: en_US.UTF-8 ports: - "${OCEANBASE_VECTOR_PORT:-2881}:2881" healthcheck: diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 1d412d714f..acec6adf10 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -306,6 +306,7 @@ x-shared-env: &shared-api-worker-env OCEANBASE_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai} OCEANBASE_MEMORY_LIMIT: ${OCEANBASE_MEMORY_LIMIT:-6G} OCEANBASE_ENABLE_HYBRID_SEARCH: ${OCEANBASE_ENABLE_HYBRID_SEARCH:-false} + OCEANBASE_FULLTEXT_PARSER: ${OCEANBASE_FULLTEXT_PARSER:-ik} OPENGAUSS_HOST: ${OPENGAUSS_HOST:-opengauss} OPENGAUSS_PORT: ${OPENGAUSS_PORT:-6600} OPENGAUSS_USER: ${OPENGAUSS_USER:-postgres} @@ -1092,6 +1093,7 @@ services: OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai} OB_SERVER_IP: 127.0.0.1 MODE: mini + LANG: en_US.UTF-8 ports: - "${OCEANBASE_VECTOR_PORT:-2881}:2881" healthcheck: