feat: support selecting different ftparser for OceanBase. (#25970)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
longbingljw 2025-09-22 09:56:33 +08:00 committed by GitHub
parent 92cddbcc02
commit 208fe3d7de
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 44 additions and 15 deletions

View File

@ -37,3 +37,11 @@ class OceanBaseVectorConfig(BaseSettings):
"with older versions", "with older versions",
default=False, default=False,
) )
OCEANBASE_FULLTEXT_PARSER: str | None = Field(
description=(
"Fulltext parser to use for text indexing. Options: 'japanese_ftparser' (Japanese), "
"'thai_ftparser' (Thai), 'ik' (Chinese). Default is 'ik'"
),
default="ik",
)

View File

@ -4,7 +4,7 @@ import math
from typing import Any from typing import Any
from pydantic import BaseModel, model_validator from pydantic import BaseModel, model_validator
from pyobvector import VECTOR, FtsIndexParam, FtsParser, ObVecClient, l2_distance # type: ignore from pyobvector import VECTOR, ObVecClient, l2_distance # type: ignore
from sqlalchemy import JSON, Column, String from sqlalchemy import JSON, Column, String
from sqlalchemy.dialects.mysql import LONGTEXT from sqlalchemy.dialects.mysql import LONGTEXT
@ -117,22 +117,39 @@ class OceanBaseVector(BaseVector):
columns=cols, columns=cols,
vidxs=vidx_params, vidxs=vidx_params,
) )
try: logger.debug("DEBUG: Table '%s' created successfully", self._collection_name)
if self._hybrid_search_enabled: if self._hybrid_search_enabled:
self._client.create_fts_idx_with_fts_index_param( # Get parser from config or use default ik parser
table_name=self._collection_name, parser_name = dify_config.OCEANBASE_FULLTEXT_PARSER or "ik"
fts_idx_param=FtsIndexParam(
index_name="fulltext_index_for_col_text", allowed_parsers = ["ik", "japanese_ftparser", "thai_ftparser"]
field_names=["text"], if parser_name not in allowed_parsers:
parser_type=FtsParser.IK, raise ValueError(
), f"Invalid OceanBase full-text parser: {parser_name}. "
f"Allowed values are: {', '.join(allowed_parsers)}"
) )
logger.debug("Hybrid search is enabled, parser_name='%s'", parser_name)
logger.debug(
"About to create fulltext index for collection '%s' using parser '%s'",
self._collection_name,
parser_name,
)
try:
sql_command = f"""ALTER TABLE {self._collection_name}
ADD FULLTEXT INDEX fulltext_index_for_col_text (text) WITH PARSER {parser_name}"""
logger.debug("DEBUG: Executing SQL: %s", sql_command)
self._client.perform_raw_text_sql(sql_command)
logger.debug("DEBUG: Fulltext index created successfully for '%s'", self._collection_name)
except Exception as e: except Exception as e:
logger.exception("Exception occurred while creating fulltext index")
raise Exception( raise Exception(
"Failed to add fulltext index to the target table, your OceanBase version must be 4.3.5.1 or above " "Failed to add fulltext index to the target table, your OceanBase version must be "
+ "to support fulltext index and vector index in the same table", "4.3.5.1 or above to support fulltext index and vector index in the same table"
e, ) from e
) else:
logger.debug("DEBUG: Hybrid search is NOT enabled for '%s'", self._collection_name)
self._client.refresh_metadata([self._collection_name]) self._client.refresh_metadata([self._collection_name])
redis_client.set(collection_exist_cache_key, 1, ex=3600) redis_client.set(collection_exist_cache_key, 1, ex=3600)

View File

@ -661,6 +661,7 @@ OCEANBASE_VECTOR_DATABASE=test
OCEANBASE_CLUSTER_NAME=difyai OCEANBASE_CLUSTER_NAME=difyai
OCEANBASE_MEMORY_LIMIT=6G OCEANBASE_MEMORY_LIMIT=6G
OCEANBASE_ENABLE_HYBRID_SEARCH=false OCEANBASE_ENABLE_HYBRID_SEARCH=false
OCEANBASE_FULLTEXT_PARSER=ik
# opengauss configurations, only available when VECTOR_STORE is `opengauss` # opengauss configurations, only available when VECTOR_STORE is `opengauss`
OPENGAUSS_HOST=opengauss OPENGAUSS_HOST=opengauss

View File

@ -504,6 +504,7 @@ services:
OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai} OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai}
OB_SERVER_IP: 127.0.0.1 OB_SERVER_IP: 127.0.0.1
MODE: mini MODE: mini
LANG: en_US.UTF-8
ports: ports:
- "${OCEANBASE_VECTOR_PORT:-2881}:2881" - "${OCEANBASE_VECTOR_PORT:-2881}:2881"
healthcheck: healthcheck:

View File

@ -306,6 +306,7 @@ x-shared-env: &shared-api-worker-env
OCEANBASE_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai} OCEANBASE_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai}
OCEANBASE_MEMORY_LIMIT: ${OCEANBASE_MEMORY_LIMIT:-6G} OCEANBASE_MEMORY_LIMIT: ${OCEANBASE_MEMORY_LIMIT:-6G}
OCEANBASE_ENABLE_HYBRID_SEARCH: ${OCEANBASE_ENABLE_HYBRID_SEARCH:-false} OCEANBASE_ENABLE_HYBRID_SEARCH: ${OCEANBASE_ENABLE_HYBRID_SEARCH:-false}
OCEANBASE_FULLTEXT_PARSER: ${OCEANBASE_FULLTEXT_PARSER:-ik}
OPENGAUSS_HOST: ${OPENGAUSS_HOST:-opengauss} OPENGAUSS_HOST: ${OPENGAUSS_HOST:-opengauss}
OPENGAUSS_PORT: ${OPENGAUSS_PORT:-6600} OPENGAUSS_PORT: ${OPENGAUSS_PORT:-6600}
OPENGAUSS_USER: ${OPENGAUSS_USER:-postgres} OPENGAUSS_USER: ${OPENGAUSS_USER:-postgres}
@ -1092,6 +1093,7 @@ services:
OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai} OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai}
OB_SERVER_IP: 127.0.0.1 OB_SERVER_IP: 127.0.0.1
MODE: mini MODE: mini
LANG: en_US.UTF-8
ports: ports:
- "${OCEANBASE_VECTOR_PORT:-2881}:2881" - "${OCEANBASE_VECTOR_PORT:-2881}:2881"
healthcheck: healthcheck: