diff --git a/api/.env.example b/api/.env.example
index af95a4fe2d..502461f658 100644
--- a/api/.env.example
+++ b/api/.env.example
@@ -165,6 +165,8 @@ MILVUS_URI=http://127.0.0.1:19530
 MILVUS_TOKEN=
 MILVUS_USER=root
 MILVUS_PASSWORD=Milvus
+# Optional analyzer params for the text field, e.g. {"type": "chinese"}; only applied when hybrid search is enabled.
+MILVUS_ANALYZER_PARAMS=
 
 # MyScale configuration
 MYSCALE_HOST=127.0.0.1
diff --git a/api/configs/middleware/vdb/milvus_config.py b/api/configs/middleware/vdb/milvus_config.py
index ebdf8857b9..d398ef5bd8 100644
--- a/api/configs/middleware/vdb/milvus_config.py
+++ b/api/configs/middleware/vdb/milvus_config.py
@@ -39,3 +39,8 @@ class MilvusConfig(BaseSettings):
         "older versions",
         default=True,
     )
+
+    MILVUS_ANALYZER_PARAMS: Optional[str] = Field(
+        description='Milvus text analyzer parameters, e.g., {"type": "chinese"} for Chinese segmentation support.',
+        default=None,
+    )
diff --git a/api/core/rag/datasource/vdb/milvus/milvus_vector.py b/api/core/rag/datasource/vdb/milvus/milvus_vector.py
index 7a3319f4a6..100bcb198c 100644
--- a/api/core/rag/datasource/vdb/milvus/milvus_vector.py
+++ b/api/core/rag/datasource/vdb/milvus/milvus_vector.py
@@ -32,6 +32,7 @@ class MilvusConfig(BaseModel):
     batch_size: int = 100  # Batch size for operations
     database: str = "default"  # Database name
     enable_hybrid_search: bool = False  # Flag to enable hybrid search
+    analyzer_params: Optional[str] = None  # Raw analyzer params string for the text field, e.g. '{"type": "chinese"}'
 
     @model_validator(mode="before")
     @classmethod
@@ -58,6 +59,7 @@ class MilvusConfig(BaseModel):
             "user": self.user,
             "password": self.password,
             "db_name": self.database,
+            "analyzer_params": self.analyzer_params,
         }
 
 
@@ -300,14 +302,20 @@ class MilvusVector(BaseVector):
 
                 # Create the text field, enable_analyzer will be set True to support milvus automatically
                 # transfer text to sparse_vector, reference: https://milvus.io/docs/full-text-search.md
-                fields.append(
-                    FieldSchema(
-                        Field.CONTENT_KEY.value,
-                        DataType.VARCHAR,
-                        max_length=65_535,
-                        enable_analyzer=self._hybrid_search_enabled,
-                    )
-                )
+                content_field_kwargs: dict[str, Any] = {
+                    "max_length": 65_535,
+                    "enable_analyzer": self._hybrid_search_enabled,
+                }
+                # Only forward analyzer_params when hybrid search is on and a non-blank value is configured.
+                if (
+                    self._hybrid_search_enabled
+                    and self._client_config.analyzer_params is not None
+                    and self._client_config.analyzer_params.strip()
+                ):
+                    content_field_kwargs["analyzer_params"] = self._client_config.analyzer_params
+
+                fields.append(FieldSchema(Field.CONTENT_KEY.value, DataType.VARCHAR, **content_field_kwargs))
+
                 # Create the primary key field
                 fields.append(FieldSchema(Field.PRIMARY_KEY.value, DataType.INT64, is_primary=True, auto_id=True))
                 # Create the vector field, supports binary or float vectors
@@ -383,5 +391,6 @@ class MilvusVectorFactory(AbstractVectorFactory):
                 password=dify_config.MILVUS_PASSWORD or "",
                 database=dify_config.MILVUS_DATABASE or "",
                 enable_hybrid_search=dify_config.MILVUS_ENABLE_HYBRID_SEARCH or False,
+                analyzer_params=dify_config.MILVUS_ANALYZER_PARAMS or None,  # empty/unset -> None (model default)
             ),
         )
diff --git a/docker/.env.example b/docker/.env.example
index e49e8fee89..9b372dcec9 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -410,6 +410,8 @@ MILVUS_TOKEN=
 MILVUS_USER=
 MILVUS_PASSWORD=
 MILVUS_ENABLE_HYBRID_SEARCH=False
+# Optional analyzer params for the text field, e.g. {"type": "chinese"}; only applied when MILVUS_ENABLE_HYBRID_SEARCH is true.
+MILVUS_ANALYZER_PARAMS=
 
 # MyScale configuration, only available when VECTOR_STORE is `myscale`
 # For multi-language support, please set MYSCALE_FTS_PARAMS with referring to:
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
index 25b0c56561..172cbe2d2f 100644
--- a/docker/docker-compose.yaml
+++ b/docker/docker-compose.yaml
@@ -142,6 +142,7 @@ x-shared-env: &shared-api-worker-env
   MILVUS_USER: ${MILVUS_USER:-}
   MILVUS_PASSWORD: ${MILVUS_PASSWORD:-}
   MILVUS_ENABLE_HYBRID_SEARCH: ${MILVUS_ENABLE_HYBRID_SEARCH:-False}
+  MILVUS_ANALYZER_PARAMS: ${MILVUS_ANALYZER_PARAMS:-}
   MYSCALE_HOST: ${MYSCALE_HOST:-myscale}
   MYSCALE_PORT: ${MYSCALE_PORT:-8123}
   MYSCALE_USER: ${MYSCALE_USER:-default}