fix(search): fix name conflicts in search(mlmodel) (#15415)

2025-12-13 11:06:30 +00:00 · 2025-12-01 18:20:35 +05:30 · 2025-12-01 18:20:35 +05:30 · 284f8cc100
commit 284f8cc100
parent 7af6063bf3
10 changed files with 249 additions and 0 deletions
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/restoreindices/ReindexMLModelConfig.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/restoreindices/ReindexMLModelConfig.java
@ -0,0 +1,30 @@
+package com.linkedin.datahub.upgrade.config.restoreindices;
+
+import com.linkedin.datahub.upgrade.conditions.SystemUpdateCondition;
+import com.linkedin.datahub.upgrade.system.NonBlockingSystemUpgrade;
+import com.linkedin.datahub.upgrade.system.restoreindices.mlmodel.ReindexMLModel;
+import com.linkedin.metadata.entity.AspectDao;
+import com.linkedin.metadata.entity.EntityService;
+import io.datahubproject.metadata.context.OperationContext;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Conditional;
+import org.springframework.context.annotation.Configuration;
+
+@Configuration // Spring wiring for the ReindexMLModel upgrade
+@Conditional(SystemUpdateCondition.NonBlockingSystemUpdateCondition.class)
+public class ReindexMLModelConfig { // active only when the @Conditional above matches
+
+  @Bean // settings come from the systemUpdate.mlModel.* properties
+  public NonBlockingSystemUpgrade reindexMLModel(
+      final OperationContext opContext,
+      final EntityService<?> entityService,
+      final AspectDao aspectDao,
+      @Value("${systemUpdate.mlModel.enabled}") final boolean enabled, // false => no steps
+      @Value("${systemUpdate.mlModel.batchSize}") final Integer batchSize,
+      @Value("${systemUpdate.mlModel.delayMs}") final Integer delayMs, // becomes batchDelayMs
+      @Value("${systemUpdate.mlModel.limit}") final Integer limit) {
+    return new ReindexMLModel(
+        opContext, entityService, aspectDao, enabled, batchSize, delayMs, limit);
+  }
+}
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/restoreindices/ReindexMLModelGroupConfig.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/restoreindices/ReindexMLModelGroupConfig.java
@ -0,0 +1,30 @@
+package com.linkedin.datahub.upgrade.config.restoreindices;
+
+import com.linkedin.datahub.upgrade.conditions.SystemUpdateCondition;
+import com.linkedin.datahub.upgrade.system.NonBlockingSystemUpgrade;
+import com.linkedin.datahub.upgrade.system.restoreindices.mlmodelgroup.ReindexMLModelGroup;
+import com.linkedin.metadata.entity.AspectDao;
+import com.linkedin.metadata.entity.EntityService;
+import io.datahubproject.metadata.context.OperationContext;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Conditional;
+import org.springframework.context.annotation.Configuration;
+
+@Configuration // Spring wiring for the ReindexMLModelGroup upgrade
+@Conditional(SystemUpdateCondition.NonBlockingSystemUpdateCondition.class)
+public class ReindexMLModelGroupConfig { // active only when the @Conditional above matches
+
+  @Bean // settings come from the systemUpdate.mlModelGroup.* properties
+  public NonBlockingSystemUpgrade reindexMLModelGroup(
+      final OperationContext opContext,
+      final EntityService<?> entityService,
+      final AspectDao aspectDao,
+      @Value("${systemUpdate.mlModelGroup.enabled}") final boolean enabled, // false => no steps
+      @Value("${systemUpdate.mlModelGroup.batchSize}") final Integer batchSize,
+      @Value("${systemUpdate.mlModelGroup.delayMs}") final Integer delayMs, // becomes batchDelayMs
+      @Value("${systemUpdate.mlModelGroup.limit}") final Integer limit) {
+    return new ReindexMLModelGroup(
+        opContext, entityService, aspectDao, enabled, batchSize, delayMs, limit);
+  }
+}
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/restoreindices/mlmodel/ReindexMLModel.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/restoreindices/mlmodel/ReindexMLModel.java
@ -0,0 +1,46 @@
+package com.linkedin.datahub.upgrade.system.restoreindices.mlmodel;
+
+import com.google.common.collect.ImmutableList;
+import com.linkedin.datahub.upgrade.UpgradeStep;
+import com.linkedin.datahub.upgrade.system.NonBlockingSystemUpgrade;
+import com.linkedin.metadata.entity.AspectDao;
+import com.linkedin.metadata.entity.EntityService;
+import io.datahubproject.metadata.context.OperationContext;
+import java.util.List;
+import javax.annotation.Nonnull;
+import lombok.extern.slf4j.Slf4j;
+
+/** Reindexes mlModel key aspects to fix name field conflicts; has no steps when disabled. */
+@Slf4j // NOTE(review): the generated logger is not referenced in this class
+public class ReindexMLModel implements NonBlockingSystemUpgrade {
+
+  private final List<UpgradeStep> _steps; // assigned once in the constructor, never mutated
+
+  public ReindexMLModel(
+      @Nonnull OperationContext opContext,
+      EntityService<?> entityService,
+      AspectDao aspectDao,
+      boolean enabled,
+      Integer batchSize,
+      Integer batchDelayMs,
+      Integer limit) {
+    if (enabled) { // the single reindex step is only constructed when enabled
+      _steps =
+          ImmutableList.of(
+              new ReindexMLModelStep(
+                  opContext, entityService, aspectDao, batchSize, batchDelayMs, limit));
+    } else {
+      _steps = ImmutableList.of(); // disabled: the upgrade exposes an empty step list
+    }
+  }
+
+  @Override
+  public String id() {
+    return this.getClass().getName(); // fully qualified class name serves as the upgrade id
+  }
+
+  @Override
+  public List<UpgradeStep> steps() {
+    return _steps; // immutable list, safe to hand out directly
+  }
+}
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/restoreindices/mlmodel/ReindexMLModelStep.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/restoreindices/mlmodel/ReindexMLModelStep.java
@ -0,0 +1,42 @@
+package com.linkedin.datahub.upgrade.system.restoreindices.mlmodel;
+
+import static com.linkedin.metadata.Constants.*;
+
+import com.linkedin.datahub.upgrade.system.AbstractMCLStep;
+import com.linkedin.metadata.entity.AspectDao;
+import com.linkedin.metadata.entity.EntityService;
+import io.datahubproject.metadata.context.OperationContext;
+import javax.annotation.Nonnull;
+import lombok.extern.slf4j.Slf4j;
+import org.jetbrains.annotations.Nullable;
+
+@Slf4j // NOTE(review): the generated logger is not referenced in this class
+public class ReindexMLModelStep extends AbstractMCLStep { // scoped to the MLModel key aspect
+
+  public ReindexMLModelStep(
+      OperationContext opContext,
+      EntityService<?> entityService,
+      AspectDao aspectDao,
+      Integer batchSize,
+      Integer batchDelayMs,
+      Integer limit) {
+    super(opContext, entityService, aspectDao, batchSize, batchDelayMs, limit); // pure pass-through
+  }
+
+  @Override
+  public String id() {
+    return "mlmodel-key-v1"; // step id; "v1" looks like a version suffix — TODO confirm
+  }
+
+  @Nonnull
+  @Override
+  protected String getAspectName() {
+    return ML_MODEL_KEY_ASPECT_NAME; // constant from the static Constants import
+  }
+
+  @Nullable
+  @Override
+  protected String getUrnLike() {
+    return "urn:li:" + ML_MODEL_ENTITY_NAME + ":%"; // '%' presumably a LIKE wildcard — confirm
+  }
+}
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/restoreindices/mlmodelgroup/ReindexMLModelGroup.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/restoreindices/mlmodelgroup/ReindexMLModelGroup.java
@ -0,0 +1,46 @@
+package com.linkedin.datahub.upgrade.system.restoreindices.mlmodelgroup;
+
+import com.google.common.collect.ImmutableList;
+import com.linkedin.datahub.upgrade.UpgradeStep;
+import com.linkedin.datahub.upgrade.system.NonBlockingSystemUpgrade;
+import com.linkedin.metadata.entity.AspectDao;
+import com.linkedin.metadata.entity.EntityService;
+import io.datahubproject.metadata.context.OperationContext;
+import java.util.List;
+import javax.annotation.Nonnull;
+import lombok.extern.slf4j.Slf4j;
+
+/** Reindexes mlModelGroup key aspects to fix name field conflicts; has no steps when disabled. */
+@Slf4j // NOTE(review): the generated logger is not referenced in this class
+public class ReindexMLModelGroup implements NonBlockingSystemUpgrade {
+
+  private final List<UpgradeStep> _steps; // assigned once in the constructor, never mutated
+
+  public ReindexMLModelGroup(
+      @Nonnull OperationContext opContext,
+      EntityService<?> entityService,
+      AspectDao aspectDao,
+      boolean enabled,
+      Integer batchSize,
+      Integer batchDelayMs,
+      Integer limit) {
+    if (enabled) { // the single reindex step is only constructed when enabled
+      _steps =
+          ImmutableList.of(
+              new ReindexMLModelGroupStep(
+                  opContext, entityService, aspectDao, batchSize, batchDelayMs, limit));
+    } else {
+      _steps = ImmutableList.of(); // disabled: the upgrade exposes an empty step list
+    }
+  }
+
+  @Override
+  public String id() {
+    return this.getClass().getName(); // fully qualified class name serves as the upgrade id
+  }
+
+  @Override
+  public List<UpgradeStep> steps() {
+    return _steps; // immutable list, safe to hand out directly
+  }
+}
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/restoreindices/mlmodelgroup/ReindexMLModelGroupStep.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/restoreindices/mlmodelgroup/ReindexMLModelGroupStep.java
@ -0,0 +1,42 @@
+package com.linkedin.datahub.upgrade.system.restoreindices.mlmodelgroup;
+
+import static com.linkedin.metadata.Constants.*;
+
+import com.linkedin.datahub.upgrade.system.AbstractMCLStep;
+import com.linkedin.metadata.entity.AspectDao;
+import com.linkedin.metadata.entity.EntityService;
+import io.datahubproject.metadata.context.OperationContext;
+import javax.annotation.Nonnull;
+import lombok.extern.slf4j.Slf4j;
+import org.jetbrains.annotations.Nullable;
+
+@Slf4j // NOTE(review): the generated logger is not referenced in this class
+public class ReindexMLModelGroupStep extends AbstractMCLStep { // scoped to the MLModelGroup key
+
+  public ReindexMLModelGroupStep(
+      OperationContext opContext,
+      EntityService<?> entityService,
+      AspectDao aspectDao,
+      Integer batchSize,
+      Integer batchDelayMs,
+      Integer limit) {
+    super(opContext, entityService, aspectDao, batchSize, batchDelayMs, limit); // pure pass-through
+  }
+
+  @Override
+  public String id() {
+    return "mlmodelgroup-key-v1"; // step id; "v1" looks like a version suffix — TODO confirm
+  }
+
+  @Nonnull
+  @Override
+  protected String getAspectName() {
+    return ML_MODEL_GROUP_KEY_ASPECT_NAME; // constant from the static Constants import
+  }
+
+  @Nullable
+  @Override
+  protected String getUrnLike() {
+    return "urn:li:" + ML_MODEL_GROUP_ENTITY_NAME + ":%"; // '%' presumably a LIKE wildcard — confirm
+  }
+}
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@ -31,6 +31,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe

 ### Breaking Changes

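+- #15415: MLModel and MLModelGroup search field mapping has been updated to resolve duplicate field name conflicts: the name field of the `MLModelKey` and `MLModelGroupKey` aspects is now indexed under the search field name `id`. Existing entities will be automatically reindexed in the background after upgrade; the reindex jobs are configured under `systemUpdate.mlModel` and `systemUpdate.mlModelGroup` in `application.yaml`. New MLModel and MLModelGroup entities ingested after the upgrade use the new mapping immediately.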
 - #15397: Grafana ingestion source dataset granularity changed from per-datasource to per-panel (per visual). This improves lineage accuracy by ensuring each panel's query results in a unique dataset entity with precise upstream/downstream connections. Dataset URN format changed from `{ds_type}.{ds_uid}` to `{ds_type}.{ds_uid}.{dashboard_uid}.{panel_id}`. This means all existing Grafana dataset entities will have different URNs. If stateful ingestion is enabled, running ingestion with the latest CLI version will automatically clean up old entities and create new ones. Otherwise, we recommend soft deleting all Grafana datasets via the DataHub CLI: `datahub delete --platform grafana --soft` and then re-ingesting with the latest CLI version.
 - #15005: `SqlParsingBuilder` is removed, use `SqlParsingAggregator` instead
 - #14710: LookML ingestion source migrated to SDKv2 resulting in:
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl
@ -19,6 +19,7 @@ record MLModelGroupKey {
  * Name of the MLModelGroup
  */
  @Searchable = {
+    "fieldName": "id",
    "fieldType": "WORD_GRAM",
    "enableAutocomplete": true,
    "boostScore": 10.0,
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl
@ -19,6 +19,7 @@ record MLModelKey {
  * Name of the MLModel
  */
  @Searchable = {
+    "fieldName": "id",
    "fieldType": "WORD_GRAM",
    "enableAutocomplete": true,
    "boostScore": 10.0,
--- a/metadata-service/configuration/src/main/resources/application.yaml
+++ b/metadata-service/configuration/src/main/resources/application.yaml
@ -753,6 +753,16 @@ systemUpdate:
    batchSize: ${BOOTSTRAP_SYSTEM_UPDATE_CHART_INFO_BATCH_SIZE:1000}
    delayMs: ${BOOTSTRAP_SYSTEM_UPDATE_CHART_INFO_DELAY_MS:30000}
    limit: ${BOOTSTRAP_SYSTEM_UPDATE_CHART_INFO_CLL_LIMIT:0}
+  mlModel:
+    enabled: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_ENABLED:true}
+    batchSize: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_BATCH_SIZE:1000}
+    delayMs: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_DELAY_MS:30000}
+    limit: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_CLL_LIMIT:0}
+  mlModelGroup:
+    enabled: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_GROUP_ENABLED:true}
+    batchSize: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_GROUP_BATCH_SIZE:1000}
+    delayMs: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_GROUP_DELAY_MS:30000}
+    limit: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_GROUP_CLL_LIMIT:0}
  dashboardInfo:
    enabled: ${BOOTSTRAP_SYSTEM_UPDATE_DASHBOARD_INFO_ENABLED:true}
    batchSize: ${BOOTSTRAP_SYSTEM_UPDATE_DASHBOARD_INFO_BATCH_SIZE:1000}