fix(search): fix name conflicts in search(mlmodel) (#15415)

This commit is contained in:
Deepak Garg 2025-12-01 18:20:35 +05:30 committed by GitHub
parent 7af6063bf3
commit 284f8cc100
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 249 additions and 0 deletions

View File

@ -0,0 +1,30 @@
package com.linkedin.datahub.upgrade.config.restoreindices;
import com.linkedin.datahub.upgrade.conditions.SystemUpdateCondition;
import com.linkedin.datahub.upgrade.system.NonBlockingSystemUpgrade;
import com.linkedin.datahub.upgrade.system.restoreindices.mlmodel.ReindexMLModel;
import com.linkedin.metadata.entity.AspectDao;
import com.linkedin.metadata.entity.EntityService;
import io.datahubproject.metadata.context.OperationContext;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Conditional;
import org.springframework.context.annotation.Configuration;
/**
 * Spring configuration that exposes the {@link ReindexMLModel} upgrade as a bean.
 *
 * <p>The configuration is conditional on {@link
 * SystemUpdateCondition.NonBlockingSystemUpdateCondition}.
 */
@Configuration
@Conditional(SystemUpdateCondition.NonBlockingSystemUpdateCondition.class)
public class ReindexMLModelConfig {

  /**
   * Creates the MLModel reindex upgrade from the {@code systemUpdate.mlModel.*} properties.
   *
   * @param opContext operation context handed to the upgrade
   * @param entityService entity service handed to the upgrade
   * @param aspectDao aspect DAO handed to the upgrade
   * @param enabled value of {@code systemUpdate.mlModel.enabled}
   * @param batchSize value of {@code systemUpdate.mlModel.batchSize}
   * @param delayMs value of {@code systemUpdate.mlModel.delayMs}
   * @param limit value of {@code systemUpdate.mlModel.limit}
   * @return a new {@link ReindexMLModel} built from the arguments above
   */
  @Bean
  public NonBlockingSystemUpgrade reindexMLModel(
      final OperationContext opContext,
      final EntityService<?> entityService,
      final AspectDao aspectDao,
      @Value("${systemUpdate.mlModel.enabled}") final boolean enabled,
      @Value("${systemUpdate.mlModel.batchSize}") final Integer batchSize,
      @Value("${systemUpdate.mlModel.delayMs}") final Integer delayMs,
      @Value("${systemUpdate.mlModel.limit}") final Integer limit) {
    final NonBlockingSystemUpgrade upgrade =
        new ReindexMLModel(
            opContext, entityService, aspectDao, enabled, batchSize, delayMs, limit);
    return upgrade;
  }
}

View File

@ -0,0 +1,30 @@
package com.linkedin.datahub.upgrade.config.restoreindices;
import com.linkedin.datahub.upgrade.conditions.SystemUpdateCondition;
import com.linkedin.datahub.upgrade.system.NonBlockingSystemUpgrade;
import com.linkedin.datahub.upgrade.system.restoreindices.mlmodelgroup.ReindexMLModelGroup;
import com.linkedin.metadata.entity.AspectDao;
import com.linkedin.metadata.entity.EntityService;
import io.datahubproject.metadata.context.OperationContext;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Conditional;
import org.springframework.context.annotation.Configuration;
/**
 * Spring configuration that exposes the {@link ReindexMLModelGroup} upgrade as a bean.
 *
 * <p>The configuration is conditional on {@link
 * SystemUpdateCondition.NonBlockingSystemUpdateCondition}.
 */
@Configuration
@Conditional(SystemUpdateCondition.NonBlockingSystemUpdateCondition.class)
public class ReindexMLModelGroupConfig {

  /**
   * Creates the MLModelGroup reindex upgrade from the {@code systemUpdate.mlModelGroup.*}
   * properties.
   *
   * @param opContext operation context handed to the upgrade
   * @param entityService entity service handed to the upgrade
   * @param aspectDao aspect DAO handed to the upgrade
   * @param enabled value of {@code systemUpdate.mlModelGroup.enabled}
   * @param batchSize value of {@code systemUpdate.mlModelGroup.batchSize}
   * @param delayMs value of {@code systemUpdate.mlModelGroup.delayMs}
   * @param limit value of {@code systemUpdate.mlModelGroup.limit}
   * @return a new {@link ReindexMLModelGroup} built from the arguments above
   */
  @Bean
  public NonBlockingSystemUpgrade reindexMLModelGroup(
      final OperationContext opContext,
      final EntityService<?> entityService,
      final AspectDao aspectDao,
      @Value("${systemUpdate.mlModelGroup.enabled}") final boolean enabled,
      @Value("${systemUpdate.mlModelGroup.batchSize}") final Integer batchSize,
      @Value("${systemUpdate.mlModelGroup.delayMs}") final Integer delayMs,
      @Value("${systemUpdate.mlModelGroup.limit}") final Integer limit) {
    final NonBlockingSystemUpgrade upgrade =
        new ReindexMLModelGroup(
            opContext, entityService, aspectDao, enabled, batchSize, delayMs, limit);
    return upgrade;
  }
}

View File

@ -0,0 +1,46 @@
package com.linkedin.datahub.upgrade.system.restoreindices.mlmodel;
import com.google.common.collect.ImmutableList;
import com.linkedin.datahub.upgrade.UpgradeStep;
import com.linkedin.datahub.upgrade.system.NonBlockingSystemUpgrade;
import com.linkedin.metadata.entity.AspectDao;
import com.linkedin.metadata.entity.EntityService;
import io.datahubproject.metadata.context.OperationContext;
import java.util.List;
import javax.annotation.Nonnull;
import lombok.extern.slf4j.Slf4j;
/**
 * A job that reindexes all mlModel key aspects as part of fixing name field conflicts.
 *
 * <p>When constructed with {@code enabled == true} the upgrade holds a single {@link
 * ReindexMLModelStep}; otherwise it holds no steps at all.
 */
@Slf4j
public class ReindexMLModel implements NonBlockingSystemUpgrade {

  /** Immutable list of steps, fixed at construction time. */
  private final List<UpgradeStep> _steps;

  /**
   * @param opContext operation context passed to the step
   * @param entityService entity service passed to the step
   * @param aspectDao aspect DAO passed to the step
   * @param enabled whether the reindex step should be created
   * @param batchSize batch size passed to the step
   * @param batchDelayMs batch delay in milliseconds passed to the step
   * @param limit limit passed to the step
   */
  public ReindexMLModel(
      @Nonnull OperationContext opContext,
      EntityService<?> entityService,
      AspectDao aspectDao,
      boolean enabled,
      Integer batchSize,
      Integer batchDelayMs,
      Integer limit) {
    // The step is only instantiated on the enabled branch of the conditional.
    _steps =
        enabled
            ? ImmutableList.<UpgradeStep>of(
                new ReindexMLModelStep(
                    opContext, entityService, aspectDao, batchSize, batchDelayMs, limit))
            : ImmutableList.<UpgradeStep>of();
  }

  /** Returns the fully qualified runtime class name as the upgrade id. */
  @Override
  public String id() {
    return getClass().getName();
  }

  /** Returns the steps selected at construction time (possibly empty). */
  @Override
  public List<UpgradeStep> steps() {
    return _steps;
  }
}

View File

@ -0,0 +1,42 @@
package com.linkedin.datahub.upgrade.system.restoreindices.mlmodel;
import static com.linkedin.metadata.Constants.*;
import com.linkedin.datahub.upgrade.system.AbstractMCLStep;
import com.linkedin.metadata.entity.AspectDao;
import com.linkedin.metadata.entity.EntityService;
import io.datahubproject.metadata.context.OperationContext;
import javax.annotation.Nonnull;
import lombok.extern.slf4j.Slf4j;
import org.jetbrains.annotations.Nullable;
/**
 * {@link AbstractMCLStep} specialisation for the mlModel key aspect: it supplies the step id, the
 * aspect name and the URN pattern, and leaves everything else to the base class.
 */
@Slf4j
public class ReindexMLModelStep extends AbstractMCLStep {

  /** Identifier reported by {@link #id()}. */
  private static final String STEP_ID = "mlmodel-key-v1";

  /** Pattern reported by {@link #getUrnLike()}; ends with a {@code %} wildcard. */
  private static final String URN_LIKE = "urn:li:" + ML_MODEL_ENTITY_NAME + ":%";

  /** Forwards all arguments unchanged to the {@link AbstractMCLStep} constructor. */
  public ReindexMLModelStep(
      OperationContext opContext,
      EntityService<?> entityService,
      AspectDao aspectDao,
      Integer batchSize,
      Integer batchDelayMs,
      Integer limit) {
    super(opContext, entityService, aspectDao, batchSize, batchDelayMs, limit);
  }

  /** Returns the fixed id of this step. */
  @Override
  public String id() {
    return STEP_ID;
  }

  /** Returns the mlModel key aspect name. */
  @Override
  @Nonnull
  protected String getAspectName() {
    return ML_MODEL_KEY_ASPECT_NAME;
  }

  /** Returns the URN pattern built from the mlModel entity name. */
  @Override
  @Nullable
  protected String getUrnLike() {
    return URN_LIKE;
  }
}

View File

@ -0,0 +1,46 @@
package com.linkedin.datahub.upgrade.system.restoreindices.mlmodelgroup;
import com.google.common.collect.ImmutableList;
import com.linkedin.datahub.upgrade.UpgradeStep;
import com.linkedin.datahub.upgrade.system.NonBlockingSystemUpgrade;
import com.linkedin.metadata.entity.AspectDao;
import com.linkedin.metadata.entity.EntityService;
import io.datahubproject.metadata.context.OperationContext;
import java.util.List;
import javax.annotation.Nonnull;
import lombok.extern.slf4j.Slf4j;
/**
 * A job that reindexes all mlModelGroup key aspects as part of fixing name field conflicts.
 *
 * <p>When constructed with {@code enabled == true} the upgrade holds a single {@link
 * ReindexMLModelGroupStep}; otherwise it holds no steps at all.
 */
@Slf4j
public class ReindexMLModelGroup implements NonBlockingSystemUpgrade {

  /** Immutable list of steps, fixed at construction time. */
  private final List<UpgradeStep> _steps;

  /**
   * @param opContext operation context passed to the step
   * @param entityService entity service passed to the step
   * @param aspectDao aspect DAO passed to the step
   * @param enabled whether the reindex step should be created
   * @param batchSize batch size passed to the step
   * @param batchDelayMs batch delay in milliseconds passed to the step
   * @param limit limit passed to the step
   */
  public ReindexMLModelGroup(
      @Nonnull OperationContext opContext,
      EntityService<?> entityService,
      AspectDao aspectDao,
      boolean enabled,
      Integer batchSize,
      Integer batchDelayMs,
      Integer limit) {
    // The step is only instantiated on the enabled branch of the conditional.
    _steps =
        enabled
            ? ImmutableList.<UpgradeStep>of(
                new ReindexMLModelGroupStep(
                    opContext, entityService, aspectDao, batchSize, batchDelayMs, limit))
            : ImmutableList.<UpgradeStep>of();
  }

  /** Returns the fully qualified runtime class name as the upgrade id. */
  @Override
  public String id() {
    return getClass().getName();
  }

  /** Returns the steps selected at construction time (possibly empty). */
  @Override
  public List<UpgradeStep> steps() {
    return _steps;
  }
}

View File

@ -0,0 +1,42 @@
package com.linkedin.datahub.upgrade.system.restoreindices.mlmodelgroup;
import static com.linkedin.metadata.Constants.*;
import com.linkedin.datahub.upgrade.system.AbstractMCLStep;
import com.linkedin.metadata.entity.AspectDao;
import com.linkedin.metadata.entity.EntityService;
import io.datahubproject.metadata.context.OperationContext;
import javax.annotation.Nonnull;
import lombok.extern.slf4j.Slf4j;
import org.jetbrains.annotations.Nullable;
/**
 * {@link AbstractMCLStep} specialisation for the mlModelGroup key aspect: it supplies the step id,
 * the aspect name and the URN pattern, and leaves everything else to the base class.
 */
@Slf4j
public class ReindexMLModelGroupStep extends AbstractMCLStep {

  /** Identifier reported by {@link #id()}. */
  private static final String STEP_ID = "mlmodelgroup-key-v1";

  /** Pattern reported by {@link #getUrnLike()}; ends with a {@code %} wildcard. */
  private static final String URN_LIKE = "urn:li:" + ML_MODEL_GROUP_ENTITY_NAME + ":%";

  /** Forwards all arguments unchanged to the {@link AbstractMCLStep} constructor. */
  public ReindexMLModelGroupStep(
      OperationContext opContext,
      EntityService<?> entityService,
      AspectDao aspectDao,
      Integer batchSize,
      Integer batchDelayMs,
      Integer limit) {
    super(opContext, entityService, aspectDao, batchSize, batchDelayMs, limit);
  }

  /** Returns the fixed id of this step. */
  @Override
  public String id() {
    return STEP_ID;
  }

  /** Returns the mlModelGroup key aspect name. */
  @Override
  @Nonnull
  protected String getAspectName() {
    return ML_MODEL_GROUP_KEY_ASPECT_NAME;
  }

  /** Returns the URN pattern built from the mlModelGroup entity name. */
  @Override
  @Nullable
  protected String getUrnLike() {
    return URN_LIKE;
  }
}

View File

@ -31,6 +31,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
### Breaking Changes
- #15415: MLModel and MLModelGroup search field mapping has been updated to resolve duplicate field name conflicts. Existing entities will be automatically reindexed in the background after upgrade. New MLModel and MLModelGroup entities ingested after the upgrade will work immediately.
- #15397: Grafana ingestion source dataset granularity changed from per-datasource to per-panel (per visual). This improves lineage accuracy by ensuring each panel's query results in a unique dataset entity with precise upstream/downstream connections. Dataset URN format changed from `{ds_type}.{ds_uid}` to `{ds_type}.{ds_uid}.{dashboard_uid}.{panel_id}`. This means all existing Grafana dataset entities will have different URNs. If stateful ingestion is enabled, running ingestion with the latest CLI version will automatically clean up old entities and create new ones. Otherwise, we recommend soft deleting all Grafana datasets via the DataHub CLI: `datahub delete --platform grafana --soft` and then re-ingesting with the latest CLI version.
- #15005: `SqlParsingBuilder` is removed, use `SqlParsingAggregator` instead
- #14710: LookML ingestion source migrated to SDKv2 resulting in:

View File

@ -19,6 +19,7 @@ record MLModelGroupKey {
* Name of the MLModelGroup
*/
@Searchable = {
"fieldName": "id",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0,

View File

@ -19,6 +19,7 @@ record MLModelKey {
* Name of the MLModel
*/
@Searchable = {
"fieldName": "id",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0,

View File

@ -753,6 +753,16 @@ systemUpdate:
batchSize: ${BOOTSTRAP_SYSTEM_UPDATE_CHART_INFO_BATCH_SIZE:1000}
delayMs: ${BOOTSTRAP_SYSTEM_UPDATE_CHART_INFO_DELAY_MS:30000}
limit: ${BOOTSTRAP_SYSTEM_UPDATE_CHART_INFO_CLL_LIMIT:0}
# Settings for the MLModel key reindex upgrade; injected into ReindexMLModelConfig
# through the ${systemUpdate.mlModel.*} placeholders. Each value can be overridden
# by the named environment variable and otherwise falls back to the default after ':'.
# NOTE(review): the limit variable name contains "CLL", which looks carried over
# from another entry -- confirm this is the intended name.
mlModel:
enabled: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_ENABLED:true}
batchSize: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_BATCH_SIZE:1000}
delayMs: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_DELAY_MS:30000}
limit: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_CLL_LIMIT:0}
# Settings for the MLModelGroup key reindex upgrade; injected into
# ReindexMLModelGroupConfig through the ${systemUpdate.mlModelGroup.*} placeholders.
# Each value can be overridden by the named environment variable and otherwise
# falls back to the default after ':'.
# NOTE(review): the limit variable name contains "CLL", which looks carried over
# from another entry -- confirm this is the intended name.
mlModelGroup:
enabled: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_GROUP_ENABLED:true}
batchSize: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_GROUP_BATCH_SIZE:1000}
delayMs: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_GROUP_DELAY_MS:30000}
limit: ${BOOTSTRAP_SYSTEM_UPDATE_ML_MODEL_GROUP_CLL_LIMIT:0}
dashboardInfo:
enabled: ${BOOTSTRAP_SYSTEM_UPDATE_DASHBOARD_INFO_ENABLED:true}
batchSize: ${BOOTSTRAP_SYSTEM_UPDATE_DASHBOARD_INFO_BATCH_SIZE:1000}