Mirror of https://github.com/datahub-project/datahub.git, synced 2025-11-27 02:46:01 +00:00
feat(lineage): add pdl annotations for upstreams (#14241)
Commit 000d7e164e (parent d022df4736)
@@ -103,7 +103,7 @@ dependencies {
   testImplementation testFixtures(project(':metadata-io'))
 
   constraints {
-    implementation(implementation externalDependency.parquetHadoop) {
+    implementation(externalDependency.parquetHadoop) {
       because("CVE-2022-42003")
     }
   }
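Note: this first hunk only removes a duplicated `implementation` keyword inside the dependency-constraints block; the CVE-2022-42003 constraint on parquetHadoop is otherwise unchanged.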
@@ -139,6 +139,25 @@ task run(type: Exec) {
     "-Dserver.port=8083", bootJar.getArchiveFile().get(), "-u", "SystemUpdate"
 }
 
+/**
+ * Runs the non-blocking system updates locally (includes lineage index fields backfill)
+ * Sets up environment variables for local execution
+ */
+task runNonBlocking(type: Exec) {
+  dependsOn bootJar
+  group = "Execution"
+  description = "Run the non-blocking system updates locally (includes lineage index fields backfill)."
+  environment "ENTITY_REGISTRY_CONFIG_PATH", "../metadata-models/src/main/resources/entity-registry.yml"
+  environment "ENABLE_STRUCTURED_PROPERTIES_SYSTEM_UPDATE", "true"
+  environment "ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX", "true"
+  environment "BOOTSTRAP_SYSTEM_UPDATE_EXECUTOR_POOLS_ENABLED", "false"
+  environment "SCHEMA_REGISTRY_TYPE", "INTERNAL"
+  commandLine "java",
+      "-agentlib:jdwp=transport=dt_socket,address=5003,server=y,suspend=n",
+      "-jar",
+      "-Dserver.port=8083", bootJar.getArchiveFile().get(), "-u", "SystemUpdateNonBlocking"
+}
+
 task runCron(type: Exec) {
   dependsOn bootJar
   group = "Execution"
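Usage note: the new task mirrors the existing run task but passes SystemUpdateNonBlocking to the boot jar, so only the non-blocking upgrades (including this lineage index fields backfill) execute. Assuming this build.gradle belongs to the datahub-upgrade module (the hunk header does not show the file path), it would be invoked as `./gradlew :datahub-upgrade:runNonBlocking`; the -agentlib:jdwp flag additionally opens a debug port on 5003.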
@@ -0,0 +1,29 @@
+package com.linkedin.datahub.upgrade.config;
+
+import com.linkedin.datahub.upgrade.system.NonBlockingSystemUpgrade;
+import com.linkedin.datahub.upgrade.system.lineage.BackfillDatasetLineageIndexFields;
+import com.linkedin.metadata.entity.EntityService;
+import com.linkedin.metadata.search.SearchService;
+import io.datahubproject.metadata.context.OperationContext;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Conditional;
+import org.springframework.context.annotation.Configuration;
+
+@Configuration
+@Conditional(SystemUpdateCondition.NonBlockingSystemUpdateCondition.class)
+public class BackfillDatasetLineageIndexFieldsConfig {
+
+  @Bean
+  public NonBlockingSystemUpgrade backfillDatasetLineageIndexFields(
+      final OperationContext opContext,
+      final EntityService<?> entityService,
+      final SearchService searchService,
+      @Value("${systemUpdate.lineageIndexFields.enabled:true}") final boolean enabled,
+      @Value("${systemUpdate.lineageIndexFields.reprocess.enabled:false}")
+          final boolean reprocessEnabled,
+      @Value("${systemUpdate.lineageIndexFields.batchSize:100}") final Integer batchSize) {
+    return new BackfillDatasetLineageIndexFields(
+        opContext, entityService, searchService, enabled, reprocessEnabled, batchSize);
+  }
+}
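Per the @Value defaults above, the backfill ships enabled (systemUpdate.lineageIndexFields.enabled defaults to true), reprocessing is off, and batches default to 100 documents; the @Conditional ensures the bean is only created when the process runs as a non-blocking system update.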
@@ -0,0 +1,49 @@
+package com.linkedin.datahub.upgrade.system.lineage;
+
+import com.google.common.collect.ImmutableList;
+import com.linkedin.datahub.upgrade.UpgradeStep;
+import com.linkedin.datahub.upgrade.system.NonBlockingSystemUpgrade;
+import com.linkedin.metadata.entity.EntityService;
+import com.linkedin.metadata.search.SearchService;
+import io.datahubproject.metadata.context.OperationContext;
+import java.util.List;
+
+/**
+ * Non-blocking system upgrade that backfills dataset lineage index fields.
+ *
+ * <p>This upgrade ensures that all datasets have the proper lineage-related fields in their search
+ * index for efficient querying and filtering: - hasUpstreams: indicates if the dataset has any
+ * upstream lineage - hasFineGrainedUpstreams: indicates if the dataset has fine-grained
+ * (column-level) lineage - fineGrainedUpstreams: list of schema field URNs that provide lineage to
+ * this dataset
+ */
+public class BackfillDatasetLineageIndexFields implements NonBlockingSystemUpgrade {
+  private final List<UpgradeStep> _steps;
+
+  public BackfillDatasetLineageIndexFields(
+      OperationContext opContext,
+      EntityService<?> entityService,
+      SearchService searchService,
+      boolean enabled,
+      boolean reprocessEnabled,
+      Integer batchSize) {
+    if (enabled) {
+      _steps =
+          ImmutableList.of(
+              new BackfillDatasetLineageIndexFieldsStep(
+                  opContext, entityService, searchService, reprocessEnabled, batchSize));
+    } else {
+      _steps = ImmutableList.of();
+    }
+  }
+
+  @Override
+  public String id() {
+    return "BackfillDatasetLineageIndexFields";
+  }
+
+  @Override
+  public List<UpgradeStep> steps() {
+    return _steps;
+  }
+}
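Note that when disabled the upgrade still registers, but with an empty step list, so a SystemUpdateNonBlocking run simply has nothing to execute for it.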
@@ -0,0 +1,245 @@
+package com.linkedin.datahub.upgrade.system.lineage;
+
+import static com.linkedin.metadata.Constants.*;
+import static com.linkedin.metadata.utils.CriterionUtils.buildIsNullCriterion;
+import static com.linkedin.metadata.utils.SystemMetadataUtils.createDefaultSystemMetadata;
+
+import com.google.common.collect.ImmutableList;
+import com.linkedin.common.AuditStamp;
+import com.linkedin.common.urn.Urn;
+import com.linkedin.common.urn.UrnUtils;
+import com.linkedin.data.DataMap;
+import com.linkedin.datahub.upgrade.UpgradeContext;
+import com.linkedin.datahub.upgrade.UpgradeStep;
+import com.linkedin.datahub.upgrade.UpgradeStepResult;
+import com.linkedin.datahub.upgrade.impl.DefaultUpgradeStepResult;
+import com.linkedin.dataset.UpstreamLineage;
+import com.linkedin.entity.EntityResponse;
+import com.linkedin.events.metadata.ChangeType;
+import com.linkedin.metadata.Constants;
+import com.linkedin.metadata.boot.BootstrapStep;
+import com.linkedin.metadata.entity.EntityService;
+import com.linkedin.metadata.query.filter.ConjunctiveCriterion;
+import com.linkedin.metadata.query.filter.ConjunctiveCriterionArray;
+import com.linkedin.metadata.query.filter.Criterion;
+import com.linkedin.metadata.query.filter.CriterionArray;
+import com.linkedin.metadata.query.filter.Filter;
+import com.linkedin.metadata.search.ScrollResult;
+import com.linkedin.metadata.search.SearchEntity;
+import com.linkedin.metadata.search.SearchService;
+import com.linkedin.upgrade.DataHubUpgradeState;
+import io.datahubproject.metadata.context.OperationContext;
+import java.net.URISyntaxException;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Optional;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+import java.util.function.Function;
+import lombok.extern.slf4j.Slf4j;
+import org.jetbrains.annotations.NotNull;
+
+/** This bootstrap step is responsible for backfilling dataset lineage index fields in ES */
+@Slf4j
+public class BackfillDatasetLineageIndexFieldsStep implements UpgradeStep {
+  private static final String UPGRADE_ID = "BackfillDatasetLineageIndexFieldsStep_V1";
+  private static final Urn UPGRADE_ID_URN = BootstrapStep.getUpgradeUrn(UPGRADE_ID);
+
+  private final OperationContext opContext;
+  private final boolean reprocessEnabled;
+  private final Integer batchSize;
+  private final EntityService<?> entityService;
+  private final SearchService _searchService;
+
+  public BackfillDatasetLineageIndexFieldsStep(
+      OperationContext opContext,
+      EntityService<?> entityService,
+      SearchService searchService,
+      boolean reprocessEnabled,
+      Integer batchSize) {
+    this.opContext = opContext;
+    this.entityService = entityService;
+    this._searchService = searchService;
+    this.reprocessEnabled = reprocessEnabled;
+    this.batchSize = batchSize;
+  }
+
+  @Override
+  public String id() {
+    return UPGRADE_ID;
+  }
+
+  @Override
+  public Function<UpgradeContext, UpgradeStepResult> executable() {
+    return (context) -> {
+      final AuditStamp auditStamp =
+          new AuditStamp()
+              .setActor(UrnUtils.getUrn(Constants.SYSTEM_ACTOR))
+              .setTime(System.currentTimeMillis());
+
+      String scrollId = null;
+      int migratedCount = 0;
+      do {
+        log.info(
+            "Backfilling lineage index fields for batch of datasets {}-{}",
+            migratedCount,
+            migratedCount + batchSize);
+        scrollId = backfillDatasetLineageFields(context, auditStamp, scrollId);
+        migratedCount += batchSize;
+      } while (scrollId != null);
+
+      BootstrapStep.setUpgradeResult(context.opContext(), UPGRADE_ID_URN, entityService);
+
+      return new DefaultUpgradeStepResult(id(), DataHubUpgradeState.SUCCEEDED);
+    };
+  }
+
+  /**
+   * Returns whether the upgrade should proceed if the step fails after exceeding the maximum
+   * retries.
+   */
+  @Override
+  public boolean isOptional() {
+    return true;
+  }
+
+  /**
+   * Returns whether the upgrade should be skipped. Uses previous run history or the environment
+   * variable to determine whether to skip.
+   */
+  @Override
+  public boolean skip(UpgradeContext context) {
+
+    if (reprocessEnabled) {
+      return false;
+    }
+
+    boolean previouslyRun =
+        entityService.exists(
+            context.opContext(), UPGRADE_ID_URN, DATA_HUB_UPGRADE_RESULT_ASPECT_NAME, true);
+    if (previouslyRun) {
+      log.info("{} was already run. Skipping.", id());
+    }
+    return previouslyRun;
+  }
+
+  private String backfillDatasetLineageFields(
+      UpgradeContext context, AuditStamp auditStamp, String scrollId) {
+
+    final Filter filter = backfillLineageFieldFilter();
+    final ScrollResult scrollResult =
+        _searchService.scrollAcrossEntities(
+            opContext.withSearchFlags(
+                flags ->
+                    flags
+                        .setFulltext(true)
+                        .setSkipCache(true)
+                        .setSkipHighlighting(true)
+                        .setSkipAggregates(true)),
+            ImmutableList.of(Constants.DATASET_ENTITY_NAME),
+            "*",
+            filter,
+            null,
+            scrollId,
+            null,
+            batchSize,
+            null);
+
+    if (scrollResult.getNumEntities() == 0 || scrollResult.getEntities().isEmpty()) {
+      return null;
+    }
+
+    List<Future<?>> futures = new LinkedList<>();
+    for (SearchEntity searchEntity : scrollResult.getEntities()) {
+      try {
+        restateUpstreamLineage(context, searchEntity.getEntity(), auditStamp)
+            .ifPresent(futures::add);
+      } catch (Exception e) {
+        // don't stop the whole step because of one bad urn or one bad ingestion
+        log.error("Error restating upstreamLineage aspect for urn {}", searchEntity.getEntity(), e);
+      }
+    }
+
+    futures.forEach(
+        f -> {
+          try {
+            f.get();
+          } catch (InterruptedException | ExecutionException e) {
+            throw new RuntimeException(e);
+          }
+        });
+
+    return scrollResult.getScrollId();
+  }
+
+  private Filter backfillLineageFieldFilter() {
+    // Condition: Does not have at least 1 of: `hasUpstreams`, `hasFineGrainedUpstreams`,
+    // `fineGrainedUpstreams`
+    ConjunctiveCriterionArray conjunctiveCriterionArray = new ConjunctiveCriterionArray();
+
+    conjunctiveCriterionArray.add(getCriterionForMissingField("hasUpstreams"));
+    conjunctiveCriterionArray.add(getCriterionForMissingField("hasFineGrainedUpstreams"));
+    conjunctiveCriterionArray.add(getCriterionForMissingField("fineGrainedUpstreams"));
+
+    Filter filter = new Filter();
+    filter.setOr(conjunctiveCriterionArray);
+    return filter;
+  }
+
+  private Optional<Future<?>> restateUpstreamLineage(
+      UpgradeContext context, Urn urn, AuditStamp auditStamp) {
+    EntityResponse entityResponse = null;
+    try {
+      entityResponse =
+          entityService.getEntityV2(
+              context.opContext(),
+              urn.getEntityType(),
+              urn,
+              Collections.singleton(UPSTREAM_LINEAGE_ASPECT_NAME));
+    } catch (URISyntaxException e) {
+      log.error(
+          "Error getting UpstreamLineage for entity with urn {} while restating lineage information",
+          urn,
+          e);
+    }
+
+    if (entityResponse != null
+        && entityResponse.getAspects().containsKey(UPSTREAM_LINEAGE_ASPECT_NAME)) {
+      final DataMap dataMap =
+          entityResponse.getAspects().get(UPSTREAM_LINEAGE_ASPECT_NAME).getValue().data();
+      final UpstreamLineage upstreamLineage = new UpstreamLineage(dataMap);
+
+      log.debug("Restating upstreamLineage for dataset urn {} with value {}", urn, upstreamLineage);
+      return Optional.of(
+          entityService
+              .alwaysProduceMCLAsync(
+                  context.opContext(),
+                  urn,
+                  urn.getEntityType(),
+                  UPSTREAM_LINEAGE_ASPECT_NAME,
+                  opContext.getEntityRegistry().getAspectSpecs().get(UPSTREAM_LINEAGE_ASPECT_NAME),
+                  null,
+                  upstreamLineage,
+                  null,
+                  createDefaultSystemMetadata(),
+                  auditStamp,
+                  ChangeType.RESTATE)
+              .getFirst());
+    }
+
+    return Optional.empty();
+  }
+
+  @NotNull
+  private static ConjunctiveCriterion getCriterionForMissingField(String field) {
+    final Criterion missingField = buildIsNullCriterion(field);
+
+    final CriterionArray criterionArray = new CriterionArray();
+    criterionArray.add(missingField);
+    final ConjunctiveCriterion conjunctiveCriterion = new ConjunctiveCriterion();
+    conjunctiveCriterion.setAnd(criterionArray);
+    return conjunctiveCriterion;
+  }
+}
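Two details worth calling out in this step: the scroll filter only selects documents still missing one of the three new fields, and alwaysProduceMCLAsync with ChangeType.RESTATE re-emits each dataset's current upstreamLineage unchanged as a metadata change log event, so the index processor rewrites the document with the new @Searchable fields without touching stored aspects. isOptional() returning true means a failure here does not abort the wider system update.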
@@ -0,0 +1,103 @@
+package com.linkedin.datahub.upgrade.system.lineage;
+
+import static org.mockito.Mockito.mock;
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertFalse;
+import static org.testng.Assert.assertTrue;
+
+import com.linkedin.metadata.entity.EntityService;
+import com.linkedin.metadata.search.SearchService;
+import io.datahubproject.metadata.context.OperationContext;
+import org.testng.annotations.Test;
+
+public class BackfillDatasetLineageIndexFieldsTest {
+
+  @Test
+  public void testBackfillJobInstantiation() {
+    // Mock dependencies
+    OperationContext opContext = mock(OperationContext.class);
+    EntityService<?> entityService = mock(EntityService.class);
+    SearchService searchService = mock(SearchService.class);
+
+    // Test that the backfill job can be instantiated
+    BackfillDatasetLineageIndexFields backfillJob =
+        new BackfillDatasetLineageIndexFields(
+            opContext,
+            entityService,
+            searchService,
+            true, // enabled
+            false, // reprocessEnabled
+            100 // batchSize
+            );
+
+    // Verify the job has steps when enabled
+    assertEquals(backfillJob.steps().size(), 1);
+    assertEquals(backfillJob.id(), "BackfillDatasetLineageIndexFields");
+  }
+
+  @Test
+  public void testBackfillJobDisabled() {
+    // Mock dependencies
+    OperationContext opContext = mock(OperationContext.class);
+    EntityService<?> entityService = mock(EntityService.class);
+    SearchService searchService = mock(SearchService.class);
+
+    // Test that the backfill job has no steps when disabled
+    BackfillDatasetLineageIndexFields backfillJob =
+        new BackfillDatasetLineageIndexFields(
+            opContext,
+            entityService,
+            searchService,
+            false, // enabled
+            false, // reprocessEnabled
+            100 // batchSize
+            );
+
+    // Verify the job has no steps when disabled
+    assertEquals(backfillJob.steps().size(), 0);
+    assertEquals(backfillJob.id(), "BackfillDatasetLineageIndexFields");
+  }
+
+  @Test
+  public void testBackfillStepInstantiation() {
+    // Mock dependencies
+    OperationContext opContext = mock(OperationContext.class);
+    EntityService<?> entityService = mock(EntityService.class);
+    SearchService searchService = mock(SearchService.class);
+
+    // Test that the backfill step can be instantiated
+    BackfillDatasetLineageIndexFieldsStep backfillStep =
+        new BackfillDatasetLineageIndexFieldsStep(
+            opContext,
+            entityService,
+            searchService,
+            false, // reprocessEnabled
+            100 // batchSize
+            );
+
+    // Verify the step properties
+    assertEquals(backfillStep.id(), "BackfillDatasetLineageIndexFieldsStep_V1");
+    assertTrue(backfillStep.isOptional());
+  }
+
+  @Test
+  public void testBackfillStepSkipLogic() {
+    // Mock dependencies
+    OperationContext opContext = mock(OperationContext.class);
+    EntityService<?> entityService = mock(EntityService.class);
+    SearchService searchService = mock(SearchService.class);
+
+    // Test with reprocessEnabled = true
+    BackfillDatasetLineageIndexFieldsStep backfillStep =
+        new BackfillDatasetLineageIndexFieldsStep(
+            opContext,
+            entityService,
+            searchService,
+            true, // reprocessEnabled
+            100 // batchSize
+            );
+
+    // When reprocessEnabled is true, skip should return false
+    assertFalse(backfillStep.skip(null));
+  }
+}
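These unit tests cover wiring only (step counts, ids, and the reprocess branch of skip). A hypothetical companion test, sketched below, could pin down the previously-run branch by stubbing EntityService.exists, which skip() calls as shown above; the Mockito when/matcher imports and the Urn and UpgradeContext imports are additions not present in the committed test class.

// Hypothetical addition to BackfillDatasetLineageIndexFieldsTest; assumes extra imports:
// com.linkedin.common.urn.Urn, com.linkedin.datahub.upgrade.UpgradeContext,
// static org.mockito.ArgumentMatchers.*, static org.mockito.Mockito.when
@Test
public void testBackfillStepSkipsWhenPreviouslyRun() {
  OperationContext opContext = mock(OperationContext.class);
  EntityService<?> entityService = mock(EntityService.class);
  SearchService searchService = mock(SearchService.class);
  UpgradeContext upgradeContext = mock(UpgradeContext.class);
  when(upgradeContext.opContext()).thenReturn(opContext);
  // Simulate the dataHubUpgradeResult marker recorded by an earlier run
  when(entityService.exists(any(), any(Urn.class), anyString(), anyBoolean())).thenReturn(true);

  BackfillDatasetLineageIndexFieldsStep step =
      new BackfillDatasetLineageIndexFieldsStep(
          opContext, entityService, searchService, false /* reprocessEnabled */, 100);

  // reprocessEnabled is false and the upgrade marker exists, so the step skips
  assertTrue(step.skip(upgradeContext));
}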
@@ -18,6 +18,14 @@ record FineGrainedLineage {
    */
 
   // The relationship name is kept same as that of existing relationship in Upstream.pdl
+  @Searchable = {
+    "/*": {
+      "fieldName": "fineGrainedUpstreams",
+      "fieldType": "URN",
+      "hasValuesFieldName": "hasFineGrainedUpstreams",
+      "queryByDefault": false
+    }
+  }
   upstreams: optional array[Urn]
 
   /**
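In the annotation above, the "/*" path applies the @Searchable settings to each element of the upstreams array, and hasValuesFieldName asks the index builder to maintain a companion boolean field, hasFineGrainedUpstreams, that is true whenever the array is non-empty. That boolean (together with hasUpstreams below) is exactly what the backfill step probes with buildIsNullCriterion.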
@@ -27,7 +35,7 @@ record FineGrainedLineage {
 
   /**
    * Downstream fields in the lineage
    */
   downstreams: optional array[Urn]
 
   /**
@@ -39,6 +39,7 @@ record Upstream {
   }
   @Searchable = {
     "fieldName": "upstreams",
+    "hasValuesFieldName": "hasUpstreams",
     "fieldType": "URN",
     "queryByDefault": false
   }
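Together, the two PDL hunks add the three index fields the backfill targets: fineGrainedUpstreams, hasFineGrainedUpstreams, and hasUpstreams. A minimal query-side sketch follows, reusing the Criterion/Filter classes from the backfill step above; the Condition enum and StringArray template are assumed to be the standard com.linkedin.metadata.query.filter.Condition and com.linkedin.data.template.StringArray used alongside them.

import com.google.common.collect.ImmutableList;
import com.linkedin.data.template.StringArray;
import com.linkedin.metadata.query.filter.Condition;
import com.linkedin.metadata.query.filter.ConjunctiveCriterion;
import com.linkedin.metadata.query.filter.ConjunctiveCriterionArray;
import com.linkedin.metadata.query.filter.Criterion;
import com.linkedin.metadata.query.filter.CriterionArray;
import com.linkedin.metadata.query.filter.Filter;

public final class HasUpstreamsFilterExample {

  /** Builds a Filter matching datasets whose index document has hasUpstreams == true. */
  public static Filter hasUpstreamsFilter() {
    Criterion hasUpstreams = new Criterion();
    hasUpstreams.setField("hasUpstreams");
    hasUpstreams.setCondition(Condition.EQUAL);
    hasUpstreams.setValues(new StringArray(ImmutableList.of("true")));

    // Filters are a disjunction of conjunctions; here, a single one-criterion AND clause
    CriterionArray criteria = new CriterionArray();
    criteria.add(hasUpstreams);
    ConjunctiveCriterion andClause = new ConjunctiveCriterion();
    andClause.setAnd(criteria);
    ConjunctiveCriterionArray orClauses = new ConjunctiveCriterionArray();
    orClauses.add(andClause);

    Filter filter = new Filter();
    filter.setOr(orClauses);
    return filter;
  }

  private HasUpstreamsFilterExample() {}
}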
New files:
  smoke-test/tests/search/__init__.py (0 lines)
  smoke-test/tests/search/test_lineage_search_index_fields.py (239 lines)
@@ -0,0 +1,239 @@
+import json
+import logging
+import os
+import tempfile
+from typing import Any, Dict, List, Optional
+
+import pytest
+import tenacity
+
+from tests.utils import delete_urns_from_file, ingest_file_via_rest
+
+logger = logging.getLogger(__name__)
+
+# Test constants
+UPSTREAM_DATASET_URN = "urn:li:dataset:(urn:li:dataPlatform:hive,upstream_table,PROD)"
+DOWNSTREAM_DATASET_URN = (
+    "urn:li:dataset:(urn:li:dataPlatform:hive,downstream_table,PROD)"
+)
+DATASET_WITHOUT_LINEAGE_URN = (
+    "urn:li:dataset:(urn:li:dataPlatform:hive,no_lineage_table,PROD)"
+)
+
+
+def create_upstream_dataset_mcp_data() -> Dict[str, Any]:
+    """Create the MCP data for the upstream dataset."""
+    return {
+        "entityType": "dataset",
+        "entityUrn": UPSTREAM_DATASET_URN,
+        "changeType": "UPSERT",
+        "aspectName": "datasetProperties",
+        "aspect": {
+            "json": {
+                "name": "upstream_table",
+                "description": "Upstream dataset for lineage testing",
+                "customProperties": {"platform": "hive", "env": "PROD"},
+            }
+        },
+    }
+
+
+def create_downstream_dataset_with_lineage_mcp_data() -> Dict[str, Any]:
+    """Create the MCP data for the downstream dataset with upstream lineage."""
+    return {
+        "entityType": "dataset",
+        "entityUrn": DOWNSTREAM_DATASET_URN,
+        "changeType": "UPSERT",
+        "aspectName": "datasetProperties",
+        "aspect": {
+            "json": {
+                "name": "downstream_table",
+                "description": "Downstream dataset with lineage",
+                "customProperties": {"platform": "hive", "env": "PROD"},
+            }
+        },
+    }
+
+
+def create_upstream_lineage_mcp_data() -> Dict[str, Any]:
+    """Create the MCP data for upstream lineage aspect."""
+    return {
+        "entityType": "dataset",
+        "entityUrn": DOWNSTREAM_DATASET_URN,
+        "changeType": "UPSERT",
+        "aspectName": "upstreamLineage",
+        "aspect": {
+            "json": {
+                "upstreams": [
+                    {
+                        "dataset": UPSTREAM_DATASET_URN,
+                        "type": "TRANSFORMED",
+                        "auditStamp": {
+                            "time": 1640995200000,
+                            "actor": "urn:li:corpuser:datahub",
+                        },
+                    }
+                ]
+            }
+        },
+    }
+
+
+def create_dataset_without_lineage_mcp_data() -> Dict[str, Any]:
+    """Create the MCP data for a dataset without lineage."""
+    return {
+        "entityType": "dataset",
+        "entityUrn": DATASET_WITHOUT_LINEAGE_URN,
+        "changeType": "UPSERT",
+        "aspectName": "datasetProperties",
+        "aspect": {
+            "json": {
+                "name": "no_lineage_table",
+                "description": "Dataset without lineage",
+                "customProperties": {"platform": "hive", "env": "PROD"},
+            }
+        },
+    }
+
+
+@tenacity.retry(
+    stop=tenacity.stop_after_attempt(5),
+    wait=tenacity.wait_exponential(multiplier=1, min=2, max=10),
+    retry=tenacity.retry_if_exception_type(AssertionError),
+)
+def verify_search_index_fields_via_openapi(
+    auth_session,
+    dataset_urn: str,
+    expected_has_upstreams: bool,
+    expected_has_fine_grained_upstreams: bool,
+    expected_fine_grained_upstreams: Optional[List[str]] = None,
+) -> Dict[str, Any]:
+    """Verify search index fields using the OpenAPI endpoint."""
+    logger.info(
+        f"Checking search index fields for {dataset_urn} via OpenAPI endpoint..."
+    )
+
+    try:
+        # Use the OpenAPI endpoint to get raw Elasticsearch document
+        openapi_url = (
+            f"{auth_session.gms_url()}/openapi/operations/elasticSearch/entity/raw"
+        )
+        response = auth_session.post(
+            openapi_url,
+            json=[dataset_urn],
+            headers={"Content-Type": "application/json"},
+        )
+        response.raise_for_status()
+
+        raw_documents = response.json()
+        logger.info(f"Raw documents response: {raw_documents}")
+
+        if dataset_urn not in raw_documents:
+            raise AssertionError(f"Dataset {dataset_urn} not found in raw documents")
+
+        document = raw_documents[dataset_urn]
+        logger.info(f"Raw document for {dataset_urn}: {document}")
+
+        # Check hasUpstreams field
+        has_upstreams = document.get("hasUpstreams", False)
+        logger.info(f"hasUpstreams field: {has_upstreams}")
+        assert has_upstreams == expected_has_upstreams, (
+            f"Expected hasUpstreams={expected_has_upstreams}, got {has_upstreams}"
+        )
+
+        # Check hasFineGrainedUpstreams field
+        has_fine_grained_upstreams = document.get("hasFineGrainedUpstreams", False)
+        logger.info(f"hasFineGrainedUpstreams field: {has_fine_grained_upstreams}")
+        assert has_fine_grained_upstreams == expected_has_fine_grained_upstreams, (
+            f"Expected hasFineGrainedUpstreams={expected_has_fine_grained_upstreams}, got {has_fine_grained_upstreams}"
+        )
+
+        # Check fineGrainedUpstreams field if expected
+        if expected_fine_grained_upstreams is not None:
+            fine_grained_upstreams = document.get("fineGrainedUpstreams", [])
+            logger.info(f"fineGrainedUpstreams field: {fine_grained_upstreams}")
+            assert set(fine_grained_upstreams) == set(
+                expected_fine_grained_upstreams
+            ), (
+                f"Expected fineGrainedUpstreams={expected_fine_grained_upstreams}, got {fine_grained_upstreams}"
+            )
+
+        logger.info(f"Search index field verification successful for {dataset_urn}")
+        return {
+            "urn": dataset_urn,
+            "hasUpstreams": has_upstreams,
+            "hasFineGrainedUpstreams": has_fine_grained_upstreams,
+            "fineGrainedUpstreams": document.get("fineGrainedUpstreams", []),
+        }
+
+    except Exception as e:
+        logger.error(f"Could not verify search index fields via OpenAPI: {e}")
+        raise
+
+
+@pytest.fixture(scope="module", autouse=True)
+def ingest_cleanup_data(auth_session, graph_client, request):
+    """Fixture to ingest test data and clean up after tests."""
+    # Create temporary file for MCP data
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        mcp_data = [
+            create_upstream_dataset_mcp_data(),
+            create_downstream_dataset_with_lineage_mcp_data(),
+            create_upstream_lineage_mcp_data(),
+            create_dataset_without_lineage_mcp_data(),
+        ]
+        json.dump(mcp_data, f, indent=2)
+        temp_file_path = f.name
+
+    try:
+        logger.info("Ingesting lineage test data")
+        ingest_file_via_rest(auth_session, temp_file_path)
+        yield
+        logger.info("Removing lineage test data")
+        delete_urns_from_file(graph_client, temp_file_path)
+    finally:
+        # Clean up temporary file
+        if os.path.exists(temp_file_path):
+            os.unlink(temp_file_path)
+
+
+def test_lineage_search_index_fields_with_lineage(auth_session):
+    """
+    Test that verifies search index fields are correctly populated for a dataset with lineage.
+    """
+    # Verify that the downstream dataset has the correct search index fields
+    verify_search_index_fields_via_openapi(
+        auth_session,
+        dataset_urn=DOWNSTREAM_DATASET_URN,
+        expected_has_upstreams=True,
+        expected_has_fine_grained_upstreams=False,
+        expected_fine_grained_upstreams=[],  # No fine-grained lineage in this test
+    )
+
+
+def test_lineage_search_index_fields_without_lineage(auth_session):
+    """
+    Test that verifies search index fields are correctly populated for a dataset without lineage.
+    """
+    # Verify that the dataset without lineage has the correct search index fields
+    verify_search_index_fields_via_openapi(
+        auth_session,
+        dataset_urn=DATASET_WITHOUT_LINEAGE_URN,
+        expected_has_upstreams=False,
+        expected_has_fine_grained_upstreams=False,
+        expected_fine_grained_upstreams=[],
+    )
+
+
+def test_upstream_dataset_search_index_fields(auth_session):
+    """
+    Test that verifies search index fields for the upstream dataset (should not have upstreams).
+    """
+    # Verify that the upstream dataset has the correct search index fields
+    verify_search_index_fields_via_openapi(
+        auth_session,
+        dataset_urn=UPSTREAM_DATASET_URN,
+        expected_has_upstreams=False,
+        expected_has_fine_grained_upstreams=False,
+        expected_fine_grained_upstreams=[],
+    )
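The tenacity decorator retries the whole verification up to five times with exponential backoff (2 to 10 seconds) on AssertionError, giving the index processor time to apply the restated documents before an assertion failure is treated as real.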