refactor search index builder to store urn parts efficiently (#1937) (#1972)

* refactor search index builder to store urn parts efficiently (#1937)

Co-authored-by: Jyoti Wadhwani <jywadhwa@linkedin.com>

* set urn for all documents

* rebase, fix merge conflicts and modify tests

Co-authored-by: Jyoti Wadhwani <jywadhwa@linkedin.com>
This commit is contained in:
Jyoti Wadhwani 2020-10-29 09:52:41 -07:00 committed by GitHub
parent 32133cd6f2
commit 0c92a8e887
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 95 additions and 71 deletions

View File

@ -1,7 +1,6 @@
package com.linkedin.metadata.builders.search;
import com.linkedin.common.Ownership;
import com.linkedin.common.Status;
import com.linkedin.common.urn.DataProcessUrn;
import com.linkedin.data.template.RecordTemplate;
import com.linkedin.data.template.StringArray;
@ -42,7 +41,8 @@ public class DataProcessIndexBuilder extends BaseIndexBuilder<DataProcessDocumen
@Nonnull
private DataProcessDocument getDocumentToUpdateFromAspect(@Nonnull DataProcessUrn urn, @Nonnull Ownership ownership) {
final StringArray owners = BuilderUtils.getCorpUserOwners(ownership);
return setUrnDerivedFields(urn)
return new DataProcessDocument()
.setUrn(urn)
.setHasOwners(!owners.isEmpty())
.setOwners(owners);
}
@ -50,7 +50,7 @@ public class DataProcessIndexBuilder extends BaseIndexBuilder<DataProcessDocumen
@Nonnull
private DataProcessDocument getDocumentToUpdateFromAspect(@Nonnull DataProcessUrn urn,
@Nonnull DataProcessInfo dataProcessInfo) {
DataProcessDocument dataProcessDocument = setUrnDerivedFields(urn);
final DataProcessDocument dataProcessDocument = new DataProcessDocument().setUrn(urn);
if (dataProcessInfo.getInputs() != null) {
dataProcessDocument.setInputs(dataProcessInfo.getInputs())
.setNumInputDatasets(dataProcessInfo.getInputs().size());
@ -62,16 +62,10 @@ public class DataProcessIndexBuilder extends BaseIndexBuilder<DataProcessDocumen
return dataProcessDocument;
}
@Nonnull
private DataProcessDocument getDocumentToUpdateFromAspect(@Nonnull DataProcessUrn urn, @Nonnull Status status) {
return setUrnDerivedFields(urn)
.setRemoved(status.isRemoved());
}
@Nonnull
private List<DataProcessDocument> getDocumentsToUpdateFromSnapshotType(@Nonnull DataProcessSnapshot dataProcessSnapshot) {
DataProcessUrn urn = dataProcessSnapshot.getUrn();
return dataProcessSnapshot.getAspects().stream().map(aspect -> {
final DataProcessUrn urn = dataProcessSnapshot.getUrn();
final List<DataProcessDocument> documents = dataProcessSnapshot.getAspects().stream().map(aspect -> {
if (aspect.isDataProcessInfo()) {
return getDocumentToUpdateFromAspect(urn, aspect.getDataProcessInfo());
} else if (aspect.isOwnership()) {
@ -79,6 +73,8 @@ public class DataProcessIndexBuilder extends BaseIndexBuilder<DataProcessDocumen
}
return null;
}).filter(Objects::nonNull).collect(Collectors.toList());
documents.add(setUrnDerivedFields(urn));
return documents;
}
@Nullable

View File

@ -51,51 +51,54 @@ public class DatasetIndexBuilder extends BaseIndexBuilder<DatasetDocument> {
@Nonnull
private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn, @Nonnull Ownership ownership) {
final StringArray owners = BuilderUtils.getCorpUserOwners(ownership);
return setUrnDerivedFields(urn)
return new DatasetDocument()
.setUrn(urn)
.setHasOwners(!owners.isEmpty())
.setOwners(owners);
}
@Nonnull
private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn, @Nonnull Status status) {
return setUrnDerivedFields(urn)
return new DatasetDocument()
.setUrn(urn)
.setRemoved(status.isRemoved());
}
@Nonnull
private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn, @Nonnull DatasetDeprecation deprecation) {
return setUrnDerivedFields(urn).setDeprecated(deprecation.isDeprecated());
return new DatasetDocument().setUrn(urn).setDeprecated(deprecation.isDeprecated());
}
@Nonnull
private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn, @Nonnull DatasetProperties datasetProperties) {
final DatasetDocument doc = setUrnDerivedFields(urn);
if (datasetProperties.hasDescription()) {
final DatasetDocument doc = new DatasetDocument().setUrn(urn);
if (datasetProperties.getDescription() != null) {
doc.setDescription(datasetProperties.getDescription());
} else {
doc.setDescription("");
}
return doc;
}
@Nonnull
private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn, @Nonnull SchemaMetadata schemaMetadata) {
return setUrnDerivedFields(urn)
return new DatasetDocument()
.setUrn(urn)
.setHasSchema(true);
}
@Nonnull
private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn, @Nonnull UpstreamLineage upstreamLineage) {
return setUrnDerivedFields(urn)
.setUpstreams(new DatasetUrnArray(
upstreamLineage.getUpstreams().stream().map(upstream -> upstream.getDataset()).collect(Collectors.toList())
));
private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn,
@Nonnull UpstreamLineage upstreamLineage) {
return new DatasetDocument().setUrn(urn)
.setUpstreams(new DatasetUrnArray(upstreamLineage.getUpstreams()
.stream()
.map(upstream -> upstream.getDataset())
.collect(Collectors.toList())));
}
@Nonnull
private List<DatasetDocument> getDocumentsToUpdateFromSnapshotType(@Nonnull DatasetSnapshot datasetSnapshot) {
final DatasetUrn urn = datasetSnapshot.getUrn();
return datasetSnapshot.getAspects().stream().map(aspect -> {
final List<DatasetDocument> documents = datasetSnapshot.getAspects().stream().map(aspect -> {
if (aspect.isDatasetDeprecation()) {
return getDocumentToUpdateFromAspect(urn, aspect.getDatasetDeprecation());
} else if (aspect.isDatasetProperties()) {
@ -111,6 +114,8 @@ public class DatasetIndexBuilder extends BaseIndexBuilder<DatasetDocument> {
}
return null;
}).filter(Objects::nonNull).collect(Collectors.toList());
documents.add(setUrnDerivedFields(urn));
return documents;
}
@Override
@ -123,6 +128,7 @@ public class DatasetIndexBuilder extends BaseIndexBuilder<DatasetDocument> {
}
@Override
@Nonnull
public Class<DatasetDocument> getDocumentType() {
return DatasetDocument.class;
}

View File

@ -39,10 +39,10 @@ public class DataProcessIndexBuilderTest {
new DataProcessSnapshot().setUrn(dataProcessUrn).setAspects(dataProcessAspectArray);
List<DataProcessDocument> actualDocs = new DataProcessIndexBuilder().getDocumentsToUpdate(dataProcessSnapshot);
assertEquals(actualDocs.size(), 1);
assertEquals(actualDocs.get(0).getUrn(), dataProcessUrn);
assertEquals(actualDocs.size(), 2);
assertEquals(actualDocs.get(0).getInputs().get(0), inputDatasetUrn);
assertEquals(actualDocs.get(0).getOutputs().get(0), outputDatasetUrn);
assertEquals(actualDocs.get(0).getUrn(), dataProcessUrn);
assertEquals(actualDocs.get(1).getUrn(), dataProcessUrn);
}
}

View File

@ -29,7 +29,6 @@ import java.util.List;
import org.testng.annotations.Test;
import static org.assertj.core.api.Assertions.*;
import static org.testng.Assert.*;
public class DatasetIndexBuilderTest {
@ -42,18 +41,20 @@ public class DatasetIndexBuilderTest {
final DatasetProperties datasetProperties = new DatasetProperties().setDescription("baz");
final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn,
Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, datasetProperties)));
final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn)
final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn).setDescription("baz");
final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn)
.setBrowsePaths(new StringArray("/prod/foo/bar"))
.setOrigin(FabricType.PROD)
.setName("bar")
.setPlatform("foo")
.setDescription("baz");
.setPlatform("foo");
// when
final List<DatasetDocument> actualDocs = _indexBuilder.getDocumentsToUpdate(datasetSnapshot);
// then
assertThat(actualDocs).containsExactly(expectedDocument);
assertThat(actualDocs.size()).isEqualTo(2);
assertThat(actualDocs).contains(expectedDocument1);
assertThat(actualDocs).contains(expectedDocument2);
}
@Test
@ -63,18 +64,20 @@ public class DatasetIndexBuilderTest {
final DatasetProperties datasetProperties = new DatasetProperties();
final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn,
Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, datasetProperties)));
final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn)
final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn);
final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn)
.setBrowsePaths(new StringArray("/prod/foo/bar"))
.setOrigin(FabricType.PROD)
.setName("bar")
.setPlatform("foo")
.setDescription("");
.setPlatform("foo");
// when
final List<DatasetDocument> actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot);
// then
assertThat(actualDocs).containsExactly(expectedDocument);
assertThat(actualDocs.size()).isEqualTo(2);
assertThat(actualDocs).contains(expectedDocument1);
assertThat(actualDocs).contains(expectedDocument2);
}
@Test
@ -84,18 +87,20 @@ public class DatasetIndexBuilderTest {
final DatasetDeprecation datasetDeprecation = new DatasetDeprecation().setDeprecated(true);
final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn,
Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, datasetDeprecation)));
final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn)
final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn).setDeprecated(true);
final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn)
.setBrowsePaths(new StringArray("/prod/foo/bar"))
.setOrigin(FabricType.PROD)
.setName("bar")
.setPlatform("foo")
.setDeprecated(true);
.setPlatform("foo");
// when
final List<DatasetDocument> actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot);
// then
assertThat(actualDocs).containsExactly(expectedDocument);
assertThat(actualDocs.size()).isEqualTo(2);
assertThat(actualDocs).contains(expectedDocument1);
assertThat(actualDocs).contains(expectedDocument2);
}
@Test
@ -105,18 +110,20 @@ public class DatasetIndexBuilderTest {
final DatasetDeprecation datasetDeprecation = new DatasetDeprecation().setDeprecated(false);
final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn,
Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, datasetDeprecation)));
final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn)
final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn).setDeprecated(false);
final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn)
.setBrowsePaths(new StringArray("/prod/foo/bar"))
.setOrigin(FabricType.PROD)
.setName("bar")
.setPlatform("foo")
.setDeprecated(false);
.setPlatform("foo");
// when
final List<DatasetDocument> actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot);
// then
assertThat(actualDocs).containsExactly(expectedDocument);
assertThat(actualDocs.size()).isEqualTo(2);
assertThat(actualDocs).contains(expectedDocument1);
assertThat(actualDocs).contains(expectedDocument2);
}
@Test
@ -129,19 +136,21 @@ public class DatasetIndexBuilderTest {
final Ownership ownership = new Ownership().setOwners(new OwnerArray(owner));
final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn,
Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, ownership)));
final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn)
final DatasetDocument expectedDocument1 =
new DatasetDocument().setUrn(datasetUrn).setHasOwners(true).setOwners(new StringArray("testUser"));
final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn)
.setBrowsePaths(new StringArray("/prod/foo/bar"))
.setOrigin(FabricType.PROD)
.setName("bar")
.setPlatform("foo")
.setHasOwners(true)
.setOwners(new StringArray("testUser"));
.setPlatform("foo");
// when
final List<DatasetDocument> actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot);
// then
assertThat(actualDocs).containsExactly(expectedDocument);
assertThat(actualDocs.size()).isEqualTo(2);
assertThat(actualDocs).contains(expectedDocument1);
assertThat(actualDocs).contains(expectedDocument2);
}
@Test
@ -160,19 +169,22 @@ public class DatasetIndexBuilderTest {
final Ownership ownership = new Ownership().setOwners(new OwnerArray(owner1, owner2, owner3));
final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn,
Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, ownership)));
final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn)
final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn)
.setHasOwners(true)
.setOwners(new StringArray("testUser1", "testUser2"));
final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn)
.setBrowsePaths(new StringArray("/prod/foo/bar"))
.setOrigin(FabricType.PROD)
.setName("bar")
.setPlatform("foo")
.setHasOwners(true)
.setOwners(new StringArray("testUser1", "testUser2"));
.setPlatform("foo");
// when
final List<DatasetDocument> actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot);
// then
assertThat(actualDocs).containsExactly(expectedDocument);
assertThat(actualDocs.size()).isEqualTo(2);
assertThat(actualDocs).contains(expectedDocument1);
assertThat(actualDocs).contains(expectedDocument2);
}
@Test
@ -182,18 +194,20 @@ public class DatasetIndexBuilderTest {
final SchemaMetadata schemaMetadata = new SchemaMetadata();
final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn,
Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, schemaMetadata)));
final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn)
final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn).setHasSchema(true);
final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn)
.setBrowsePaths(new StringArray("/prod/foo/bar"))
.setOrigin(FabricType.PROD)
.setName("bar")
.setPlatform("foo")
.setHasSchema(true);
.setPlatform("foo");
// when
final List<DatasetDocument> actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot);
// then
assertThat(actualDocs).containsExactly(expectedDocument);
assertThat(actualDocs.size()).isEqualTo(2);
assertThat(actualDocs).contains(expectedDocument1);
assertThat(actualDocs).contains(expectedDocument2);
}
@Test
@ -203,18 +217,21 @@ public class DatasetIndexBuilderTest {
final Status status = new Status().setRemoved(true);
final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn,
Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, status)));
final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn)
final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn)
.setRemoved(true);
final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn)
.setBrowsePaths(new StringArray("/prod/foo/bar"))
.setOrigin(FabricType.PROD)
.setName("bar")
.setPlatform("foo")
.setRemoved(true);
.setPlatform("foo");
// when
final List<DatasetDocument> actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot);
// then
assertThat(actualDocs).containsExactly(expectedDocument);
assertThat(actualDocs.size()).isEqualTo(2);
assertThat(actualDocs).contains(expectedDocument1);
assertThat(actualDocs).contains(expectedDocument2);
}
@Test
@ -224,18 +241,20 @@ public class DatasetIndexBuilderTest {
final Status status = new Status().setRemoved(false);
final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn,
Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, status)));
final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn)
final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn).setRemoved(false);
final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn)
.setBrowsePaths(new StringArray("/prod/foo/bar"))
.setOrigin(FabricType.PROD)
.setName("bar")
.setPlatform("foo")
.setRemoved(false);
.setPlatform("foo");
// when
final List<DatasetDocument> actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot);
// then
assertThat(actualDocs).containsExactly(expectedDocument);
assertThat(actualDocs.size()).isEqualTo(2);
assertThat(actualDocs).contains(expectedDocument1);
assertThat(actualDocs).contains(expectedDocument2);
}
@Test
@ -249,17 +268,20 @@ public class DatasetIndexBuilderTest {
final UpstreamLineage upstreamLineage = new UpstreamLineage().setUpstreams(new UpstreamArray(upstream1, upstream2));
final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn,
Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, upstreamLineage)));
final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn)
final DatasetDocument expectedDocument1 =
new DatasetDocument().setUrn(datasetUrn).setUpstreams(new DatasetUrnArray(upstreamUrn1, upstreamUrn2));
final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn)
.setBrowsePaths(new StringArray("/prod/foo/bar"))
.setOrigin(FabricType.PROD)
.setName("bar")
.setPlatform("foo")
.setUpstreams(new DatasetUrnArray(upstreamUrn1, upstreamUrn2));
.setPlatform("foo");
// when
final List<DatasetDocument> actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot);
// then
assertThat(actualDocs).containsExactly(expectedDocument);
assertThat(actualDocs.size()).isEqualTo(2);
assertThat(actualDocs).contains(expectedDocument1);
assertThat(actualDocs).contains(expectedDocument2);
}
}