From 0c92a8e88783d943c1fbd0b2d67bcf44aabee7df Mon Sep 17 00:00:00 2001 From: Jyoti Wadhwani Date: Thu, 29 Oct 2020 09:52:41 -0700 Subject: [PATCH] refactor search index builder to store urn parts efficiently (#1937) (#1972) * refactor search index builder to store urn parts efficiently (#1937) Co-authored-by: Jyoti Wadhwani * set urn for all documents * rebase, fix merge conflicts and modify tests Co-authored-by: Jyoti Wadhwani --- .../search/DataProcessIndexBuilder.java | 18 ++- .../builders/search/DatasetIndexBuilder.java | 34 +++--- .../search/DataProcessIndexBuilderTest.java | 6 +- .../search/DatasetIndexBuilderTest.java | 108 +++++++++++------- 4 files changed, 95 insertions(+), 71 deletions(-) diff --git a/metadata-builders/src/main/java/com/linkedin/metadata/builders/search/DataProcessIndexBuilder.java b/metadata-builders/src/main/java/com/linkedin/metadata/builders/search/DataProcessIndexBuilder.java index a2c33d602a..7612abfa71 100644 --- a/metadata-builders/src/main/java/com/linkedin/metadata/builders/search/DataProcessIndexBuilder.java +++ b/metadata-builders/src/main/java/com/linkedin/metadata/builders/search/DataProcessIndexBuilder.java @@ -1,7 +1,6 @@ package com.linkedin.metadata.builders.search; import com.linkedin.common.Ownership; -import com.linkedin.common.Status; import com.linkedin.common.urn.DataProcessUrn; import com.linkedin.data.template.RecordTemplate; import com.linkedin.data.template.StringArray; @@ -42,7 +41,8 @@ public class DataProcessIndexBuilder extends BaseIndexBuilder getDocumentsToUpdateFromSnapshotType(@Nonnull DataProcessSnapshot dataProcessSnapshot) { - DataProcessUrn urn = dataProcessSnapshot.getUrn(); - return dataProcessSnapshot.getAspects().stream().map(aspect -> { + final DataProcessUrn urn = dataProcessSnapshot.getUrn(); + final List documents = dataProcessSnapshot.getAspects().stream().map(aspect -> { if (aspect.isDataProcessInfo()) { return getDocumentToUpdateFromAspect(urn, aspect.getDataProcessInfo()); } else if (aspect.isOwnership()) { @@ -79,6 +73,8 @@ public class DataProcessIndexBuilder extends BaseIndexBuilder { @Nonnull private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn, @Nonnull Ownership ownership) { final StringArray owners = BuilderUtils.getCorpUserOwners(ownership); - return setUrnDerivedFields(urn) + return new DatasetDocument() + .setUrn(urn) .setHasOwners(!owners.isEmpty()) .setOwners(owners); } @Nonnull private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn, @Nonnull Status status) { - return setUrnDerivedFields(urn) + return new DatasetDocument() + .setUrn(urn) .setRemoved(status.isRemoved()); } @Nonnull private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn, @Nonnull DatasetDeprecation deprecation) { - return setUrnDerivedFields(urn).setDeprecated(deprecation.isDeprecated()); + return new DatasetDocument().setUrn(urn).setDeprecated(deprecation.isDeprecated()); } @Nonnull private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn, @Nonnull DatasetProperties datasetProperties) { - final DatasetDocument doc = setUrnDerivedFields(urn); - if (datasetProperties.hasDescription()) { + final DatasetDocument doc = new DatasetDocument().setUrn(urn); + if (datasetProperties.getDescription() != null) { doc.setDescription(datasetProperties.getDescription()); - } else { - doc.setDescription(""); } return doc; } @Nonnull private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn, @Nonnull SchemaMetadata schemaMetadata) { - return setUrnDerivedFields(urn) + return new DatasetDocument() + .setUrn(urn) .setHasSchema(true); } @Nonnull - private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn, @Nonnull UpstreamLineage upstreamLineage) { - return setUrnDerivedFields(urn) - .setUpstreams(new DatasetUrnArray( - upstreamLineage.getUpstreams().stream().map(upstream -> upstream.getDataset()).collect(Collectors.toList()) - )); + private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn, + @Nonnull UpstreamLineage upstreamLineage) { + return new DatasetDocument().setUrn(urn) + .setUpstreams(new DatasetUrnArray(upstreamLineage.getUpstreams() + .stream() + .map(upstream -> upstream.getDataset()) + .collect(Collectors.toList()))); } @Nonnull private List getDocumentsToUpdateFromSnapshotType(@Nonnull DatasetSnapshot datasetSnapshot) { final DatasetUrn urn = datasetSnapshot.getUrn(); - return datasetSnapshot.getAspects().stream().map(aspect -> { + final List documents = datasetSnapshot.getAspects().stream().map(aspect -> { if (aspect.isDatasetDeprecation()) { return getDocumentToUpdateFromAspect(urn, aspect.getDatasetDeprecation()); } else if (aspect.isDatasetProperties()) { @@ -111,6 +114,8 @@ public class DatasetIndexBuilder extends BaseIndexBuilder { } return null; }).filter(Objects::nonNull).collect(Collectors.toList()); + documents.add(setUrnDerivedFields(urn)); + return documents; } @Override @@ -123,6 +128,7 @@ public class DatasetIndexBuilder extends BaseIndexBuilder { } @Override + @Nonnull public Class getDocumentType() { return DatasetDocument.class; } diff --git a/metadata-builders/src/test/java/com/linkedin/metadata/builders/search/DataProcessIndexBuilderTest.java b/metadata-builders/src/test/java/com/linkedin/metadata/builders/search/DataProcessIndexBuilderTest.java index e84374cf06..f8ca80b9c7 100644 --- a/metadata-builders/src/test/java/com/linkedin/metadata/builders/search/DataProcessIndexBuilderTest.java +++ b/metadata-builders/src/test/java/com/linkedin/metadata/builders/search/DataProcessIndexBuilderTest.java @@ -39,10 +39,10 @@ public class DataProcessIndexBuilderTest { new DataProcessSnapshot().setUrn(dataProcessUrn).setAspects(dataProcessAspectArray); List actualDocs = new DataProcessIndexBuilder().getDocumentsToUpdate(dataProcessSnapshot); - assertEquals(actualDocs.size(), 1); - assertEquals(actualDocs.get(0).getUrn(), dataProcessUrn); + assertEquals(actualDocs.size(), 2); assertEquals(actualDocs.get(0).getInputs().get(0), inputDatasetUrn); assertEquals(actualDocs.get(0).getOutputs().get(0), outputDatasetUrn); - + assertEquals(actualDocs.get(0).getUrn(), dataProcessUrn); + assertEquals(actualDocs.get(1).getUrn(), dataProcessUrn); } } diff --git a/metadata-builders/src/test/java/com/linkedin/metadata/builders/search/DatasetIndexBuilderTest.java b/metadata-builders/src/test/java/com/linkedin/metadata/builders/search/DatasetIndexBuilderTest.java index ddf3168f0d..bebc865430 100644 --- a/metadata-builders/src/test/java/com/linkedin/metadata/builders/search/DatasetIndexBuilderTest.java +++ b/metadata-builders/src/test/java/com/linkedin/metadata/builders/search/DatasetIndexBuilderTest.java @@ -29,7 +29,6 @@ import java.util.List; import org.testng.annotations.Test; import static org.assertj.core.api.Assertions.*; -import static org.testng.Assert.*; public class DatasetIndexBuilderTest { @@ -42,18 +41,20 @@ public class DatasetIndexBuilderTest { final DatasetProperties datasetProperties = new DatasetProperties().setDescription("baz"); final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn, Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, datasetProperties))); - final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn) + final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn).setDescription("baz"); + final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn) .setBrowsePaths(new StringArray("/prod/foo/bar")) .setOrigin(FabricType.PROD) .setName("bar") - .setPlatform("foo") - .setDescription("baz"); + .setPlatform("foo"); // when final List actualDocs = _indexBuilder.getDocumentsToUpdate(datasetSnapshot); // then - assertThat(actualDocs).containsExactly(expectedDocument); + assertThat(actualDocs.size()).isEqualTo(2); + assertThat(actualDocs).contains(expectedDocument1); + assertThat(actualDocs).contains(expectedDocument2); } @Test @@ -63,18 +64,20 @@ public class DatasetIndexBuilderTest { final DatasetProperties datasetProperties = new DatasetProperties(); final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn, Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, datasetProperties))); - final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn) + final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn); + final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn) .setBrowsePaths(new StringArray("/prod/foo/bar")) .setOrigin(FabricType.PROD) .setName("bar") - .setPlatform("foo") - .setDescription(""); + .setPlatform("foo"); // when final List actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot); // then - assertThat(actualDocs).containsExactly(expectedDocument); + assertThat(actualDocs.size()).isEqualTo(2); + assertThat(actualDocs).contains(expectedDocument1); + assertThat(actualDocs).contains(expectedDocument2); } @Test @@ -84,18 +87,20 @@ public class DatasetIndexBuilderTest { final DatasetDeprecation datasetDeprecation = new DatasetDeprecation().setDeprecated(true); final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn, Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, datasetDeprecation))); - final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn) + final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn).setDeprecated(true); + final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn) .setBrowsePaths(new StringArray("/prod/foo/bar")) .setOrigin(FabricType.PROD) .setName("bar") - .setPlatform("foo") - .setDeprecated(true); + .setPlatform("foo"); // when final List actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot); // then - assertThat(actualDocs).containsExactly(expectedDocument); + assertThat(actualDocs.size()).isEqualTo(2); + assertThat(actualDocs).contains(expectedDocument1); + assertThat(actualDocs).contains(expectedDocument2); } @Test @@ -105,18 +110,20 @@ public class DatasetIndexBuilderTest { final DatasetDeprecation datasetDeprecation = new DatasetDeprecation().setDeprecated(false); final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn, Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, datasetDeprecation))); - final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn) + final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn).setDeprecated(false); + final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn) .setBrowsePaths(new StringArray("/prod/foo/bar")) .setOrigin(FabricType.PROD) .setName("bar") - .setPlatform("foo") - .setDeprecated(false); + .setPlatform("foo"); // when final List actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot); // then - assertThat(actualDocs).containsExactly(expectedDocument); + assertThat(actualDocs.size()).isEqualTo(2); + assertThat(actualDocs).contains(expectedDocument1); + assertThat(actualDocs).contains(expectedDocument2); } @Test @@ -129,19 +136,21 @@ public class DatasetIndexBuilderTest { final Ownership ownership = new Ownership().setOwners(new OwnerArray(owner)); final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn, Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, ownership))); - final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn) + final DatasetDocument expectedDocument1 = + new DatasetDocument().setUrn(datasetUrn).setHasOwners(true).setOwners(new StringArray("testUser")); + final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn) .setBrowsePaths(new StringArray("/prod/foo/bar")) .setOrigin(FabricType.PROD) .setName("bar") - .setPlatform("foo") - .setHasOwners(true) - .setOwners(new StringArray("testUser")); + .setPlatform("foo"); // when final List actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot); // then - assertThat(actualDocs).containsExactly(expectedDocument); + assertThat(actualDocs.size()).isEqualTo(2); + assertThat(actualDocs).contains(expectedDocument1); + assertThat(actualDocs).contains(expectedDocument2); } @Test @@ -160,19 +169,22 @@ public class DatasetIndexBuilderTest { final Ownership ownership = new Ownership().setOwners(new OwnerArray(owner1, owner2, owner3)); final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn, Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, ownership))); - final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn) + final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn) + .setHasOwners(true) + .setOwners(new StringArray("testUser1", "testUser2")); + final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn) .setBrowsePaths(new StringArray("/prod/foo/bar")) .setOrigin(FabricType.PROD) .setName("bar") - .setPlatform("foo") - .setHasOwners(true) - .setOwners(new StringArray("testUser1", "testUser2")); + .setPlatform("foo"); // when final List actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot); // then - assertThat(actualDocs).containsExactly(expectedDocument); + assertThat(actualDocs.size()).isEqualTo(2); + assertThat(actualDocs).contains(expectedDocument1); + assertThat(actualDocs).contains(expectedDocument2); } @Test @@ -182,18 +194,20 @@ public class DatasetIndexBuilderTest { final SchemaMetadata schemaMetadata = new SchemaMetadata(); final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn, Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, schemaMetadata))); - final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn) + final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn).setHasSchema(true); + final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn) .setBrowsePaths(new StringArray("/prod/foo/bar")) .setOrigin(FabricType.PROD) .setName("bar") - .setPlatform("foo") - .setHasSchema(true); + .setPlatform("foo"); // when final List actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot); // then - assertThat(actualDocs).containsExactly(expectedDocument); + assertThat(actualDocs.size()).isEqualTo(2); + assertThat(actualDocs).contains(expectedDocument1); + assertThat(actualDocs).contains(expectedDocument2); } @Test @@ -203,18 +217,21 @@ public class DatasetIndexBuilderTest { final Status status = new Status().setRemoved(true); final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn, Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, status))); - final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn) + final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn) + .setRemoved(true); + final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn) .setBrowsePaths(new StringArray("/prod/foo/bar")) .setOrigin(FabricType.PROD) .setName("bar") - .setPlatform("foo") - .setRemoved(true); + .setPlatform("foo"); // when final List actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot); // then - assertThat(actualDocs).containsExactly(expectedDocument); + assertThat(actualDocs.size()).isEqualTo(2); + assertThat(actualDocs).contains(expectedDocument1); + assertThat(actualDocs).contains(expectedDocument2); } @Test @@ -224,18 +241,20 @@ public class DatasetIndexBuilderTest { final Status status = new Status().setRemoved(false); final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn, Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, status))); - final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn) + final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn).setRemoved(false); + final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn) .setBrowsePaths(new StringArray("/prod/foo/bar")) .setOrigin(FabricType.PROD) .setName("bar") - .setPlatform("foo") - .setRemoved(false); + .setPlatform("foo"); // when final List actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot); // then - assertThat(actualDocs).containsExactly(expectedDocument); + assertThat(actualDocs.size()).isEqualTo(2); + assertThat(actualDocs).contains(expectedDocument1); + assertThat(actualDocs).contains(expectedDocument2); } @Test @@ -249,17 +268,20 @@ public class DatasetIndexBuilderTest { final UpstreamLineage upstreamLineage = new UpstreamLineage().setUpstreams(new UpstreamArray(upstream1, upstream2)); final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn, Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, upstreamLineage))); - final DatasetDocument expectedDocument = new DatasetDocument().setUrn(datasetUrn) + final DatasetDocument expectedDocument1 = + new DatasetDocument().setUrn(datasetUrn).setUpstreams(new DatasetUrnArray(upstreamUrn1, upstreamUrn2)); + final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn) .setBrowsePaths(new StringArray("/prod/foo/bar")) .setOrigin(FabricType.PROD) .setName("bar") - .setPlatform("foo") - .setUpstreams(new DatasetUrnArray(upstreamUrn1, upstreamUrn2)); + .setPlatform("foo"); // when final List actualDocs = new DatasetIndexBuilder().getDocumentsToUpdate(datasetSnapshot); // then - assertThat(actualDocs).containsExactly(expectedDocument); + assertThat(actualDocs.size()).isEqualTo(2); + assertThat(actualDocs).contains(expectedDocument1); + assertThat(actualDocs).contains(expectedDocument2); } } \ No newline at end of file