diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java index 2417d9619d..9ac7778154 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java @@ -55,7 +55,7 @@ public class SearchableAnnotation { COUNT, DATETIME, OBJECT, - CONTAINER_PATH + BROWSE_PATH_V2 } @Nonnull diff --git a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java index 135043a23d..1ab5ff640c 100644 --- a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java +++ b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java @@ -89,7 +89,7 @@ public class EntitySpecBuilderTest { // Assert on Aspect Specs final Map aspectSpecMap = testEntitySpec.getAspectSpecMap(); - assertEquals(4, aspectSpecMap.size()); + assertEquals(5, aspectSpecMap.size()); assertTrue(aspectSpecMap.containsKey("testEntityKey")); assertTrue(aspectSpecMap.containsKey("testBrowsePaths")); assertTrue(aspectSpecMap.containsKey("testEntityInfo")); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java index 875f3e4dcb..555acb2ffd 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java @@ -121,7 +121,15 @@ public class MappingsBuilder { ANALYZER, SLASH_PATTERN_ANALYZER))); mappingForField.put(ANALYZER, BROWSE_PATH_HIERARCHY_ANALYZER); mappingForField.put(FIELDDATA, true); - } else if (fieldType == FieldType.URN || fieldType == FieldType.URN_PARTIAL) { + } else if (fieldType == FieldType.BROWSE_PATH_V2) { + mappingForField.put(TYPE, TEXT); + mappingForField.put(FIELDS, + ImmutableMap.of(LENGTH, ImmutableMap.of( + TYPE, TOKEN_COUNT, + ANALYZER, UNIT_SEPARATOR_PATTERN_ANALYZER))); + mappingForField.put(ANALYZER, BROWSE_PATH_V2_HIERARCHY_ANALYZER); + mappingForField.put(FIELDDATA, true); + } else if (fieldType == FieldType.URN || fieldType == FieldType.URN_PARTIAL) { mappingForField.put(TYPE, TEXT); mappingForField.put(ANALYZER, URN_ANALYZER); mappingForField.put(SEARCH_ANALYZER, URN_SEARCH_ANALYZER); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java index 171be34242..e76189dd22 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java @@ -54,9 +54,11 @@ public class SettingsBuilder { // Analyzers public static final String BROWSE_PATH_HIERARCHY_ANALYZER = "browse_path_hierarchy"; + public static final String BROWSE_PATH_V2_HIERARCHY_ANALYZER = "browse_path_v2_hierarchy"; public static final String KEYWORD_LOWERCASE_ANALYZER = "custom_keyword"; public static final String PARTIAL_ANALYZER = "partial"; public static final String SLASH_PATTERN_ANALYZER = "slash_pattern"; + public static final String UNIT_SEPARATOR_PATTERN_ANALYZER = "unit_separator_pattern"; public static final String TEXT_ANALYZER = "word_delimited"; public static final String TEXT_SEARCH_ANALYZER = "query_word_delimited"; public static final String KEYWORD_ANALYZER = "keyword"; @@ -102,6 +104,7 @@ public class SettingsBuilder { public static final String MAIN_TOKENIZER = "main_tokenizer"; public static final String PATH_HIERARCHY_TOKENIZER = "path_hierarchy"; public static final String SLASH_TOKENIZER = "slash_tokenizer"; + public static final String UNIT_SEPARATOR_TOKENIZER = "unit_separator_tokenizer"; // Do not remove the space, needed for multi-term synonyms public static final List ALPHANUM_SPACE_PATTERNS = ImmutableList.of( "([a-z0-9 _-]{2,})", @@ -283,6 +286,13 @@ public class SettingsBuilder { .put(PATTERN, "[/]") .build()); + + tokenizers.put(UNIT_SEPARATOR_TOKENIZER, + ImmutableMap.builder() + .put(TYPE, PATTERN) + .put(PATTERN, "[␟]") + .build()); + // Tokenize by whitespace and most special chars tokenizers.put(MAIN_TOKENIZER, ImmutableMap.builder() @@ -313,11 +323,22 @@ public class SettingsBuilder { .put(FILTER, ImmutableList.of(LOWERCASE)) .build()); + // Analyzer for splitting by unit-separator (used to get depth of browsePathV2) + analyzers.put(UNIT_SEPARATOR_PATTERN_ANALYZER, ImmutableMap.builder() + .put(TOKENIZER, UNIT_SEPARATOR_TOKENIZER) + .put(FILTER, ImmutableList.of(LOWERCASE)) + .build()); + // Analyzer for matching browse path analyzers.put(BROWSE_PATH_HIERARCHY_ANALYZER, ImmutableMap.builder() .put(TOKENIZER, PATH_HIERARCHY_TOKENIZER) .build()); + // Analyzer for matching browse path v2 + analyzers.put(BROWSE_PATH_V2_HIERARCHY_ANALYZER, ImmutableMap.builder() + .put(TOKENIZER, PATH_HIERARCHY_TOKENIZER) + .build()); + // Analyzer for case-insensitive exact matching - Only used when building queries analyzers.put(KEYWORD_LOWERCASE_ANALYZER, ImmutableMap.builder() .put(TOKENIZER, KEYWORD_TOKENIZER) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java index 2b94784427..fb18b5fc21 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java @@ -11,6 +11,7 @@ import javax.annotation.Nonnull; import java.util.Set; import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_HIERARCHY_ANALYZER; +import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_V2_HIERARCHY_ANALYZER; import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_ANALYZER; import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER; import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER; @@ -43,6 +44,10 @@ public class SearchFieldConfig { Set.of( SearchableAnnotation.FieldType.BROWSE_PATH ); + private static final Set TYPES_WITH_BROWSE_PATH_V2 = + Set.of( + SearchableAnnotation.FieldType.BROWSE_PATH_V2 + ); private static final Set TYPES_WITH_BASE_KEYWORD = Set.of( SearchableAnnotation.FieldType.TEXT, @@ -125,6 +130,8 @@ public class SearchFieldConfig { // order is important if (TYPES_WITH_BROWSE_PATH.contains(fieldType)) { return BROWSE_PATH_HIERARCHY_ANALYZER; + } else if (TYPES_WITH_BROWSE_PATH_V2.contains(fieldType)) { + return BROWSE_PATH_V2_HIERARCHY_ANALYZER; // sub fields } else if (isKeyword(fieldName)) { return KEYWORD_ANALYZER; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java index 4824d9edec..76f4736f27 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java @@ -13,6 +13,8 @@ import com.linkedin.metadata.models.SearchScoreFieldSpec; import com.linkedin.metadata.models.SearchableFieldSpec; import com.linkedin.metadata.models.annotation.SearchableAnnotation.FieldType; import com.linkedin.metadata.models.extractor.FieldExtractor; + +import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Optional; @@ -21,6 +23,8 @@ import java.util.stream.Collectors; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import javax.annotation.Nonnull; + /** * Class that provides a utility function that transforms the snapshot object into a search document @@ -38,6 +42,8 @@ public class SearchDocumentTransformer { // Maximum customProperties value length private final int maxValueLength; + private static final String BROWSE_PATH_V2_DELIMITER = "␟"; + public Optional transformSnapshot(final RecordTemplate snapshot, final EntitySpec entitySpec, final Boolean forDelete) { final Map> extractedSearchableFields = @@ -123,10 +129,15 @@ public class SearchDocumentTransformer { } if (isArray || (valueType == DataSchema.Type.MAP && fieldType != FieldType.OBJECT)) { - ArrayNode arrayNode = JsonNodeFactory.instance.arrayNode(); - fieldValues.subList(0, Math.min(fieldValues.size(), maxArrayLength)) - .forEach(value -> getNodeForValue(valueType, value, fieldType).ifPresent(arrayNode::add)); - searchDocument.set(fieldName, arrayNode); + if (fieldType == FieldType.BROWSE_PATH_V2) { + String browsePathV2Value = getBrowsePathV2Value(fieldValues); + searchDocument.set(fieldName, JsonNodeFactory.instance.textNode(browsePathV2Value)); + } else { + ArrayNode arrayNode = JsonNodeFactory.instance.arrayNode(); + fieldValues.subList(0, Math.min(fieldValues.size(), maxArrayLength)) + .forEach(value -> getNodeForValue(valueType, value, fieldType).ifPresent(arrayNode::add)); + searchDocument.set(fieldName, arrayNode); + } } else if (valueType == DataSchema.Type.MAP) { ObjectNode dictDoc = JsonNodeFactory.instance.objectNode(); fieldValues.subList(0, Math.min(fieldValues.size(), maxObjectKeys)).forEach(fieldValue -> { @@ -197,4 +208,24 @@ public class SearchDocumentTransformer { : Optional.of(JsonNodeFactory.instance.textNode(fieldValue.toString())); } } + + /** + * The browsePathsV2 aspect is a list of objects and the @Searchable annotation specifies a + * list of strings that we receive. However, we want to aggregate those strings and store + * as a single string in ElasticSearch so we can do prefix matching against it. + */ + private String getBrowsePathV2Value(@Nonnull final List fieldValues) { + List stringValues = new ArrayList<>(); + fieldValues.subList(0, Math.min(fieldValues.size(), maxArrayLength)).forEach(value -> { + if (value instanceof String) { + stringValues.add((String) value); + } + }); + String aggregatedValue = String.join(BROWSE_PATH_V2_DELIMITER, stringValues); + // ensure browse path v2 starts with our delimiter if it's not empty + if (!aggregatedValue.equals("") && !aggregatedValue.startsWith(BROWSE_PATH_V2_DELIMITER)) { + aggregatedValue = BROWSE_PATH_V2_DELIMITER + aggregatedValue; + } + return aggregatedValue; + } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/TestEntityUtil.java b/metadata-io/src/test/java/com/linkedin/metadata/TestEntityUtil.java index 52375f197b..4b1b8c89b0 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/TestEntityUtil.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/TestEntityUtil.java @@ -1,6 +1,9 @@ package com.linkedin.metadata; import com.datahub.test.TestBrowsePaths; +import com.datahub.test.TestBrowsePathsV2; +import com.datahub.test.BrowsePathEntry; +import com.datahub.test.BrowsePathEntryArray; import com.datahub.test.KeyPartEnum; import com.datahub.test.SearchFeatures; import com.datahub.test.SimpleNestedRecord1; @@ -53,12 +56,18 @@ public class TestEntityUtil { snapshot.setUrn(urn); TestBrowsePaths browsePaths = new TestBrowsePaths().setPaths(new StringArray(ImmutableList.of("/a/b/c", "d/e/f"))); + BrowsePathEntryArray browsePathV2Entries = new BrowsePathEntryArray(); + BrowsePathEntry entry1 = new BrowsePathEntry().setId("levelOne"); + BrowsePathEntry entry2 = new BrowsePathEntry().setId("levelTwo"); + browsePathV2Entries.add(entry1); + browsePathV2Entries.add(entry2); + TestBrowsePathsV2 browsePathsV2 = new TestBrowsePathsV2().setPath(browsePathV2Entries); SearchFeatures searchFeatures = new SearchFeatures().setFeature1(2).setFeature2(1); TestEntityAspectArray aspects = new TestEntityAspectArray( ImmutableList.of(TestEntityAspect.create(getTestEntityKey(urn)), TestEntityAspect.create(getTestEntityInfo(urn)), TestEntityAspect.create(browsePaths), - TestEntityAspect.create(searchFeatures))); + TestEntityAspect.create(searchFeatures), TestEntityAspect.create(browsePathsV2))); snapshot.setAspects(aspects); return snapshot; } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java index 58c5f1ed70..ed72b46e98 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java @@ -16,7 +16,7 @@ public class MappingsBuilderTest { Map result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec()); assertEquals(result.size(), 1); Map properties = (Map) result.get("properties"); - assertEquals(properties.size(), 16); + assertEquals(properties.size(), 17); assertEquals(properties.get("urn"), ImmutableMap.of("type", "keyword", "fields", ImmutableMap.of("delimited", @@ -27,6 +27,7 @@ public class MappingsBuilderTest { "analyzer", "partial_urn_component")))); assertEquals(properties.get("runId"), ImmutableMap.of("type", "keyword")); assertTrue(properties.containsKey("browsePaths")); + assertTrue(properties.containsKey("browsePathV2")); // KEYWORD Map keyPart3Field = (Map) properties.get("keyPart3"); assertEquals(keyPart3Field.get("type"), "keyword"); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java index a6e6a3231b..69303ae4ee 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java @@ -1,6 +1,7 @@ package com.linkedin.metadata.search.transformer; import com.datahub.test.TestEntitySnapshot; +import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.JsonNodeFactory; @@ -52,6 +53,8 @@ public class SearchDocumentTransformerTest { assertEquals(browsePaths.get(1).asText(), "d/e/f"); assertEquals(parsedJson.get("feature1").asInt(), 2); assertEquals(parsedJson.get("feature2").asInt(), 1); + JsonNode browsePathV2 = (JsonNode) parsedJson.get("browsePathV2"); + assertEquals(browsePathV2.asText(), "␟levelOne␟levelTwo"); } @Test diff --git a/metadata-models/src/main/pegasus/com/linkedin/common/BrowsePathsV2.pdl b/metadata-models/src/main/pegasus/com/linkedin/common/BrowsePathsV2.pdl index 0aee36b01e..e213fba065 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/common/BrowsePathsV2.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/common/BrowsePathsV2.pdl @@ -15,5 +15,11 @@ record BrowsePathsV2 { * This is stored in elasticsearch as unit-separator delimited strings and only includes platform specific folders or containers. * These paths should not include high level info captured elsewhere ie. Platform and Environment. */ + @Searchable = { + "/*/id": { + "fieldName": "browsePathV2", + "fieldType": "BROWSE_PATH_V2" + } + } path: array[BrowsePathEntry] } diff --git a/test-models/src/main/pegasus/com/datahub/test/BrowsePathEntry.pdl b/test-models/src/main/pegasus/com/datahub/test/BrowsePathEntry.pdl new file mode 100644 index 0000000000..6204d0b9e7 --- /dev/null +++ b/test-models/src/main/pegasus/com/datahub/test/BrowsePathEntry.pdl @@ -0,0 +1,19 @@ +namespace com.datahub.test + +import com.linkedin.common.Urn + +/** + * Represents a single level in an entity's browse path + */ +record BrowsePathEntry { + /** + * The ID of the browse path entry. This is what gets stored in the index after URL encoding. + * If there's an urn associated with this entry, id and urn will be the same + */ + id: string + + /** + * Optional urn pointing to some entity in DataHub + */ + urn: optional Urn +} diff --git a/test-models/src/main/pegasus/com/datahub/test/TestBrowsePathsV2.pdl b/test-models/src/main/pegasus/com/datahub/test/TestBrowsePathsV2.pdl new file mode 100644 index 0000000000..1172c39b74 --- /dev/null +++ b/test-models/src/main/pegasus/com/datahub/test/TestBrowsePathsV2.pdl @@ -0,0 +1,23 @@ +namespace com.datahub.test + +/** + * Shared aspect containing Browse Paths V2 to be indexed for an entity. + */ +@Aspect = { + "name": "testBrowsePathsV2" +} +record TestBrowsePathsV2 { + /** + * A valid browse path for the entity. This field is provided by DataHub by default. + * + * Browse paths V2 are stored in elasticsearch as unit-separator delimited strings and only include platform specific folders or containers. + * These paths should not include high level info captured elsewhere ie. Platform and Environment. + */ + @Searchable = { + "/*/id": { + "fieldName": "browsePathV2", + "fieldType": "BROWSE_PATH_V2", + } + } + path: array[BrowsePathEntry] +} diff --git a/test-models/src/main/pegasus/com/datahub/test/TestEntityAspect.pdl b/test-models/src/main/pegasus/com/datahub/test/TestEntityAspect.pdl index bea9784636..aff922cb11 100644 --- a/test-models/src/main/pegasus/com/datahub/test/TestEntityAspect.pdl +++ b/test-models/src/main/pegasus/com/datahub/test/TestEntityAspect.pdl @@ -7,5 +7,6 @@ typeref TestEntityAspect = union[ TestEntityKey, TestEntityInfo, TestBrowsePaths, - SearchFeatures + SearchFeatures, + TestBrowsePathsV2 ] \ No newline at end of file