feat(es) Store and map containerPath to elastic search properly (#7898)

This commit is contained in:
Chris Collins 2023-05-05 10:49:23 -04:00 committed by GitHub
parent 27c7c40002
commit 37db8c635e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 139 additions and 10 deletions

View File

@ -55,7 +55,7 @@ public class SearchableAnnotation {
COUNT, COUNT,
DATETIME, DATETIME,
OBJECT, OBJECT,
CONTAINER_PATH BROWSE_PATH_V2
} }
@Nonnull @Nonnull

View File

@ -89,7 +89,7 @@ public class EntitySpecBuilderTest {
// Assert on Aspect Specs // Assert on Aspect Specs
final Map<String, AspectSpec> aspectSpecMap = testEntitySpec.getAspectSpecMap(); final Map<String, AspectSpec> aspectSpecMap = testEntitySpec.getAspectSpecMap();
assertEquals(4, aspectSpecMap.size()); assertEquals(5, aspectSpecMap.size());
assertTrue(aspectSpecMap.containsKey("testEntityKey")); assertTrue(aspectSpecMap.containsKey("testEntityKey"));
assertTrue(aspectSpecMap.containsKey("testBrowsePaths")); assertTrue(aspectSpecMap.containsKey("testBrowsePaths"));
assertTrue(aspectSpecMap.containsKey("testEntityInfo")); assertTrue(aspectSpecMap.containsKey("testEntityInfo"));

View File

@ -121,6 +121,14 @@ public class MappingsBuilder {
ANALYZER, SLASH_PATTERN_ANALYZER))); ANALYZER, SLASH_PATTERN_ANALYZER)));
mappingForField.put(ANALYZER, BROWSE_PATH_HIERARCHY_ANALYZER); mappingForField.put(ANALYZER, BROWSE_PATH_HIERARCHY_ANALYZER);
mappingForField.put(FIELDDATA, true); mappingForField.put(FIELDDATA, true);
} else if (fieldType == FieldType.BROWSE_PATH_V2) {
mappingForField.put(TYPE, TEXT);
mappingForField.put(FIELDS,
ImmutableMap.of(LENGTH, ImmutableMap.of(
TYPE, TOKEN_COUNT,
ANALYZER, UNIT_SEPARATOR_PATTERN_ANALYZER)));
mappingForField.put(ANALYZER, BROWSE_PATH_V2_HIERARCHY_ANALYZER);
mappingForField.put(FIELDDATA, true);
} else if (fieldType == FieldType.URN || fieldType == FieldType.URN_PARTIAL) { } else if (fieldType == FieldType.URN || fieldType == FieldType.URN_PARTIAL) {
mappingForField.put(TYPE, TEXT); mappingForField.put(TYPE, TEXT);
mappingForField.put(ANALYZER, URN_ANALYZER); mappingForField.put(ANALYZER, URN_ANALYZER);

View File

@ -54,9 +54,11 @@ public class SettingsBuilder {
// Analyzers // Analyzers
public static final String BROWSE_PATH_HIERARCHY_ANALYZER = "browse_path_hierarchy"; public static final String BROWSE_PATH_HIERARCHY_ANALYZER = "browse_path_hierarchy";
public static final String BROWSE_PATH_V2_HIERARCHY_ANALYZER = "browse_path_v2_hierarchy";
public static final String KEYWORD_LOWERCASE_ANALYZER = "custom_keyword"; public static final String KEYWORD_LOWERCASE_ANALYZER = "custom_keyword";
public static final String PARTIAL_ANALYZER = "partial"; public static final String PARTIAL_ANALYZER = "partial";
public static final String SLASH_PATTERN_ANALYZER = "slash_pattern"; public static final String SLASH_PATTERN_ANALYZER = "slash_pattern";
public static final String UNIT_SEPARATOR_PATTERN_ANALYZER = "unit_separator_pattern";
public static final String TEXT_ANALYZER = "word_delimited"; public static final String TEXT_ANALYZER = "word_delimited";
public static final String TEXT_SEARCH_ANALYZER = "query_word_delimited"; public static final String TEXT_SEARCH_ANALYZER = "query_word_delimited";
public static final String KEYWORD_ANALYZER = "keyword"; public static final String KEYWORD_ANALYZER = "keyword";
@ -102,6 +104,7 @@ public class SettingsBuilder {
public static final String MAIN_TOKENIZER = "main_tokenizer"; public static final String MAIN_TOKENIZER = "main_tokenizer";
public static final String PATH_HIERARCHY_TOKENIZER = "path_hierarchy"; public static final String PATH_HIERARCHY_TOKENIZER = "path_hierarchy";
public static final String SLASH_TOKENIZER = "slash_tokenizer"; public static final String SLASH_TOKENIZER = "slash_tokenizer";
public static final String UNIT_SEPARATOR_TOKENIZER = "unit_separator_tokenizer";
// Do not remove the space, needed for multi-term synonyms // Do not remove the space, needed for multi-term synonyms
public static final List<String> ALPHANUM_SPACE_PATTERNS = ImmutableList.of( public static final List<String> ALPHANUM_SPACE_PATTERNS = ImmutableList.of(
"([a-z0-9 _-]{2,})", "([a-z0-9 _-]{2,})",
@ -283,6 +286,13 @@ public class SettingsBuilder {
.put(PATTERN, "[/]") .put(PATTERN, "[/]")
.build()); .build());
tokenizers.put(UNIT_SEPARATOR_TOKENIZER,
ImmutableMap.<String, Object>builder()
.put(TYPE, PATTERN)
.put(PATTERN, "[␟]")
.build());
// Tokenize by whitespace and most special chars // Tokenize by whitespace and most special chars
tokenizers.put(MAIN_TOKENIZER, tokenizers.put(MAIN_TOKENIZER,
ImmutableMap.<String, Object>builder() ImmutableMap.<String, Object>builder()
@ -313,11 +323,22 @@ public class SettingsBuilder {
.put(FILTER, ImmutableList.of(LOWERCASE)) .put(FILTER, ImmutableList.of(LOWERCASE))
.build()); .build());
// Analyzer for splitting by unit-separator (used to get depth of browsePathV2)
analyzers.put(UNIT_SEPARATOR_PATTERN_ANALYZER, ImmutableMap.<String, Object>builder()
.put(TOKENIZER, UNIT_SEPARATOR_TOKENIZER)
.put(FILTER, ImmutableList.of(LOWERCASE))
.build());
// Analyzer for matching browse path // Analyzer for matching browse path
analyzers.put(BROWSE_PATH_HIERARCHY_ANALYZER, ImmutableMap.<String, Object>builder() analyzers.put(BROWSE_PATH_HIERARCHY_ANALYZER, ImmutableMap.<String, Object>builder()
.put(TOKENIZER, PATH_HIERARCHY_TOKENIZER) .put(TOKENIZER, PATH_HIERARCHY_TOKENIZER)
.build()); .build());
// Analyzer for matching browse path v2
// NOTE(review): this registers the v2 analyzer with the same PATH_HIERARCHY_TOKENIZER used by the
// v1 browse path analyzer. The tokenizer's delimiter is not visible here; browse path v2 values are
// unit-separator delimited, so confirm the tokenizer actually splits on that character.
analyzers.put(BROWSE_PATH_V2_HIERARCHY_ANALYZER, ImmutableMap.<String, Object>builder()
.put(TOKENIZER, PATH_HIERARCHY_TOKENIZER)
.build());
// Analyzer for case-insensitive exact matching - Only used when building queries // Analyzer for case-insensitive exact matching - Only used when building queries
analyzers.put(KEYWORD_LOWERCASE_ANALYZER, ImmutableMap.<String, Object>builder() analyzers.put(KEYWORD_LOWERCASE_ANALYZER, ImmutableMap.<String, Object>builder()
.put(TOKENIZER, KEYWORD_TOKENIZER) .put(TOKENIZER, KEYWORD_TOKENIZER)

View File

@ -11,6 +11,7 @@ import javax.annotation.Nonnull;
import java.util.Set; import java.util.Set;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_HIERARCHY_ANALYZER; import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_HIERARCHY_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_V2_HIERARCHY_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_ANALYZER; import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER; import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER; import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER;
@ -43,6 +44,10 @@ public class SearchFieldConfig {
Set.of( Set.of(
SearchableAnnotation.FieldType.BROWSE_PATH SearchableAnnotation.FieldType.BROWSE_PATH
); );
private static final Set<SearchableAnnotation.FieldType> TYPES_WITH_BROWSE_PATH_V2 =
Set.of(
SearchableAnnotation.FieldType.BROWSE_PATH_V2
);
private static final Set<SearchableAnnotation.FieldType> TYPES_WITH_BASE_KEYWORD = private static final Set<SearchableAnnotation.FieldType> TYPES_WITH_BASE_KEYWORD =
Set.of( Set.of(
SearchableAnnotation.FieldType.TEXT, SearchableAnnotation.FieldType.TEXT,
@ -125,6 +130,8 @@ public class SearchFieldConfig {
// order is important // order is important
if (TYPES_WITH_BROWSE_PATH.contains(fieldType)) { if (TYPES_WITH_BROWSE_PATH.contains(fieldType)) {
return BROWSE_PATH_HIERARCHY_ANALYZER; return BROWSE_PATH_HIERARCHY_ANALYZER;
} else if (TYPES_WITH_BROWSE_PATH_V2.contains(fieldType)) {
return BROWSE_PATH_V2_HIERARCHY_ANALYZER;
// sub fields // sub fields
} else if (isKeyword(fieldName)) { } else if (isKeyword(fieldName)) {
return KEYWORD_ANALYZER; return KEYWORD_ANALYZER;

View File

@ -13,6 +13,8 @@ import com.linkedin.metadata.models.SearchScoreFieldSpec;
import com.linkedin.metadata.models.SearchableFieldSpec; import com.linkedin.metadata.models.SearchableFieldSpec;
import com.linkedin.metadata.models.annotation.SearchableAnnotation.FieldType; import com.linkedin.metadata.models.annotation.SearchableAnnotation.FieldType;
import com.linkedin.metadata.models.extractor.FieldExtractor; import com.linkedin.metadata.models.extractor.FieldExtractor;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Optional; import java.util.Optional;
@ -21,6 +23,8 @@ import java.util.stream.Collectors;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import javax.annotation.Nonnull;
/** /**
* Class that provides a utility function that transforms the snapshot object into a search document * Class that provides a utility function that transforms the snapshot object into a search document
@ -38,6 +42,8 @@ public class SearchDocumentTransformer {
// Maximum customProperties value length // Maximum customProperties value length
private final int maxValueLength; private final int maxValueLength;
// Unit-separator symbol (U+241F) written before every browse path v2 segment. It must be the same
// character the unit-separator tokenizer splits on; an empty delimiter would fuse all segments
// into one token and make the leading-delimiter check in getBrowsePathV2Value a no-op.
private static final String BROWSE_PATH_V2_DELIMITER = "␟";
public Optional<String> transformSnapshot(final RecordTemplate snapshot, final EntitySpec entitySpec, public Optional<String> transformSnapshot(final RecordTemplate snapshot, final EntitySpec entitySpec,
final Boolean forDelete) { final Boolean forDelete) {
final Map<SearchableFieldSpec, List<Object>> extractedSearchableFields = final Map<SearchableFieldSpec, List<Object>> extractedSearchableFields =
@ -123,10 +129,15 @@ public class SearchDocumentTransformer {
} }
if (isArray || (valueType == DataSchema.Type.MAP && fieldType != FieldType.OBJECT)) { if (isArray || (valueType == DataSchema.Type.MAP && fieldType != FieldType.OBJECT)) {
if (fieldType == FieldType.BROWSE_PATH_V2) {
String browsePathV2Value = getBrowsePathV2Value(fieldValues);
searchDocument.set(fieldName, JsonNodeFactory.instance.textNode(browsePathV2Value));
} else {
ArrayNode arrayNode = JsonNodeFactory.instance.arrayNode(); ArrayNode arrayNode = JsonNodeFactory.instance.arrayNode();
fieldValues.subList(0, Math.min(fieldValues.size(), maxArrayLength)) fieldValues.subList(0, Math.min(fieldValues.size(), maxArrayLength))
.forEach(value -> getNodeForValue(valueType, value, fieldType).ifPresent(arrayNode::add)); .forEach(value -> getNodeForValue(valueType, value, fieldType).ifPresent(arrayNode::add));
searchDocument.set(fieldName, arrayNode); searchDocument.set(fieldName, arrayNode);
}
} else if (valueType == DataSchema.Type.MAP) { } else if (valueType == DataSchema.Type.MAP) {
ObjectNode dictDoc = JsonNodeFactory.instance.objectNode(); ObjectNode dictDoc = JsonNodeFactory.instance.objectNode();
fieldValues.subList(0, Math.min(fieldValues.size(), maxObjectKeys)).forEach(fieldValue -> { fieldValues.subList(0, Math.min(fieldValues.size(), maxObjectKeys)).forEach(fieldValue -> {
@ -197,4 +208,24 @@ public class SearchDocumentTransformer {
: Optional.of(JsonNodeFactory.instance.textNode(fieldValue.toString())); : Optional.of(JsonNodeFactory.instance.textNode(fieldValue.toString()));
} }
} }
/**
 * Flattens the values extracted for a browsePathsV2 field into one delimited string.
 *
 * <p>The aspect stores a list of entries and the @Searchable annotation hands us the matching list
 * of strings. They are indexed in ElasticSearch as a single string so that prefix matching can be
 * done against the whole path.
 *
 * @param fieldValues values extracted for the field; only the first {@code maxArrayLength} are
 *     considered and anything that is not a {@link String} is skipped
 * @return the segments joined with the browse path v2 delimiter, prefixed with that delimiter
 *     unless the joined value is empty or already starts with it
 */
private String getBrowsePathV2Value(@Nonnull final List<Object> fieldValues) {
  final int limit = Math.min(fieldValues.size(), maxArrayLength);
  final List<String> pathSegments = new ArrayList<>();
  for (final Object candidate : fieldValues.subList(0, limit)) {
    if (candidate instanceof String) {
      pathSegments.add((String) candidate);
    }
  }

  final String joinedPath = String.join(BROWSE_PATH_V2_DELIMITER, pathSegments);
  // Nothing to prefix when there is no path, or when the delimiter already leads the value.
  if (joinedPath.equals("") || joinedPath.startsWith(BROWSE_PATH_V2_DELIMITER)) {
    return joinedPath;
  }
  return BROWSE_PATH_V2_DELIMITER + joinedPath;
}
} }

View File

@ -1,6 +1,9 @@
package com.linkedin.metadata; package com.linkedin.metadata;
import com.datahub.test.TestBrowsePaths; import com.datahub.test.TestBrowsePaths;
import com.datahub.test.TestBrowsePathsV2;
import com.datahub.test.BrowsePathEntry;
import com.datahub.test.BrowsePathEntryArray;
import com.datahub.test.KeyPartEnum; import com.datahub.test.KeyPartEnum;
import com.datahub.test.SearchFeatures; import com.datahub.test.SearchFeatures;
import com.datahub.test.SimpleNestedRecord1; import com.datahub.test.SimpleNestedRecord1;
@ -53,12 +56,18 @@ public class TestEntityUtil {
snapshot.setUrn(urn); snapshot.setUrn(urn);
TestBrowsePaths browsePaths = new TestBrowsePaths().setPaths(new StringArray(ImmutableList.of("/a/b/c", "d/e/f"))); TestBrowsePaths browsePaths = new TestBrowsePaths().setPaths(new StringArray(ImmutableList.of("/a/b/c", "d/e/f")));
BrowsePathEntryArray browsePathV2Entries = new BrowsePathEntryArray();
BrowsePathEntry entry1 = new BrowsePathEntry().setId("levelOne");
BrowsePathEntry entry2 = new BrowsePathEntry().setId("levelTwo");
browsePathV2Entries.add(entry1);
browsePathV2Entries.add(entry2);
TestBrowsePathsV2 browsePathsV2 = new TestBrowsePathsV2().setPath(browsePathV2Entries);
SearchFeatures searchFeatures = new SearchFeatures().setFeature1(2).setFeature2(1); SearchFeatures searchFeatures = new SearchFeatures().setFeature1(2).setFeature2(1);
TestEntityAspectArray aspects = new TestEntityAspectArray( TestEntityAspectArray aspects = new TestEntityAspectArray(
ImmutableList.of(TestEntityAspect.create(getTestEntityKey(urn)), ImmutableList.of(TestEntityAspect.create(getTestEntityKey(urn)),
TestEntityAspect.create(getTestEntityInfo(urn)), TestEntityAspect.create(browsePaths), TestEntityAspect.create(getTestEntityInfo(urn)), TestEntityAspect.create(browsePaths),
TestEntityAspect.create(searchFeatures))); TestEntityAspect.create(searchFeatures), TestEntityAspect.create(browsePathsV2)));
snapshot.setAspects(aspects); snapshot.setAspects(aspects);
return snapshot; return snapshot;
} }

View File

@ -16,7 +16,7 @@ public class MappingsBuilderTest {
Map<String, Object> result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec()); Map<String, Object> result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec());
assertEquals(result.size(), 1); assertEquals(result.size(), 1);
Map<String, Object> properties = (Map<String, Object>) result.get("properties"); Map<String, Object> properties = (Map<String, Object>) result.get("properties");
assertEquals(properties.size(), 16); assertEquals(properties.size(), 17);
assertEquals(properties.get("urn"), ImmutableMap.of("type", "keyword", assertEquals(properties.get("urn"), ImmutableMap.of("type", "keyword",
"fields", "fields",
ImmutableMap.of("delimited", ImmutableMap.of("delimited",
@ -27,6 +27,7 @@ public class MappingsBuilderTest {
"analyzer", "partial_urn_component")))); "analyzer", "partial_urn_component"))));
assertEquals(properties.get("runId"), ImmutableMap.of("type", "keyword")); assertEquals(properties.get("runId"), ImmutableMap.of("type", "keyword"));
assertTrue(properties.containsKey("browsePaths")); assertTrue(properties.containsKey("browsePaths"));
assertTrue(properties.containsKey("browsePathV2"));
// KEYWORD // KEYWORD
Map<String, Object> keyPart3Field = (Map<String, Object>) properties.get("keyPart3"); Map<String, Object> keyPart3Field = (Map<String, Object>) properties.get("keyPart3");
assertEquals(keyPart3Field.get("type"), "keyword"); assertEquals(keyPart3Field.get("type"), "keyword");

View File

@ -1,6 +1,7 @@
package com.linkedin.metadata.search.transformer; package com.linkedin.metadata.search.transformer;
import com.datahub.test.TestEntitySnapshot; import com.datahub.test.TestEntitySnapshot;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.JsonNodeFactory;
@ -52,6 +53,8 @@ public class SearchDocumentTransformerTest {
assertEquals(browsePaths.get(1).asText(), "d/e/f"); assertEquals(browsePaths.get(1).asText(), "d/e/f");
assertEquals(parsedJson.get("feature1").asInt(), 2); assertEquals(parsedJson.get("feature1").asInt(), 2);
assertEquals(parsedJson.get("feature2").asInt(), 1); assertEquals(parsedJson.get("feature2").asInt(), 1);
JsonNode browsePathV2 = (JsonNode) parsedJson.get("browsePathV2");
assertEquals(browsePathV2.asText(), "␟levelOne␟levelTwo");
} }
@Test @Test

View File

@ -15,5 +15,11 @@ record BrowsePathsV2 {
* This is stored in elasticsearch as unit-separator delimited strings and only includes platform specific folders or containers. * This is stored in elasticsearch as unit-separator delimited strings and only includes platform specific folders or containers.
* These paths should not include high level info captured elsewhere ie. Platform and Environment. * These paths should not include high level info captured elsewhere ie. Platform and Environment.
*/ */
@Searchable = {
"/*/id": {
"fieldName": "browsePathV2",
"fieldType": "BROWSE_PATH_V2"
}
}
path: array[BrowsePathEntry] path: array[BrowsePathEntry]
} }

View File

@ -0,0 +1,19 @@
namespace com.datahub.test
import com.linkedin.common.Urn
/**
 * Represents a single level in an entity's browse path.
 */
record BrowsePathEntry {
/**
 * The ID of the browse path entry. This is what gets stored in the index after URL encoding.
 * If there is a urn associated with this entry, id and urn will be the same.
 * NOTE(review): where the URL encoding is applied is not visible from this schema; confirm.
 */
id: string
/**
 * Optional urn pointing to some entity in DataHub.
 */
urn: optional Urn
}

View File

@ -0,0 +1,23 @@
namespace com.datahub.test
/**
* Shared aspect containing Browse Paths V2 to be indexed for an entity.
*/
@Aspect = {
"name": "testBrowsePathsV2"
}
record TestBrowsePathsV2 {
/**
* A valid browse path for the entity. This field is provided by DataHub by default.
*
* Browse paths V2 are stored in elasticsearch as unit-separator delimited strings and only include platform specific folders or containers.
* These paths should not include high level info captured elsewhere ie. Platform and Environment.
*/
@Searchable = {
"/*/id": {
"fieldName": "browsePathV2",
"fieldType": "BROWSE_PATH_V2",
}
}
path: array[BrowsePathEntry]
}

View File

@ -7,5 +7,6 @@ typeref TestEntityAspect = union[
TestEntityKey, TestEntityKey,
TestEntityInfo, TestEntityInfo,
TestBrowsePaths, TestBrowsePaths,
SearchFeatures SearchFeatures,
TestBrowsePathsV2
] ]