feat(es) Store and map containerPath to elastic search properly (#7898)

This commit is contained in:
Chris Collins 2023-05-05 10:49:23 -04:00 committed by GitHub
parent 27c7c40002
commit 37db8c635e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 139 additions and 10 deletions

View File

@ -55,7 +55,7 @@ public class SearchableAnnotation {
COUNT,
DATETIME,
OBJECT,
CONTAINER_PATH
BROWSE_PATH_V2
}
@Nonnull

View File

@ -89,7 +89,7 @@ public class EntitySpecBuilderTest {
// Assert on Aspect Specs
final Map<String, AspectSpec> aspectSpecMap = testEntitySpec.getAspectSpecMap();
assertEquals(4, aspectSpecMap.size());
assertEquals(5, aspectSpecMap.size());
assertTrue(aspectSpecMap.containsKey("testEntityKey"));
assertTrue(aspectSpecMap.containsKey("testBrowsePaths"));
assertTrue(aspectSpecMap.containsKey("testEntityInfo"));

View File

@ -121,7 +121,15 @@ public class MappingsBuilder {
ANALYZER, SLASH_PATTERN_ANALYZER)));
mappingForField.put(ANALYZER, BROWSE_PATH_HIERARCHY_ANALYZER);
mappingForField.put(FIELDDATA, true);
} else if (fieldType == FieldType.URN || fieldType == FieldType.URN_PARTIAL) {
} else if (fieldType == FieldType.BROWSE_PATH_V2) {
mappingForField.put(TYPE, TEXT);
mappingForField.put(FIELDS,
ImmutableMap.of(LENGTH, ImmutableMap.of(
TYPE, TOKEN_COUNT,
ANALYZER, UNIT_SEPARATOR_PATTERN_ANALYZER)));
mappingForField.put(ANALYZER, BROWSE_PATH_V2_HIERARCHY_ANALYZER);
mappingForField.put(FIELDDATA, true);
} else if (fieldType == FieldType.URN || fieldType == FieldType.URN_PARTIAL) {
mappingForField.put(TYPE, TEXT);
mappingForField.put(ANALYZER, URN_ANALYZER);
mappingForField.put(SEARCH_ANALYZER, URN_SEARCH_ANALYZER);

View File

@ -54,9 +54,11 @@ public class SettingsBuilder {
// Analyzers
public static final String BROWSE_PATH_HIERARCHY_ANALYZER = "browse_path_hierarchy";
public static final String BROWSE_PATH_V2_HIERARCHY_ANALYZER = "browse_path_v2_hierarchy";
public static final String KEYWORD_LOWERCASE_ANALYZER = "custom_keyword";
public static final String PARTIAL_ANALYZER = "partial";
public static final String SLASH_PATTERN_ANALYZER = "slash_pattern";
public static final String UNIT_SEPARATOR_PATTERN_ANALYZER = "unit_separator_pattern";
public static final String TEXT_ANALYZER = "word_delimited";
public static final String TEXT_SEARCH_ANALYZER = "query_word_delimited";
public static final String KEYWORD_ANALYZER = "keyword";
@ -102,6 +104,7 @@ public class SettingsBuilder {
public static final String MAIN_TOKENIZER = "main_tokenizer";
public static final String PATH_HIERARCHY_TOKENIZER = "path_hierarchy";
public static final String SLASH_TOKENIZER = "slash_tokenizer";
public static final String UNIT_SEPARATOR_TOKENIZER = "unit_separator_tokenizer";
// Do not remove the space, needed for multi-term synonyms
public static final List<String> ALPHANUM_SPACE_PATTERNS = ImmutableList.of(
"([a-z0-9 _-]{2,})",
@ -283,6 +286,13 @@ public class SettingsBuilder {
.put(PATTERN, "[/]")
.build());
tokenizers.put(UNIT_SEPARATOR_TOKENIZER,
ImmutableMap.<String, Object>builder()
.put(TYPE, PATTERN)
.put(PATTERN, "[␟]")
.build());
// Tokenize by whitespace and most special chars
tokenizers.put(MAIN_TOKENIZER,
ImmutableMap.<String, Object>builder()
@ -313,11 +323,22 @@ public class SettingsBuilder {
.put(FILTER, ImmutableList.of(LOWERCASE))
.build());
// Analyzer for splitting by unit-separator (used to get depth of browsePathV2)
analyzers.put(UNIT_SEPARATOR_PATTERN_ANALYZER, ImmutableMap.<String, Object>builder()
.put(TOKENIZER, UNIT_SEPARATOR_TOKENIZER)
.put(FILTER, ImmutableList.of(LOWERCASE))
.build());
// Analyzer for matching browse path
analyzers.put(BROWSE_PATH_HIERARCHY_ANALYZER, ImmutableMap.<String, Object>builder()
.put(TOKENIZER, PATH_HIERARCHY_TOKENIZER)
.build());
// Analyzer for matching browse path v2
analyzers.put(BROWSE_PATH_V2_HIERARCHY_ANALYZER, ImmutableMap.<String, Object>builder()
.put(TOKENIZER, PATH_HIERARCHY_TOKENIZER)
.build());
// Analyzer for case-insensitive exact matching - Only used when building queries
analyzers.put(KEYWORD_LOWERCASE_ANALYZER, ImmutableMap.<String, Object>builder()
.put(TOKENIZER, KEYWORD_TOKENIZER)

View File

@ -11,6 +11,7 @@ import javax.annotation.Nonnull;
import java.util.Set;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_HIERARCHY_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_V2_HIERARCHY_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER;
@ -43,6 +44,10 @@ public class SearchFieldConfig {
Set.of(
SearchableAnnotation.FieldType.BROWSE_PATH
);
private static final Set<SearchableAnnotation.FieldType> TYPES_WITH_BROWSE_PATH_V2 =
Set.of(
SearchableAnnotation.FieldType.BROWSE_PATH_V2
);
private static final Set<SearchableAnnotation.FieldType> TYPES_WITH_BASE_KEYWORD =
Set.of(
SearchableAnnotation.FieldType.TEXT,
@ -125,6 +130,8 @@ public class SearchFieldConfig {
// order is important
if (TYPES_WITH_BROWSE_PATH.contains(fieldType)) {
return BROWSE_PATH_HIERARCHY_ANALYZER;
} else if (TYPES_WITH_BROWSE_PATH_V2.contains(fieldType)) {
return BROWSE_PATH_V2_HIERARCHY_ANALYZER;
// sub fields
} else if (isKeyword(fieldName)) {
return KEYWORD_ANALYZER;

View File

@ -13,6 +13,8 @@ import com.linkedin.metadata.models.SearchScoreFieldSpec;
import com.linkedin.metadata.models.SearchableFieldSpec;
import com.linkedin.metadata.models.annotation.SearchableAnnotation.FieldType;
import com.linkedin.metadata.models.extractor.FieldExtractor;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@ -21,6 +23,8 @@ import java.util.stream.Collectors;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import javax.annotation.Nonnull;
/**
* Class that provides a utility function that transforms the snapshot object into a search document
@ -38,6 +42,8 @@ public class SearchDocumentTransformer {
// Maximum customProperties value length
private final int maxValueLength;
// Unit-separator symbol (U+241F) that precedes every browsePathV2 segment in the indexed string.
// It must stay in sync with the character class of the unit-separator tokenizer pattern ("[␟]");
// an empty delimiter would glue the segments together and make the leading-delimiter check a no-op.
private static final String BROWSE_PATH_V2_DELIMITER = "␟";
public Optional<String> transformSnapshot(final RecordTemplate snapshot, final EntitySpec entitySpec,
final Boolean forDelete) {
final Map<SearchableFieldSpec, List<Object>> extractedSearchableFields =
@ -123,10 +129,15 @@ public class SearchDocumentTransformer {
}
if (isArray || (valueType == DataSchema.Type.MAP && fieldType != FieldType.OBJECT)) {
ArrayNode arrayNode = JsonNodeFactory.instance.arrayNode();
fieldValues.subList(0, Math.min(fieldValues.size(), maxArrayLength))
.forEach(value -> getNodeForValue(valueType, value, fieldType).ifPresent(arrayNode::add));
searchDocument.set(fieldName, arrayNode);
if (fieldType == FieldType.BROWSE_PATH_V2) {
String browsePathV2Value = getBrowsePathV2Value(fieldValues);
searchDocument.set(fieldName, JsonNodeFactory.instance.textNode(browsePathV2Value));
} else {
ArrayNode arrayNode = JsonNodeFactory.instance.arrayNode();
fieldValues.subList(0, Math.min(fieldValues.size(), maxArrayLength))
.forEach(value -> getNodeForValue(valueType, value, fieldType).ifPresent(arrayNode::add));
searchDocument.set(fieldName, arrayNode);
}
} else if (valueType == DataSchema.Type.MAP) {
ObjectNode dictDoc = JsonNodeFactory.instance.objectNode();
fieldValues.subList(0, Math.min(fieldValues.size(), maxObjectKeys)).forEach(fieldValue -> {
@ -197,4 +208,24 @@ public class SearchDocumentTransformer {
: Optional.of(JsonNodeFactory.instance.textNode(fieldValue.toString()));
}
}
/**
 * Collapses the values extracted for a BROWSE_PATH_V2 field into one delimited string.
 *
 * <p>The @Searchable annotation on the browsePathsV2 aspect hands us a list of values, one per
 * path entry. Instead of indexing them as an array, they are concatenated into a single string
 * so that prefix matching can be done against the whole path.
 *
 * <p>At most {@code maxArrayLength} leading values are considered, and values that are not
 * {@link String} instances are skipped.
 *
 * @param fieldValues values extracted for the field, in path order
 * @return the string values joined with {@code BROWSE_PATH_V2_DELIMITER}, with the delimiter
 *     prepended when the result is non-empty and does not already start with it; the empty
 *     string when there is nothing to join
 */
private String getBrowsePathV2Value(@Nonnull final List<Object> fieldValues) {
  final int limit = Math.min(fieldValues.size(), maxArrayLength);
  final String joined = fieldValues.subList(0, limit).stream()
      .filter(String.class::isInstance)
      .map(String.class::cast)
      .collect(Collectors.joining(BROWSE_PATH_V2_DELIMITER));

  // An empty path stays empty; a path that already leads with the delimiter is left untouched.
  if (joined.isEmpty() || joined.startsWith(BROWSE_PATH_V2_DELIMITER)) {
    return joined;
  }
  return BROWSE_PATH_V2_DELIMITER + joined;
}
}

View File

@ -1,6 +1,9 @@
package com.linkedin.metadata;
import com.datahub.test.TestBrowsePaths;
import com.datahub.test.TestBrowsePathsV2;
import com.datahub.test.BrowsePathEntry;
import com.datahub.test.BrowsePathEntryArray;
import com.datahub.test.KeyPartEnum;
import com.datahub.test.SearchFeatures;
import com.datahub.test.SimpleNestedRecord1;
@ -53,12 +56,18 @@ public class TestEntityUtil {
snapshot.setUrn(urn);
TestBrowsePaths browsePaths = new TestBrowsePaths().setPaths(new StringArray(ImmutableList.of("/a/b/c", "d/e/f")));
BrowsePathEntryArray browsePathV2Entries = new BrowsePathEntryArray();
BrowsePathEntry entry1 = new BrowsePathEntry().setId("levelOne");
BrowsePathEntry entry2 = new BrowsePathEntry().setId("levelTwo");
browsePathV2Entries.add(entry1);
browsePathV2Entries.add(entry2);
TestBrowsePathsV2 browsePathsV2 = new TestBrowsePathsV2().setPath(browsePathV2Entries);
SearchFeatures searchFeatures = new SearchFeatures().setFeature1(2).setFeature2(1);
TestEntityAspectArray aspects = new TestEntityAspectArray(
ImmutableList.of(TestEntityAspect.create(getTestEntityKey(urn)),
TestEntityAspect.create(getTestEntityInfo(urn)), TestEntityAspect.create(browsePaths),
TestEntityAspect.create(searchFeatures)));
TestEntityAspect.create(searchFeatures), TestEntityAspect.create(browsePathsV2)));
snapshot.setAspects(aspects);
return snapshot;
}

View File

@ -16,7 +16,7 @@ public class MappingsBuilderTest {
Map<String, Object> result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec());
assertEquals(result.size(), 1);
Map<String, Object> properties = (Map<String, Object>) result.get("properties");
assertEquals(properties.size(), 16);
assertEquals(properties.size(), 17);
assertEquals(properties.get("urn"), ImmutableMap.of("type", "keyword",
"fields",
ImmutableMap.of("delimited",
@ -27,6 +27,7 @@ public class MappingsBuilderTest {
"analyzer", "partial_urn_component"))));
assertEquals(properties.get("runId"), ImmutableMap.of("type", "keyword"));
assertTrue(properties.containsKey("browsePaths"));
assertTrue(properties.containsKey("browsePathV2"));
// KEYWORD
Map<String, Object> keyPart3Field = (Map<String, Object>) properties.get("keyPart3");
assertEquals(keyPart3Field.get("type"), "keyword");

View File

@ -1,6 +1,7 @@
package com.linkedin.metadata.search.transformer;
import com.datahub.test.TestEntitySnapshot;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.JsonNodeFactory;
@ -52,6 +53,8 @@ public class SearchDocumentTransformerTest {
assertEquals(browsePaths.get(1).asText(), "d/e/f");
assertEquals(parsedJson.get("feature1").asInt(), 2);
assertEquals(parsedJson.get("feature2").asInt(), 1);
JsonNode browsePathV2 = (JsonNode) parsedJson.get("browsePathV2");
assertEquals(browsePathV2.asText(), "␟levelOne␟levelTwo");
}
@Test

View File

@ -15,5 +15,11 @@ record BrowsePathsV2 {
* This is stored in elasticsearch as unit-separator delimited strings and only includes platform specific folders or containers.
* These paths should not include high-level info captured elsewhere, i.e. Platform and Environment.
*/
@Searchable = {
"/*/id": {
"fieldName": "browsePathV2",
"fieldType": "BROWSE_PATH_V2"
}
}
path: array[BrowsePathEntry]
}

View File

@ -0,0 +1,19 @@
namespace com.datahub.test
import com.linkedin.common.Urn
/**
* Represents a single level in an entity's browse path.
* A browse path is modelled as an ordered array of these entries.
*/
record BrowsePathEntry {
/**
* The ID of the browse path entry. This is what gets stored in the index after URL encoding.
* If there is a urn associated with this entry, id and urn will be the same.
*/
id: string
/**
* Optional urn pointing to some entity in DataHub.
* Absent when this path level does not correspond to an entity.
*/
urn: optional Urn
}

View File

@ -0,0 +1,23 @@
namespace com.datahub.test
/**
* Shared aspect containing Browse Paths V2 to be indexed for an entity.
* Declared in the test namespace and registered under the aspect name "testBrowsePathsV2".
*/
@Aspect = {
"name": "testBrowsePathsV2"
}
record TestBrowsePathsV2 {
/**
* A valid browse path for the entity. This field is provided by DataHub by default.
*
* Browse paths V2 are stored in Elasticsearch as unit-separator delimited strings and only include platform-specific folders or containers.
* These paths should not include high-level info captured elsewhere, i.e. Platform and Environment.
*
* The @Searchable annotation below selects the id of every entry in the array ("/*\/id" without the
* backslash) and maps it to the search field "browsePathV2" with field type BROWSE_PATH_V2.
*/
@Searchable = {
"/*/id": {
"fieldName": "browsePathV2",
"fieldType": "BROWSE_PATH_V2",
}
}
path: array[BrowsePathEntry]
}

View File

@ -7,5 +7,6 @@ typeref TestEntityAspect = union[
TestEntityKey,
TestEntityInfo,
TestBrowsePaths,
SearchFeatures
SearchFeatures,
TestBrowsePathsV2
]