mirror of
https://github.com/datahub-project/datahub.git
synced 2025-10-01 20:14:13 +00:00
feat(es) Store and map containerPath to elastic search properly (#7898)
This commit is contained in:
parent
27c7c40002
commit
37db8c635e
@ -55,7 +55,7 @@ public class SearchableAnnotation {
|
||||
COUNT,
|
||||
DATETIME,
|
||||
OBJECT,
|
||||
CONTAINER_PATH
|
||||
BROWSE_PATH_V2
|
||||
}
|
||||
|
||||
@Nonnull
|
||||
|
@ -89,7 +89,7 @@ public class EntitySpecBuilderTest {
|
||||
|
||||
// Assert on Aspect Specs
|
||||
final Map<String, AspectSpec> aspectSpecMap = testEntitySpec.getAspectSpecMap();
|
||||
assertEquals(4, aspectSpecMap.size());
|
||||
assertEquals(5, aspectSpecMap.size());
|
||||
assertTrue(aspectSpecMap.containsKey("testEntityKey"));
|
||||
assertTrue(aspectSpecMap.containsKey("testBrowsePaths"));
|
||||
assertTrue(aspectSpecMap.containsKey("testEntityInfo"));
|
||||
|
@ -121,6 +121,14 @@ public class MappingsBuilder {
|
||||
ANALYZER, SLASH_PATTERN_ANALYZER)));
|
||||
mappingForField.put(ANALYZER, BROWSE_PATH_HIERARCHY_ANALYZER);
|
||||
mappingForField.put(FIELDDATA, true);
|
||||
} else if (fieldType == FieldType.BROWSE_PATH_V2) {
|
||||
mappingForField.put(TYPE, TEXT);
|
||||
mappingForField.put(FIELDS,
|
||||
ImmutableMap.of(LENGTH, ImmutableMap.of(
|
||||
TYPE, TOKEN_COUNT,
|
||||
ANALYZER, UNIT_SEPARATOR_PATTERN_ANALYZER)));
|
||||
mappingForField.put(ANALYZER, BROWSE_PATH_V2_HIERARCHY_ANALYZER);
|
||||
mappingForField.put(FIELDDATA, true);
|
||||
} else if (fieldType == FieldType.URN || fieldType == FieldType.URN_PARTIAL) {
|
||||
mappingForField.put(TYPE, TEXT);
|
||||
mappingForField.put(ANALYZER, URN_ANALYZER);
|
||||
|
@ -54,9 +54,11 @@ public class SettingsBuilder {
|
||||
|
||||
// Analyzers
|
||||
public static final String BROWSE_PATH_HIERARCHY_ANALYZER = "browse_path_hierarchy";
|
||||
public static final String BROWSE_PATH_V2_HIERARCHY_ANALYZER = "browse_path_v2_hierarchy";
|
||||
public static final String KEYWORD_LOWERCASE_ANALYZER = "custom_keyword";
|
||||
public static final String PARTIAL_ANALYZER = "partial";
|
||||
public static final String SLASH_PATTERN_ANALYZER = "slash_pattern";
|
||||
public static final String UNIT_SEPARATOR_PATTERN_ANALYZER = "unit_separator_pattern";
|
||||
public static final String TEXT_ANALYZER = "word_delimited";
|
||||
public static final String TEXT_SEARCH_ANALYZER = "query_word_delimited";
|
||||
public static final String KEYWORD_ANALYZER = "keyword";
|
||||
@ -102,6 +104,7 @@ public class SettingsBuilder {
|
||||
public static final String MAIN_TOKENIZER = "main_tokenizer";
|
||||
public static final String PATH_HIERARCHY_TOKENIZER = "path_hierarchy";
|
||||
public static final String SLASH_TOKENIZER = "slash_tokenizer";
|
||||
public static final String UNIT_SEPARATOR_TOKENIZER = "unit_separator_tokenizer";
|
||||
// Do not remove the space, needed for multi-term synonyms
|
||||
public static final List<String> ALPHANUM_SPACE_PATTERNS = ImmutableList.of(
|
||||
"([a-z0-9 _-]{2,})",
|
||||
@ -283,6 +286,13 @@ public class SettingsBuilder {
|
||||
.put(PATTERN, "[/]")
|
||||
.build());
|
||||
|
||||
|
||||
tokenizers.put(UNIT_SEPARATOR_TOKENIZER,
|
||||
ImmutableMap.<String, Object>builder()
|
||||
.put(TYPE, PATTERN)
|
||||
.put(PATTERN, "[␟]")
|
||||
.build());
|
||||
|
||||
// Tokenize by whitespace and most special chars
|
||||
tokenizers.put(MAIN_TOKENIZER,
|
||||
ImmutableMap.<String, Object>builder()
|
||||
@ -313,11 +323,22 @@ public class SettingsBuilder {
|
||||
.put(FILTER, ImmutableList.of(LOWERCASE))
|
||||
.build());
|
||||
|
||||
// Analyzer for splitting by unit-separator (used to get depth of browsePathV2)
|
||||
analyzers.put(UNIT_SEPARATOR_PATTERN_ANALYZER, ImmutableMap.<String, Object>builder()
|
||||
.put(TOKENIZER, UNIT_SEPARATOR_TOKENIZER)
|
||||
.put(FILTER, ImmutableList.of(LOWERCASE))
|
||||
.build());
|
||||
|
||||
// Analyzer for matching browse path
|
||||
analyzers.put(BROWSE_PATH_HIERARCHY_ANALYZER, ImmutableMap.<String, Object>builder()
|
||||
.put(TOKENIZER, PATH_HIERARCHY_TOKENIZER)
|
||||
.build());
|
||||
|
||||
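// Analyzer for matching browse path v2.
// NOTE(review): this reuses PATH_HIERARCHY_TOKENIZER ("path_hierarchy"). Elasticsearch's built-in path_hierarchy
// tokenizer splits on "/" by default, while browsePathV2 values are joined with the unit separator "␟" - confirm
// whether a path_hierarchy tokenizer configured with that delimiter is needed here.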
|
||||
analyzers.put(BROWSE_PATH_V2_HIERARCHY_ANALYZER, ImmutableMap.<String, Object>builder()
|
||||
.put(TOKENIZER, PATH_HIERARCHY_TOKENIZER)
|
||||
.build());
|
||||
|
||||
// Analyzer for case-insensitive exact matching - Only used when building queries
|
||||
analyzers.put(KEYWORD_LOWERCASE_ANALYZER, ImmutableMap.<String, Object>builder()
|
||||
.put(TOKENIZER, KEYWORD_TOKENIZER)
|
||||
|
@ -11,6 +11,7 @@ import javax.annotation.Nonnull;
|
||||
import java.util.Set;
|
||||
|
||||
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_HIERARCHY_ANALYZER;
|
||||
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_V2_HIERARCHY_ANALYZER;
|
||||
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_ANALYZER;
|
||||
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER;
|
||||
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER;
|
||||
@ -43,6 +44,10 @@ public class SearchFieldConfig {
|
||||
Set.of(
|
||||
SearchableAnnotation.FieldType.BROWSE_PATH
|
||||
);
|
||||
private static final Set<SearchableAnnotation.FieldType> TYPES_WITH_BROWSE_PATH_V2 =
|
||||
Set.of(
|
||||
SearchableAnnotation.FieldType.BROWSE_PATH_V2
|
||||
);
|
||||
private static final Set<SearchableAnnotation.FieldType> TYPES_WITH_BASE_KEYWORD =
|
||||
Set.of(
|
||||
SearchableAnnotation.FieldType.TEXT,
|
||||
@ -125,6 +130,8 @@ public class SearchFieldConfig {
|
||||
// order is important
|
||||
if (TYPES_WITH_BROWSE_PATH.contains(fieldType)) {
|
||||
return BROWSE_PATH_HIERARCHY_ANALYZER;
|
||||
} else if (TYPES_WITH_BROWSE_PATH_V2.contains(fieldType)) {
|
||||
return BROWSE_PATH_V2_HIERARCHY_ANALYZER;
|
||||
// sub fields
|
||||
} else if (isKeyword(fieldName)) {
|
||||
return KEYWORD_ANALYZER;
|
||||
|
@ -13,6 +13,8 @@ import com.linkedin.metadata.models.SearchScoreFieldSpec;
|
||||
import com.linkedin.metadata.models.SearchableFieldSpec;
|
||||
import com.linkedin.metadata.models.annotation.SearchableAnnotation.FieldType;
|
||||
import com.linkedin.metadata.models.extractor.FieldExtractor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
@ -21,6 +23,8 @@ import java.util.stream.Collectors;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
|
||||
|
||||
/**
|
||||
* Class that provides a utility function that transforms the snapshot object into a search document
|
||||
@ -38,6 +42,8 @@ public class SearchDocumentTransformer {
|
||||
// Maximum customProperties value length
|
||||
private final int maxValueLength;
|
||||
|
||||
private static final String BROWSE_PATH_V2_DELIMITER = "␟";
|
||||
|
||||
public Optional<String> transformSnapshot(final RecordTemplate snapshot, final EntitySpec entitySpec,
|
||||
final Boolean forDelete) {
|
||||
final Map<SearchableFieldSpec, List<Object>> extractedSearchableFields =
|
||||
@ -123,10 +129,15 @@ public class SearchDocumentTransformer {
|
||||
}
|
||||
|
||||
if (isArray || (valueType == DataSchema.Type.MAP && fieldType != FieldType.OBJECT)) {
|
||||
if (fieldType == FieldType.BROWSE_PATH_V2) {
|
||||
String browsePathV2Value = getBrowsePathV2Value(fieldValues);
|
||||
searchDocument.set(fieldName, JsonNodeFactory.instance.textNode(browsePathV2Value));
|
||||
} else {
|
||||
ArrayNode arrayNode = JsonNodeFactory.instance.arrayNode();
|
||||
fieldValues.subList(0, Math.min(fieldValues.size(), maxArrayLength))
|
||||
.forEach(value -> getNodeForValue(valueType, value, fieldType).ifPresent(arrayNode::add));
|
||||
searchDocument.set(fieldName, arrayNode);
|
||||
}
|
||||
} else if (valueType == DataSchema.Type.MAP) {
|
||||
ObjectNode dictDoc = JsonNodeFactory.instance.objectNode();
|
||||
fieldValues.subList(0, Math.min(fieldValues.size(), maxObjectKeys)).forEach(fieldValue -> {
|
||||
@ -197,4 +208,24 @@ public class SearchDocumentTransformer {
|
||||
: Optional.of(JsonNodeFactory.instance.textNode(fieldValue.toString()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The browsePathsV2 aspect is a list of objects and the @Searchable annotation specifies a
|
||||
* list of strings that we receive. However, we want to aggregate those strings and store
|
||||
* as a single string in ElasticSearch so we can do prefix matching against it.
|
||||
*/
|
||||
private String getBrowsePathV2Value(@Nonnull final List<Object> fieldValues) {
|
||||
List<String> stringValues = new ArrayList<>();
|
||||
fieldValues.subList(0, Math.min(fieldValues.size(), maxArrayLength)).forEach(value -> {
|
||||
if (value instanceof String) {
|
||||
stringValues.add((String) value);
|
||||
}
|
||||
});
|
||||
String aggregatedValue = String.join(BROWSE_PATH_V2_DELIMITER, stringValues);
|
||||
// ensure browse path v2 starts with our delimiter if it's not empty
|
||||
if (!aggregatedValue.equals("") && !aggregatedValue.startsWith(BROWSE_PATH_V2_DELIMITER)) {
|
||||
aggregatedValue = BROWSE_PATH_V2_DELIMITER + aggregatedValue;
|
||||
}
|
||||
return aggregatedValue;
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,9 @@
|
||||
package com.linkedin.metadata;
|
||||
|
||||
import com.datahub.test.TestBrowsePaths;
|
||||
import com.datahub.test.TestBrowsePathsV2;
|
||||
import com.datahub.test.BrowsePathEntry;
|
||||
import com.datahub.test.BrowsePathEntryArray;
|
||||
import com.datahub.test.KeyPartEnum;
|
||||
import com.datahub.test.SearchFeatures;
|
||||
import com.datahub.test.SimpleNestedRecord1;
|
||||
@ -53,12 +56,18 @@ public class TestEntityUtil {
|
||||
snapshot.setUrn(urn);
|
||||
|
||||
TestBrowsePaths browsePaths = new TestBrowsePaths().setPaths(new StringArray(ImmutableList.of("/a/b/c", "d/e/f")));
|
||||
BrowsePathEntryArray browsePathV2Entries = new BrowsePathEntryArray();
|
||||
BrowsePathEntry entry1 = new BrowsePathEntry().setId("levelOne");
|
||||
BrowsePathEntry entry2 = new BrowsePathEntry().setId("levelTwo");
|
||||
browsePathV2Entries.add(entry1);
|
||||
browsePathV2Entries.add(entry2);
|
||||
TestBrowsePathsV2 browsePathsV2 = new TestBrowsePathsV2().setPath(browsePathV2Entries);
|
||||
SearchFeatures searchFeatures = new SearchFeatures().setFeature1(2).setFeature2(1);
|
||||
|
||||
TestEntityAspectArray aspects = new TestEntityAspectArray(
|
||||
ImmutableList.of(TestEntityAspect.create(getTestEntityKey(urn)),
|
||||
TestEntityAspect.create(getTestEntityInfo(urn)), TestEntityAspect.create(browsePaths),
|
||||
TestEntityAspect.create(searchFeatures)));
|
||||
TestEntityAspect.create(searchFeatures), TestEntityAspect.create(browsePathsV2)));
|
||||
snapshot.setAspects(aspects);
|
||||
return snapshot;
|
||||
}
|
||||
|
@ -16,7 +16,7 @@ public class MappingsBuilderTest {
|
||||
Map<String, Object> result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec());
|
||||
assertEquals(result.size(), 1);
|
||||
Map<String, Object> properties = (Map<String, Object>) result.get("properties");
|
||||
assertEquals(properties.size(), 16);
|
||||
assertEquals(properties.size(), 17);
|
||||
assertEquals(properties.get("urn"), ImmutableMap.of("type", "keyword",
|
||||
"fields",
|
||||
ImmutableMap.of("delimited",
|
||||
@ -27,6 +27,7 @@ public class MappingsBuilderTest {
|
||||
"analyzer", "partial_urn_component"))));
|
||||
assertEquals(properties.get("runId"), ImmutableMap.of("type", "keyword"));
|
||||
assertTrue(properties.containsKey("browsePaths"));
|
||||
assertTrue(properties.containsKey("browsePathV2"));
|
||||
// KEYWORD
|
||||
Map<String, Object> keyPart3Field = (Map<String, Object>) properties.get("keyPart3");
|
||||
assertEquals(keyPart3Field.get("type"), "keyword");
|
||||
|
@ -1,6 +1,7 @@
|
||||
package com.linkedin.metadata.search.transformer;
|
||||
|
||||
import com.datahub.test.TestEntitySnapshot;
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.fasterxml.jackson.databind.node.ArrayNode;
|
||||
import com.fasterxml.jackson.databind.node.JsonNodeFactory;
|
||||
@ -52,6 +53,8 @@ public class SearchDocumentTransformerTest {
|
||||
assertEquals(browsePaths.get(1).asText(), "d/e/f");
|
||||
assertEquals(parsedJson.get("feature1").asInt(), 2);
|
||||
assertEquals(parsedJson.get("feature2").asInt(), 1);
|
||||
JsonNode browsePathV2 = (JsonNode) parsedJson.get("browsePathV2");
|
||||
assertEquals(browsePathV2.asText(), "␟levelOne␟levelTwo");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -15,5 +15,11 @@ record BrowsePathsV2 {
|
||||
* This is stored in elasticsearch as unit-separator delimited strings and only includes platform specific folders or containers.
|
||||
* These paths should not include high-level info captured elsewhere, i.e. Platform and Environment.
|
||||
*/
|
||||
@Searchable = {
|
||||
"/*/id": {
|
||||
"fieldName": "browsePathV2",
|
||||
"fieldType": "BROWSE_PATH_V2"
|
||||
}
|
||||
}
|
||||
path: array[BrowsePathEntry]
|
||||
}
|
||||
|
@ -0,0 +1,19 @@
|
||||
namespace com.datahub.test
|
||||
|
||||
import com.linkedin.common.Urn
|
||||
|
||||
/**
|
||||
* Represents a single level in an entity's browse path
|
||||
*/
|
||||
record BrowsePathEntry {
|
||||
/**
|
||||
* The ID of the browse path entry. This is what gets stored in the index after URL encoding.
|
||||
* If there is a urn associated with this entry, id and urn will be the same.
|
||||
*/
|
||||
id: string
|
||||
|
||||
/**
|
||||
* Optional urn pointing to some entity in DataHub
|
||||
*/
|
||||
urn: optional Urn
|
||||
}
|
@ -0,0 +1,23 @@
|
||||
namespace com.datahub.test
|
||||
|
||||
/**
 * Shared aspect containing Browse Paths V2 to be indexed for an entity.
 */
@Aspect = {
  "name": "testBrowsePathsV2"
}
record TestBrowsePathsV2 {
  /**
   * A valid browse path for the entity. This field is provided by DataHub by default.
   *
   * Browse paths V2 are stored in elasticsearch as unit-separator delimited strings and only include
   * platform-specific folders or containers.
   * These paths should not include high-level info captured elsewhere, i.e. Platform and Environment.
   */
  @Searchable = {
    "/*/id": {
      "fieldName": "browsePathV2",
      "fieldType": "BROWSE_PATH_V2"
    }
  }
  path: array[BrowsePathEntry]
}
|
@ -7,5 +7,6 @@ typeref TestEntityAspect = union[
|
||||
TestEntityKey,
|
||||
TestEntityInfo,
|
||||
TestBrowsePaths,
|
||||
SearchFeatures
|
||||
SearchFeatures,
|
||||
TestBrowsePathsV2
|
||||
]
|
Loading…
x
Reference in New Issue
Block a user