feat(search): Add word gram analyzer for name fields (#8611)

Co-authored-by: Indy Prentice <indy@Indys-MacBook-Pro.local>
Indy Prentice 2023-08-21 15:33:10 -03:00 committed by GitHub
parent a17ed80cf7
commit 8cf299aeb4
53 changed files with 449 additions and 108 deletions

View File

@ -211,7 +211,7 @@ record ServiceKey {
* Name of the service
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true
}
name: string

View File

@ -323,7 +323,7 @@ It takes the following parameters:
annotations. To customize the set of analyzers used to index a certain field, you must add a new field type and define
the set of mappings to be applied in the MappingsBuilder.
-Thus far, we have implemented 10 fieldTypes:
+Thus far, we have implemented 11 fieldTypes:
1. *KEYWORD* - Short text fields that only support exact matches, often used only for filtering
@ -332,20 +332,25 @@ It takes the following parameters:
3. *TEXT_PARTIAL* - Text fields delimited by spaces/slashes/periods with partial matching support. Note, partial
matching is expensive, so this field type should not be applied to fields with long values (like description)
-4. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths.
+4. *WORD_GRAM* - Text fields delimited by spaces, slashes, periods, dashes, or underscores with partial matching AND
+word gram support. That is, the text is split on those delimiters, and queries can match word grams of two, three, or
+four consecutive tokens in addition to single tokens (see the sketch after this list). As with partial matching, this
+type is expensive, so it should not be applied to fields with long values such as description.
-5. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like
+5. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths.
+6. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like
"urn:li:dataplatform:kafka", it will index the platform name "kafka" and ignore the common components
-6. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support.
+7. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support.
-7. *BOOLEAN* - Boolean fields used for filtering.
+8. *BOOLEAN* - Boolean fields used for filtering.
-8. *COUNT* - Count fields used for filtering.
+9. *COUNT* - Count fields used for filtering.
-9. *DATETIME* - Datetime fields used to represent timestamps.
+10. *DATETIME* - Datetime fields used to represent timestamps.
-10. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as
+11. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as
`field.property` in queries. You should be careful to not use it on objects with many properties as it can cause a
mapping explosion in Elasticsearch.
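To make the word gram behaviour concrete, here is a minimal, self-contained Java sketch (the class name WordGramSketch and the splitting regex are illustrative, not part of DataHub) that approximates what the word_gram_2/3/4 analyzers emit: split on the delimiters listed above, lowercase, then produce fixed-size shingles. For example, word_gram_3 of "hello.cat_cool_customer" yields "hello cat cool" and "cat cool customer", matching the analyzer tests added in this commit.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class WordGramSketch {

    // Approximate WORD_GRAM tokenization: split on spaces, slashes, periods, dashes and underscores,
    // lowercase, then emit shingles of exactly `size` consecutive tokens (no unigrams).
    static List<String> wordGrams(String text, int size) {
        List<String> tokens = Arrays.stream(text.toLowerCase().split("[\\s/._-]+"))
                .filter(t -> !t.isEmpty())
                .collect(Collectors.toList());
        List<String> grams = new ArrayList<>();
        for (int i = 0; i + size <= tokens.size(); i++) {
            grams.add(String.join(" ", tokens.subList(i, i + size)));
        }
        return grams;
    }

    public static void main(String[] args) {
        System.out.println(wordGrams("hello.cat_cool_customer", 2)); // [hello cat, cat cool, cool customer]
        System.out.println(wordGrams("hello.cat_cool_customer", 3)); // [hello cat cool, cat cool customer]
        System.out.println(wordGrams("hello.cat_cool_customer", 4)); // [hello cat cool customer]
    }
}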

View File

@ -21,7 +21,7 @@ public class SearchableAnnotation {
public static final String ANNOTATION_NAME = "Searchable";
private static final Set<FieldType> DEFAULT_QUERY_FIELD_TYPES =
-ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.URN, FieldType.URN_PARTIAL);
+ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.WORD_GRAM, FieldType.URN, FieldType.URN_PARTIAL);
// Name of the field in the search index. Defaults to the field name in the schema
String fieldName;
@ -59,7 +59,8 @@ public class SearchableAnnotation {
COUNT,
DATETIME,
OBJECT,
-BROWSE_PATH_V2
+BROWSE_PATH_V2,
+WORD_GRAM
}
@Nonnull

View File

@ -142,7 +142,7 @@ public class EntitySpecBuilderTest {
assertEquals(new TestEntityInfo().schema().getFullName(), testEntityInfo.getPegasusSchema().getFullName());
// Assert on Searchable Fields
-assertEquals(9, testEntityInfo.getSearchableFieldSpecs().size());
+assertEquals(testEntityInfo.getSearchableFieldSpecs().size(), 10);
assertEquals("customProperties", testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("customProperties").toString()).getSearchableAnnotation().getFieldName());
assertEquals(SearchableAnnotation.FieldType.KEYWORD, testEntityInfo.getSearchableFieldSpecMap().get(
@ -158,6 +158,11 @@ public class EntitySpecBuilderTest {
assertEquals(SearchableAnnotation.FieldType.TEXT_PARTIAL, testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("textArrayField", "*").toString())
.getSearchableAnnotation().getFieldType());
assertEquals("wordGramField", testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("wordGramField").toString()).getSearchableAnnotation().getFieldName());
assertEquals(SearchableAnnotation.FieldType.WORD_GRAM, testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("wordGramField").toString())
.getSearchableAnnotation().getFieldType());
assertEquals("nestedIntegerField", testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("nestedRecordField", "nestedIntegerField").toString()).getSearchableAnnotation().getFieldName());
assertEquals(SearchableAnnotation.FieldType.COUNT, testEntityInfo.getSearchableFieldSpecMap().get(

View File

@ -42,6 +42,9 @@ public class MappingsBuilder {
// Subfields
public static final String DELIMITED = "delimited";
public static final String LENGTH = "length";
public static final String WORD_GRAMS_LENGTH_2 = "wordGrams2";
public static final String WORD_GRAMS_LENGTH_3 = "wordGrams3";
public static final String WORD_GRAMS_LENGTH_4 = "wordGrams4";
private MappingsBuilder() {
}
@ -94,16 +97,30 @@ public class MappingsBuilder {
mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER);
// Add keyword subfield without lowercase filter
mappingForField.put(FIELDS, ImmutableMap.of(KEYWORD, KEYWORD_TYPE_MAP));
-} else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL) {
+} else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) {
mappingForField.put(TYPE, KEYWORD);
mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER);
Map<String, Object> subFields = new HashMap<>();
-if (fieldType == FieldType.TEXT_PARTIAL) {
+if (fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) {
subFields.put(NGRAM, getPartialNgramConfigWithOverrides(
ImmutableMap.of(
ANALYZER, PARTIAL_ANALYZER
)
));
if (fieldType == FieldType.WORD_GRAM) {
for (Map.Entry<String, String> entry : Map.of(
WORD_GRAMS_LENGTH_2, WORD_GRAM_2_ANALYZER,
WORD_GRAMS_LENGTH_3, WORD_GRAM_3_ANALYZER,
WORD_GRAMS_LENGTH_4, WORD_GRAM_4_ANALYZER).entrySet()) {
String fieldName = entry.getKey();
String analyzerName = entry.getValue();
subFields.put(fieldName, ImmutableMap.of(
TYPE, TEXT,
ANALYZER, analyzerName,
SEARCH_ANALYZER, analyzerName
));
}
}
}
subFields.put(DELIMITED, ImmutableMap.of(
TYPE, TEXT,

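For reference, a condensed sketch of what the new MappingsBuilderTest (later in this commit) verifies for a WORD_GRAM field: it keeps the keyword/delimited/ngram subfields that TEXT_PARTIAL already gets and adds the three word-gram subfields. The method name is illustrative; imports and fixtures (TestEntitySpecBuilder, the wordGramField in TestEntityInfo) are as in that test.

@Test
public void wordGramSubfieldsSketch() {
    Map<String, Object> result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec());
    Map<String, Object> properties = (Map<String, Object>) result.get("properties");
    Map<String, Object> wordGramField = (Map<String, Object>) properties.get("wordGramField");
    Map<String, Object> subFields = (Map<String, Object>) wordGramField.get("fields");
    // WORD_GRAM = the TEXT_PARTIAL subfields plus wordGrams2/3/4
    assertEquals(subFields.keySet(),
        Set.of("delimited", "ngram", "keyword", "wordGrams2", "wordGrams3", "wordGrams4"));
}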
View File

@ -66,6 +66,9 @@ public class SettingsBuilder {
public static final String KEYWORD_ANALYZER = "keyword";
public static final String URN_ANALYZER = "urn_component";
public static final String URN_SEARCH_ANALYZER = "query_urn_component";
public static final String WORD_GRAM_2_ANALYZER = "word_gram_2";
public static final String WORD_GRAM_3_ANALYZER = "word_gram_3";
public static final String WORD_GRAM_4_ANALYZER = "word_gram_4";
// Filters
public static final String ALPHANUM_SPACE_ONLY = "alpha_num_space";
@ -80,6 +83,10 @@ public class SettingsBuilder {
public static final String MULTIFILTER = "multifilter";
public static final String MULTIFILTER_GRAPH = "multifilter_graph";
public static final String PARTIAL_URN_COMPONENT = "partial_urn_component";
public static final String SHINGLE = "shingle";
public static final String WORD_GRAM_2_FILTER = "word_gram_2_filter";
public static final String WORD_GRAM_3_FILTER = "word_gram_3_filter";
public static final String WORD_GRAM_4_FILTER = "word_gram_4_filter";
public static final String SNOWBALL = "snowball";
public static final String STEM_OVERRIDE = "stem_override";
public static final String STOP = "stop";
@ -108,6 +115,7 @@ public class SettingsBuilder {
public static final String SLASH_TOKENIZER = "slash_tokenizer";
public static final String UNIT_SEPARATOR_PATH_TOKENIZER = "unit_separator_path_tokenizer";
public static final String UNIT_SEPARATOR_TOKENIZER = "unit_separator_tokenizer";
public static final String WORD_GRAM_TOKENIZER = "word_gram_tokenizer";
// Do not remove the space, needed for multi-term synonyms
public static final List<String> ALPHANUM_SPACE_PATTERNS = ImmutableList.of(
"([a-z0-9 _-]{2,})",
@ -161,6 +169,13 @@ public class SettingsBuilder {
AUTOCOMPLETE_CUSTOM_DELIMITER,
LOWERCASE);
public static final List<String> WORD_GRAM_TOKEN_FILTERS = ImmutableList.of(
ASCII_FOLDING,
LOWERCASE,
TRIM,
REMOVE_QUOTES
);
public final Map<String, Object> settings;
public SettingsBuilder(String mainTokenizer) {
@ -275,6 +290,17 @@ public class SettingsBuilder {
.collect(Collectors.toList()))
.build());
}
for (Map.Entry<String, Integer> entry : Map.of(WORD_GRAM_2_FILTER, 2, WORD_GRAM_3_FILTER, 3, WORD_GRAM_4_FILTER, 4).entrySet()) {
String filterName = entry.getKey();
Integer gramSize = entry.getValue();
filters.put(filterName, ImmutableMap.<String, Object>builder()
.put(TYPE, SHINGLE)
.put("min_shingle_size", gramSize)
.put("max_shingle_size", gramSize)
.put("output_unigrams", false)
.build());
}
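// Illustration, based on the word-gram analyzer tests in this commit: with min_shingle_size == max_shingle_size == N
// and output_unigrams == false, each filter emits only N-token shingles. For the token stream
// [hello, cat, cool, customer]:
//   word_gram_2_filter -> "hello cat", "cat cool", "cool customer"
//   word_gram_3_filter -> "hello cat cool", "cat cool customer"
//   word_gram_4_filter -> "hello cat cool customer"
// A stream with fewer than N tokens therefore produces no output for that gram size.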
}
return filters.build();
@ -302,13 +328,24 @@ public class SettingsBuilder {
.put(DELIMITER, "")
.build());
-// Tokenize by whitespace and most special chars
+// Tokenize by most special chars
+// Do NOT tokenize by whitespace to keep multi-word synonyms in the same token
+// The split by whitespace is done later in the token filters phase
tokenizers.put(MAIN_TOKENIZER,
ImmutableMap.<String, Object>builder()
.put(TYPE, PATTERN)
.put(PATTERN, "[(),./:]")
.build());
// Tokenize by whitespace and most special chars for word grams
// Only split on - when not preceded by whitespace, to preserve exclusion functionality,
// i.e. "logging-events-bckp" and "logging-events -bckp" should be handled differently
tokenizers.put(WORD_GRAM_TOKENIZER,
ImmutableMap.<String, Object>builder()
.put(TYPE, PATTERN)
.put(PATTERN, "[(),./:\\s_]|(?<=\\S)(-)")
.build());
return tokenizers.build();
}
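The behaviour of this pattern can be sanity-checked outside Elasticsearch. A minimal sketch follows (the class name is illustrative; plain String.split is assumed to approximate the pattern tokenizer closely enough for these two inputs):

import java.util.Arrays;

public class WordGramTokenizerSketch {
    public static void main(String[] args) {
        String pattern = "[(),./:\\s_]|(?<=\\S)(-)";
        // '-' between word characters is a delimiter...
        System.out.println(Arrays.asList("logging-events-bckp".split(pattern)));  // [logging, events, bckp]
        // ...but '-' after whitespace is kept, preserving the exclusion token "-bckp"
        System.out.println(Arrays.asList("logging-events -bckp".split(pattern))); // [logging, events, -bckp]
    }
}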
@ -382,6 +419,21 @@ public class SettingsBuilder {
.put(FILTER, SEARCH_TOKEN_FILTERS)
.build());
// Support word grams
for (Map.Entry<String, String> entry : Map.of(
WORD_GRAM_2_ANALYZER, WORD_GRAM_2_FILTER,
WORD_GRAM_3_ANALYZER, WORD_GRAM_3_FILTER,
WORD_GRAM_4_ANALYZER, WORD_GRAM_4_FILTER).entrySet()) {
String analyzerName = entry.getKey();
String filterName = entry.getValue();
analyzers.put(analyzerName, ImmutableMap.<String, Object>builder()
.put(TOKENIZER, WORD_GRAM_TOKENIZER)
.put(FILTER, ImmutableList.<Object>builder()
.addAll(WORD_GRAM_TOKEN_FILTERS)
.add(filterName).build())
.build());
}
// For special analysis, the substitution can be read from the configuration (chinese tokenizer: ik_smart / smartCN)
// Analyzer for partial matching (i.e. autocomplete) - Prefix matching of each token
analyzers.put(PARTIAL_ANALYZER, ImmutableMap.<String, Object>builder()
@ -395,6 +447,7 @@ public class SettingsBuilder {
.put(FILTER, PARTIAL_AUTOCOMPLETE_TOKEN_FILTERS)
.build());
return analyzers.build();
}
}

View File

@ -11,11 +11,8 @@ import javax.annotation.Nonnull;
import java.util.Set;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_HIERARCHY_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_V2_HIERARCHY_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*;
@Builder
@Getter
@ -33,7 +30,8 @@ public class SearchFieldConfig {
private static final Set<SearchableAnnotation.FieldType> TYPES_WITH_DELIMITED_SUBFIELD =
Set.of(
SearchableAnnotation.FieldType.TEXT,
-SearchableAnnotation.FieldType.TEXT_PARTIAL
+SearchableAnnotation.FieldType.TEXT_PARTIAL,
+SearchableAnnotation.FieldType.WORD_GRAM
// NOT URN_PARTIAL (urn field is special)
);
// NOT comprehensive
@ -56,6 +54,7 @@ public class SearchFieldConfig {
SearchableAnnotation.FieldType.TEXT,
SearchableAnnotation.FieldType.TEXT_PARTIAL,
SearchableAnnotation.FieldType.KEYWORD,
SearchableAnnotation.FieldType.WORD_GRAM,
// not analyzed
SearchableAnnotation.FieldType.BOOLEAN,
SearchableAnnotation.FieldType.COUNT,
@ -69,6 +68,11 @@ public class SearchFieldConfig {
SearchableAnnotation.FieldType.URN_PARTIAL
);
public static final Set<SearchableAnnotation.FieldType> TYPES_WITH_WORD_GRAM =
Set.of(
SearchableAnnotation.FieldType.WORD_GRAM
);
@Nonnull
private final String fieldName;
@Nonnull
@ -78,9 +82,11 @@ public class SearchFieldConfig {
private final String analyzer;
private boolean hasKeywordSubfield;
private boolean hasDelimitedSubfield;
private boolean hasWordGramSubfields;
private boolean isQueryByDefault;
private boolean isDelimitedSubfield;
private boolean isKeywordSubfield;
private boolean isWordGramSubfield;
public static SearchFieldConfig detectSubFieldType(@Nonnull SearchableFieldSpec fieldSpec) {
final SearchableAnnotation searchableAnnotation = fieldSpec.getSearchableAnnotation();
@ -106,6 +112,7 @@ public class SearchFieldConfig {
.analyzer(getAnalyzer(fieldName, fieldType))
.hasKeywordSubfield(hasKeywordSubfield(fieldName, fieldType))
.hasDelimitedSubfield(hasDelimitedSubfield(fieldName, fieldType))
.hasWordGramSubfields(hasWordGramSubfields(fieldName, fieldType))
.isQueryByDefault(isQueryByDefault)
.build();
}
@ -118,6 +125,11 @@ public class SearchFieldConfig {
return !fieldName.contains(".")
&& ("urn".equals(fieldName) || TYPES_WITH_DELIMITED_SUBFIELD.contains(fieldType));
}
private static boolean hasWordGramSubfields(String fieldName, SearchableAnnotation.FieldType fieldType) {
return !fieldName.contains(".")
&& (TYPES_WITH_WORD_GRAM.contains(fieldType));
}
private static boolean hasKeywordSubfield(String fieldName, SearchableAnnotation.FieldType fieldType) {
return !"urn".equals(fieldName)
&& !fieldName.contains(".")
@ -155,6 +167,7 @@ public class SearchFieldConfig {
this.fieldName = fieldName;
isDelimitedSubfield(fieldName.endsWith(".delimited"));
isKeywordSubfield(fieldName.endsWith(".keyword"));
isWordGramSubfield(fieldName.contains("wordGrams"));
shortName(fieldName.split("[.]")[0]);
return this;
}

View File

@ -3,6 +3,7 @@ package com.linkedin.metadata.search.elasticsearch.query.request;
import com.linkedin.metadata.config.search.ExactMatchConfiguration;
import com.linkedin.metadata.config.search.PartialConfiguration;
import com.linkedin.metadata.config.search.SearchConfiguration;
import com.linkedin.metadata.config.search.WordGramConfiguration;
import com.linkedin.metadata.config.search.custom.BoolQueryConfiguration;
import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration;
import com.linkedin.metadata.config.search.custom.QueryConfiguration;
@ -51,6 +52,9 @@ import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders;
import org.elasticsearch.search.SearchModule;
import static com.linkedin.metadata.models.SearchableFieldSpecExtractor.PRIMARY_URN_SEARCH_PROPERTIES;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*;
import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.*;
@Slf4j
public class SearchQueryBuilder {
@ -69,6 +73,7 @@ public class SearchQueryBuilder {
public static final String STRUCTURED_QUERY_PREFIX = "\\\\/q ";
private final ExactMatchConfiguration exactMatchConfiguration;
private final PartialConfiguration partialConfiguration;
private final WordGramConfiguration wordGramConfiguration;
private final CustomizedQueryHandler customizedQueryHandler;
@ -76,6 +81,7 @@ public class SearchQueryBuilder {
@Nullable CustomSearchConfiguration customSearchConfiguration) {
this.exactMatchConfiguration = searchConfiguration.getExactMatch();
this.partialConfiguration = searchConfiguration.getPartial();
this.wordGramConfiguration = searchConfiguration.getWordGram();
this.customizedQueryHandler = CustomizedQueryHandler.builder(customSearchConfiguration).build();
}
@ -148,6 +154,36 @@ public class SearchQueryBuilder {
fields.add(SearchFieldConfig.detectSubFieldType(searchFieldConfig.fieldName() + ".delimited",
searchFieldConfig.boost() * partialConfiguration.getFactor(),
searchableAnnotation.getFieldType(), searchableAnnotation.isQueryByDefault()));
if (SearchFieldConfig.detectSubFieldType(fieldSpec).hasWordGramSubfields()) {
fields.add(SearchFieldConfig.builder()
.fieldName(searchFieldConfig.fieldName() + ".wordGrams2")
.boost(searchFieldConfig.boost() * wordGramConfiguration.getTwoGramFactor())
.analyzer(WORD_GRAM_2_ANALYZER)
.hasKeywordSubfield(true)
.hasDelimitedSubfield(true)
.hasWordGramSubfields(true)
.isQueryByDefault(true)
.build());
fields.add(SearchFieldConfig.builder()
.fieldName(searchFieldConfig.fieldName() + ".wordGrams3")
.boost(searchFieldConfig.boost() * wordGramConfiguration.getThreeGramFactor())
.analyzer(WORD_GRAM_3_ANALYZER)
.hasKeywordSubfield(true)
.hasDelimitedSubfield(true)
.hasWordGramSubfields(true)
.isQueryByDefault(true)
.build());
fields.add(SearchFieldConfig.builder()
.fieldName(searchFieldConfig.fieldName() + ".wordGrams4")
.boost(searchFieldConfig.boost() * wordGramConfiguration.getFourGramFactor())
.analyzer(WORD_GRAM_4_ANALYZER)
.hasKeywordSubfield(true)
.hasDelimitedSubfield(true)
.hasWordGramSubfields(true)
.isQueryByDefault(true)
.build());
}
}
}
@ -188,7 +224,7 @@ public class SearchQueryBuilder {
.filter(SearchFieldConfig::isQueryByDefault)
.collect(Collectors.groupingBy(SearchFieldConfig::analyzer));
-analyzerGroup.keySet().stream().sorted().forEach(analyzer -> {
+analyzerGroup.keySet().stream().sorted().filter(str -> !str.contains("word_gram")).forEach(analyzer -> {
List<SearchFieldConfig> fieldConfigs = analyzerGroup.get(analyzer);
SimpleQueryStringBuilder simpleBuilder = QueryBuilders.simpleQueryStringQuery(sanitizedQuery);
simpleBuilder.analyzer(analyzer);
@ -253,6 +289,13 @@ public class SearchQueryBuilder {
* exactMatchConfiguration.getCaseSensitivityFactor())
.queryName(searchFieldConfig.fieldName()));
}
if (searchFieldConfig.isWordGramSubfield() && isPrefixQuery) {
finalQuery.should(QueryBuilders
.matchPhraseQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery)
.boost(searchFieldConfig.boost() * getWordGramFactor(searchFieldConfig.fieldName()))
.queryName(searchFieldConfig.shortName()));
}
});
return finalQuery.should().size() > 0 ? Optional.of(finalQuery) : Optional.empty();
@ -377,4 +420,15 @@ public class SearchQueryBuilder {
throw new RuntimeException(e);
}
}
public float getWordGramFactor(String fieldName) {
if (fieldName.endsWith("Grams2")) {
return wordGramConfiguration.getTwoGramFactor();
} else if (fieldName.endsWith("Grams3")) {
return wordGramConfiguration.getThreeGramFactor();
} else if (fieldName.endsWith("Grams4")) {
return wordGramConfiguration.getFourGramFactor();
}
throw new IllegalArgumentException(fieldName + " does not end with Grams[2-4]");
}
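// Worked example of the effective boost, using the default factors (e.g. twoGramFactor = 1.2):
// the wordGrams2 subfield is registered above with boost = fieldBoost * twoGramFactor, and the
// word-gram match-phrase clause multiplies by getWordGramFactor(...) again, so for a field boost
// of 1.0 the "<field>.wordGrams2" phrase match carries 1.0 * 1.2 * 1.2 = 1.44 (1.5 * 1.5 = 2.25
// for 3-grams, 1.8 * 1.8 ~= 3.24 for 4-grams), which is what SearchQueryBuilderTest asserts.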
}

View File

@ -6,6 +6,7 @@ import com.linkedin.metadata.config.search.ElasticSearchConfiguration;
import com.linkedin.metadata.config.search.ExactMatchConfiguration;
import com.linkedin.metadata.config.search.PartialConfiguration;
import com.linkedin.metadata.config.search.SearchConfiguration;
import com.linkedin.metadata.config.search.WordGramConfiguration;
import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration;
import com.linkedin.metadata.models.registry.ConfigEntityRegistry;
import com.linkedin.metadata.models.registry.EntityRegistry;
@ -55,11 +56,17 @@ public class ESTestConfiguration {
exactMatchConfiguration.setCaseSensitivityFactor(0.7f);
exactMatchConfiguration.setEnableStructured(true);
WordGramConfiguration wordGramConfiguration = new WordGramConfiguration();
wordGramConfiguration.setTwoGramFactor(1.2f);
wordGramConfiguration.setThreeGramFactor(1.5f);
wordGramConfiguration.setFourGramFactor(1.8f);
PartialConfiguration partialConfiguration = new PartialConfiguration();
partialConfiguration.setFactor(0.4f);
partialConfiguration.setUrnFactor(0.5f);
searchConfiguration.setExactMatch(exactMatchConfiguration);
searchConfiguration.setWordGram(wordGramConfiguration);
searchConfiguration.setPartial(partialConfiguration);
return searchConfiguration;
}

View File

@ -116,15 +116,7 @@ public class ElasticSearchGoldenTest extends AbstractTestNGSpringContextTests {
assertTrue(fourthResultMatchedFields.toString().contains("ReturnRate"));
}
-/**
-*
-* The test below should be added back in as improvements are made to search,
-* via the linked tickets.
-*
-**/
-// TODO: enable once PFP-481 is complete
-@Test(enabled = false)
+@Test
public void testNameMatchPartiallyQualified() {
/*
Searching for "analytics.pet_details" (partially qualified) should return the fully qualified table
@ -140,4 +132,9 @@ public class ElasticSearchGoldenTest extends AbstractTestNGSpringContextTests {
assertTrue(secondResultUrn.toString().contains("dbt,long_tail_companions.analytics.pet_details"));
}
/*
* Tests that should pass but do not yet can be added below here, with the following annotation:
* @Test(enabled = false)
**/
}

View File

@ -358,6 +358,84 @@ public class SampleDataFixtureTests extends AbstractTestNGSpringContextTests {
}).collect(Collectors.toList());
}
@Test
public void testNegateAnalysis() throws IOException {
String queryWithMinus = "logging_events -bckp";
AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer(
"smpldat_datasetindex_v2",
"query_word_delimited", queryWithMinus
);
assertEquals(getTokens(request)
.map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()),
List.of("logging_events -bckp", "logging_ev", "-bckp", "log", "event", "bckp"));
request = AnalyzeRequest.withIndexAnalyzer(
"smpldat_datasetindex_v2",
"word_gram_3", queryWithMinus
);
assertEquals(getTokens(request)
.map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("logging events -bckp"));
request = AnalyzeRequest.withIndexAnalyzer(
"smpldat_datasetindex_v2",
"word_gram_4", queryWithMinus
);
assertEquals(getTokens(request)
.map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of());
}
@Test
public void testWordGram() throws IOException {
String text = "hello.cat_cool_customer";
AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", text);
assertEquals(getTokens(request)
.map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat", "cat cool", "cool customer"));
request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", text);
assertEquals(getTokens(request)
.map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool", "cat cool customer"));
request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", text);
assertEquals(getTokens(request)
.map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool customer"));
String testMoreSeparators = "quick.brown:fox jumped-LAZY_Dog";
request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", testMoreSeparators);
assertEquals(getTokens(request)
.map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()),
List.of("quick brown", "brown fox", "fox jumped", "jumped lazy", "lazy dog"));
request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", testMoreSeparators);
assertEquals(getTokens(request)
.map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()),
List.of("quick brown fox", "brown fox jumped", "fox jumped lazy", "jumped lazy dog"));
request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", testMoreSeparators);
assertEquals(getTokens(request)
.map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()),
List.of("quick brown fox jumped", "brown fox jumped lazy", "fox jumped lazy dog"));
String textWithQuotesAndDuplicateWord = "\"my_db.my_exact_table\"";
request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithQuotesAndDuplicateWord);
assertEquals(getTokens(request)
.map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db", "db my", "my exact", "exact table"));
request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", textWithQuotesAndDuplicateWord);
assertEquals(getTokens(request)
.map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my", "db my exact", "my exact table"));
request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", textWithQuotesAndDuplicateWord);
assertEquals(getTokens(request)
.map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my exact", "db my exact table"));
String textWithParens = "(hi) there";
request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithParens);
assertEquals(getTokens(request)
.map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hi there"));
String oneWordText = "hello";
for (String analyzer : List.of("word_gram_2", "word_gram_3", "word_gram_4")) {
request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", analyzer, oneWordText);
assertEquals(getTokens(request)
.map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of());
}
}
@Test
public void testUrnSynonym() throws IOException {
List<String> expectedTokens = List.of("bigquery");
@ -1267,6 +1345,53 @@ public class SampleDataFixtureTests extends AbstractTestNGSpringContextTests {
String.format("%s - Expected search results to include matched fields", query));
assertEquals(result.getEntities().size(), 2);
}
@Test
public void testGram() {
String query = "jaffle shop customers";
SearchResult result = searchAcrossEntities(searchService, query);
assertTrue(result.hasEntities() && !result.getEntities().isEmpty(),
String.format("%s - Expected search results", query));
assertEquals(result.getEntities().get(0).getEntity().toString(),
"urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers,PROD)",
"Expected exact match in 1st position");
query = "shop customers source";
result = searchAcrossEntities(searchService, query);
assertTrue(result.hasEntities() && !result.getEntities().isEmpty(),
String.format("%s - Expected search results", query));
assertEquals(result.getEntities().get(0).getEntity().toString(),
"urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers_source,PROD)",
"Expected ngram match in 1st position");
query = "jaffle shop stg customers";
result = searchAcrossEntities(searchService, query);
assertTrue(result.hasEntities() && !result.getEntities().isEmpty(),
String.format("%s - Expected search results", query));
assertEquals(result.getEntities().get(0).getEntity().toString(),
"urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.stg_customers,PROD)",
"Expected ngram match in 1st position");
query = "jaffle shop transformers customers";
result = searchAcrossEntities(searchService, query);
assertTrue(result.hasEntities() && !result.getEntities().isEmpty(),
String.format("%s - Expected search results", query));
assertEquals(result.getEntities().get(0).getEntity().toString(),
"urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.transformers_customers,PROD)",
"Expected ngram match in 1st position");
query = "shop raw customers";
result = searchAcrossEntities(searchService, query);
assertTrue(result.hasEntities() && !result.getEntities().isEmpty(),
String.format("%s - Expected search results", query));
assertEquals(result.getEntities().get(0).getEntity().toString(),
"urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_customers,PROD)",
"Expected ngram match in 1st position");
}
@Test
public void testPrefixVsExact() {

View File

@ -16,7 +16,7 @@ public class MappingsBuilderTest {
Map<String, Object> result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec());
assertEquals(result.size(), 1);
Map<String, Object> properties = (Map<String, Object>) result.get("properties");
-assertEquals(properties.size(), 17);
+assertEquals(properties.size(), 18);
assertEquals(properties.get("urn"), ImmutableMap.of("type", "keyword",
"fields",
ImmutableMap.of("delimited",
@ -76,6 +76,19 @@ public class MappingsBuilderTest {
assertTrue(textArrayFieldSubfields.containsKey("ngram"));
assertTrue(textArrayFieldSubfields.containsKey("keyword"));
// WORD_GRAM
Map<String, Object> wordGramField = (Map<String, Object>) properties.get("wordGramField");
assertEquals(wordGramField.get("type"), "keyword");
assertEquals(wordGramField.get("normalizer"), "keyword_normalizer");
Map<String, Object> wordGramFieldSubfields = (Map<String, Object>) wordGramField.get("fields");
assertEquals(wordGramFieldSubfields.size(), 6);
assertTrue(wordGramFieldSubfields.containsKey("delimited"));
assertTrue(wordGramFieldSubfields.containsKey("ngram"));
assertTrue(wordGramFieldSubfields.containsKey("keyword"));
assertTrue(wordGramFieldSubfields.containsKey("wordGrams2"));
assertTrue(wordGramFieldSubfields.containsKey("wordGrams3"));
assertTrue(wordGramFieldSubfields.containsKey("wordGrams4"));
// URN
Map<String, Object> foreignKey = (Map<String, Object>) properties.get("foreignKey");
assertEquals(foreignKey.get("type"), "text");

View File

@ -4,6 +4,7 @@ import com.linkedin.metadata.config.search.CustomConfiguration;
import com.linkedin.metadata.config.search.ExactMatchConfiguration;
import com.linkedin.metadata.config.search.PartialConfiguration;
import com.linkedin.metadata.config.search.SearchConfiguration;
import com.linkedin.metadata.config.search.WordGramConfiguration;
import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration;
import com.fasterxml.jackson.dataformat.yaml.YAMLMapper;
import com.google.common.collect.ImmutableList;
@ -18,6 +19,7 @@ import com.linkedin.util.Pair;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchAllQueryBuilder;
import org.elasticsearch.index.query.MatchPhrasePrefixQueryBuilder;
import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryStringQueryBuilder;
import org.elasticsearch.index.query.SimpleQueryStringBuilder;
@ -46,11 +48,17 @@ public class SearchQueryBuilderTest {
exactMatchConfiguration.setCaseSensitivityFactor(0.7f);
exactMatchConfiguration.setEnableStructured(true);
WordGramConfiguration wordGramConfiguration = new WordGramConfiguration();
wordGramConfiguration.setTwoGramFactor(1.2f);
wordGramConfiguration.setThreeGramFactor(1.5f);
wordGramConfiguration.setFourGramFactor(1.8f);
PartialConfiguration partialConfiguration = new PartialConfiguration();
partialConfiguration.setFactor(0.4f);
partialConfiguration.setUrnFactor(0.7f);
testQueryConfig.setExactMatch(exactMatchConfiguration);
testQueryConfig.setWordGram(wordGramConfiguration);
testQueryConfig.setPartial(partialConfiguration);
}
public static final SearchQueryBuilder TEST_BUILDER = new SearchQueryBuilder(testQueryConfig, null);
@ -70,16 +78,17 @@ public class SearchQueryBuilderTest {
assertEquals(keywordQuery.value(), "testQuery");
assertEquals(keywordQuery.analyzer(), "keyword");
Map<String, Float> keywordFields = keywordQuery.fields();
-assertEquals(keywordFields.size(), 8);
+assertEquals(keywordFields.size(), 9);
assertEquals(keywordFields, Map.of(
"urn", 10.f,
"textArrayField", 1.0f,
"customProperties", 1.0f,
"nestedArrayArrayField", 1.0f,
"textFieldOverride", 1.0f,
"nestedArrayStringField", 1.0f,
"keyPart1", 10.0f,
"esObjectField", 1.0f
"urn", 10.f,
"textArrayField", 1.0f,
"customProperties", 1.0f,
"wordGramField", 1.0f,
"nestedArrayArrayField", 1.0f,
"textFieldOverride", 1.0f,
"nestedArrayStringField", 1.0f,
"keyPart1", 10.0f,
"esObjectField", 1.0f
));
SimpleQueryStringBuilder urnComponentQuery = (SimpleQueryStringBuilder) analyzerGroupQuery.should().get(1);
@ -99,7 +108,8 @@ public class SearchQueryBuilderTest {
"nestedArrayArrayField.delimited", 0.4f,
"urn.delimited", 7.0f,
"textArrayField.delimited", 0.4f,
"nestedArrayStringField.delimited", 0.4f
"nestedArrayStringField.delimited", 0.4f,
"wordGramField.delimited", 0.4f
));
BoolQueryBuilder boolPrefixQuery = (BoolQueryBuilder) shouldQueries.get(1);
@ -109,21 +119,30 @@ public class SearchQueryBuilderTest {
if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) {
MatchPhrasePrefixQueryBuilder builder = (MatchPhrasePrefixQueryBuilder) prefixQuery;
return Pair.of(builder.fieldName(), builder.boost());
-} else {
+} else if (prefixQuery instanceof TermQueryBuilder) {
// exact
TermQueryBuilder builder = (TermQueryBuilder) prefixQuery;
return Pair.of(builder.fieldName(), builder.boost());
} else { // if (prefixQuery instanceof MatchPhraseQueryBuilder) {
// ngram
MatchPhraseQueryBuilder builder = (MatchPhraseQueryBuilder) prefixQuery;
return Pair.of(builder.fieldName(), builder.boost());
}
}).collect(Collectors.toList());
-assertEquals(prefixFieldWeights.size(), 22);
+assertEquals(prefixFieldWeights.size(), 28);
List.of(
Pair.of("urn", 100.0f),
Pair.of("urn", 70.0f),
Pair.of("keyPart1.delimited", 16.8f),
Pair.of("keyPart1.keyword", 100.0f),
Pair.of("keyPart1.keyword", 70.0f)
Pair.of("keyPart1.keyword", 70.0f),
Pair.of("wordGramField.wordGrams2", 1.44f),
Pair.of("wordGramField.wordGrams3", 2.25f),
Pair.of("wordGramField.wordGrams4", 3.2399998f),
Pair.of("wordGramField.keyword", 10.0f),
Pair.of("wordGramField.keyword", 7.0f)
).forEach(p -> assertTrue(prefixFieldWeights.contains(p), "Missing: " + p));
// Validate scorer
@ -144,7 +163,7 @@ public class SearchQueryBuilderTest {
assertEquals(keywordQuery.queryString(), "testQuery");
assertNull(keywordQuery.analyzer());
Map<String, Float> keywordFields = keywordQuery.fields();
-assertEquals(keywordFields.size(), 16);
+assertEquals(keywordFields.size(), 21);
assertEquals(keywordFields.get("keyPart1").floatValue(), 10.0f);
assertFalse(keywordFields.containsKey("keyPart3"));
assertEquals(keywordFields.get("textFieldOverride").floatValue(), 1.0f);
@ -196,10 +215,14 @@ public class SearchQueryBuilderTest {
List<QueryBuilder> queries = boolPrefixQuery.should().stream().map(prefixQuery -> {
if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) {
// prefix
return (MatchPhrasePrefixQueryBuilder) prefixQuery;
-} else {
+} else if (prefixQuery instanceof TermQueryBuilder) {
// exact
return (TermQueryBuilder) prefixQuery;
} else { // if (prefixQuery instanceof MatchPhraseQueryBuilder) {
// ngram
return (MatchPhraseQueryBuilder) prefixQuery;
}
}).collect(Collectors.toList());

View File

@ -7,6 +7,7 @@ import com.google.common.collect.ImmutableList;
import com.linkedin.data.template.StringArray;
import com.linkedin.metadata.ESTestConfiguration;
import com.linkedin.metadata.TestEntitySpecBuilder;
import com.linkedin.metadata.config.search.WordGramConfiguration;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
@ -65,11 +66,17 @@ public class SearchRequestHandlerTest extends AbstractTestNGSpringContextTests {
exactMatchConfiguration.setCaseSensitivityFactor(0.7f);
exactMatchConfiguration.setEnableStructured(true);
WordGramConfiguration wordGramConfiguration = new WordGramConfiguration();
wordGramConfiguration.setTwoGramFactor(1.2f);
wordGramConfiguration.setThreeGramFactor(1.5f);
wordGramConfiguration.setFourGramFactor(1.8f);
PartialConfiguration partialConfiguration = new PartialConfiguration();
partialConfiguration.setFactor(0.4f);
partialConfiguration.setUrnFactor(0.7f);
testQueryConfig.setExactMatch(exactMatchConfiguration);
testQueryConfig.setWordGram(wordGramConfiguration);
testQueryConfig.setPartial(partialConfiguration);
}
@ -113,10 +120,10 @@ public class SearchRequestHandlerTest extends AbstractTestNGSpringContextTests {
HighlightBuilder highlightBuilder = sourceBuilder.highlighter();
List<String> fields =
highlightBuilder.fields().stream().map(HighlightBuilder.Field::name).collect(Collectors.toList());
-assertEquals(fields.size(), 20);
+assertEquals(fields.size(), 22);
List<String> highlightableFields =
ImmutableList.of("keyPart1", "textArrayField", "textFieldOverride", "foreignKey", "nestedForeignKey",
"nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField");
"nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField", "wordGramField");
highlightableFields.forEach(field -> {
assertTrue(fields.contains(field), "Missing: " + field);
assertTrue(fields.contains(field + ".*"), "Missing: " + field + ".*");

View File

@ -20,7 +20,7 @@ record ChartInfo includes CustomProperties, ExternalReference {
* Title of the chart
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true
}
title: string

View File

@ -15,7 +15,7 @@ record ContainerProperties includes CustomProperties, ExternalReference {
* Display name of the Asset Container
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@ -25,7 +25,7 @@ record ContainerProperties includes CustomProperties, ExternalReference {
* Fully-qualified name of the Container
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@ -61,4 +61,4 @@ record ContainerProperties includes CustomProperties, ExternalReference {
}
}
lastModified: optional TimeStamp
}
}

View File

@ -22,7 +22,7 @@ record DashboardInfo includes CustomProperties, ExternalReference {
* Title of the dashboard
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@ -126,4 +126,4 @@ record DashboardInfo includes CustomProperties, ExternalReference {
* The time when this dashboard last refreshed
*/
lastRefreshed: optional Time
}
}

View File

@ -17,7 +17,7 @@ record DataFlowInfo includes CustomProperties, ExternalReference {
* Flow name
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}

View File

@ -18,7 +18,7 @@ record DataJobInfo includes CustomProperties, ExternalReference {
* Job name
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}

View File

@ -15,7 +15,7 @@ record DataPlatformInfo {
*/
@validate.strlen.max = 15
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": false,
"boostScore": 10.0
}
@ -25,7 +25,7 @@ record DataPlatformInfo {
* The name that will be used for displaying a platform type.
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}

View File

@ -16,7 +16,7 @@ record DataPlatformInstanceProperties includes CustomProperties, ExternalReferen
* Display name of the Data Platform Instance
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}

View File

@ -19,7 +19,7 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc
* Process name
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}

View File

@ -13,7 +13,7 @@ record DataProductProperties includes CustomProperties, ExternalReference {
* Display name of the Data Product
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}

View File

@ -17,7 +17,7 @@ record DatasetProperties includes CustomProperties, ExternalReference {
* Display name of the Dataset
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@ -27,7 +27,7 @@ record DatasetProperties includes CustomProperties, ExternalReference {
* Fully-qualified name of the Dataset
*/
@Searchable = {
"fieldType": "TEXT",
"fieldType": "WORD_GRAM",
"addToFilters": false,
"enableAutocomplete": true,
"boostScore": 10.0
@ -77,4 +77,4 @@ record DatasetProperties includes CustomProperties, ExternalReference {
*/
@deprecated = "Use GlobalTags aspect instead."
tags: array[string] = [ ]
}
}

View File

@ -14,7 +14,7 @@ record DomainProperties {
* Display name of the Domain
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}

View File

@ -35,7 +35,7 @@ record GlossaryNodeInfo {
*/
@Searchable = {
"fieldName": "displayName",
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@ -49,4 +49,4 @@ record GlossaryNodeInfo {
}
id: optional string
}
}

View File

@ -23,7 +23,7 @@ record GlossaryTermInfo includes CustomProperties {
* Display name of the term
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@ -75,4 +75,4 @@ record GlossaryTermInfo includes CustomProperties {
*/
@deprecated
rawSchema: optional string
}
}

View File

@ -45,7 +45,7 @@ record CorpUserEditableInfo {
* DataHub-native display name
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"queryByDefault": true,
"boostScore": 10.0
}

View File

@ -26,7 +26,7 @@ record CorpUserInfo includes CustomProperties {
* displayName of this user , e.g. Hang Zhang(DataHQ)
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"queryByDefault": true,
"enableAutocomplete": true,
"boostScore": 10.0
@ -89,7 +89,7 @@ record CorpUserInfo includes CustomProperties {
* Common name of this user, format is firstName + lastName (split by a whitespace)
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"queryByDefault": true,
"enableAutocomplete": true,
"boostScore": 10.0

View File

@ -11,10 +11,10 @@ record CorpGroupKey {
* The URL-encoded name of the AD/LDAP group. Serves as a globally unique identifier within DataHub.
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"queryByDefault": true,
"enableAutocomplete": true,
"boostScore": 10.0
}
name: string
}
}

View File

@ -12,7 +12,7 @@ record CorpUserKey {
*/
@Searchable = {
"fieldName": "ldap",
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"boostScore": 2.0,
"enableAutocomplete": true
}

View File

@ -19,7 +19,7 @@ record DataFlowKey {
* Unique Identifier of the data flow
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true
}
flowId: string
@ -31,4 +31,4 @@ record DataFlowKey {
"fieldType": "TEXT_PARTIAL"
}
cluster: string
}
}

View File

@ -27,7 +27,7 @@ record DataJobKey {
* Unique Identifier of the data job
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true
}
jobId: string

View File

@ -13,7 +13,7 @@ record DataProcessKey {
* Process name i.e. an ETL job name
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 4.0
}
@ -37,4 +37,4 @@ record DataProcessKey {
"queryByDefault": false
}
origin: FabricType
}
}

View File

@ -25,7 +25,7 @@ record DatasetKey {
//This is no longer to be used for Dataset native name. Use name, qualifiedName from DatasetProperties instead.
@Searchable = {
"fieldName": "id"
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}

View File

@ -12,9 +12,9 @@ import com.linkedin.common.FabricType
record GlossaryNodeKey {
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true
}
name: string
}
}

View File

@ -13,10 +13,10 @@ record GlossaryTermKey {
* The term name, which serves as a unique id
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"fieldName": "id"
}
name: string
}
}

View File

@ -20,9 +20,9 @@ record MLFeatureKey {
* Name of the feature
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 8.0
}
name: string
}
}

View File

@ -22,9 +22,9 @@ record MLFeatureTableKey {
* Name of the feature table
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 8.0
}
name: string
}
}

View File

@ -19,7 +19,7 @@ record MLModelDeploymentKey {
* Name of the MLModelDeployment
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@ -35,4 +35,4 @@ record MLModelDeploymentKey {
"queryByDefault": false
}
origin: FabricType
}
}

View File

@ -19,7 +19,7 @@ record MLModelGroupKey {
* Name of the MLModelGroup
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@ -33,4 +33,4 @@ record MLModelGroupKey {
"queryByDefault": false
}
origin: FabricType
}
}

View File

@ -19,7 +19,7 @@ record MLModelKey {
* Name of the MLModel
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@ -35,4 +35,4 @@ record MLModelKey {
"queryByDefault": false
}
origin: FabricType
}
}

View File

@ -21,9 +21,9 @@ record MLPrimaryKeyKey {
* Name of the primary key
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 8.0
}
name: string
}
}

View File

@ -11,10 +11,10 @@ record TagKey {
* The tag name, which serves as a unique id
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0,
"fieldName": "id"
}
name: string
}
}

View File

@ -18,7 +18,7 @@ record NotebookInfo includes CustomProperties, ExternalReference {
* Title of the Notebook
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}

View File

@ -14,7 +14,7 @@ record OwnershipTypeInfo {
* Display name of the Ownership Type
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@ -54,4 +54,4 @@ record OwnershipTypeInfo {
}
}
lastModified: AuditStamp
}
}

View File

@ -29,7 +29,7 @@ record QueryProperties {
* Optional display name to identify the query.
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@ -69,4 +69,4 @@ record QueryProperties {
}
}
lastModified: AuditStamp
}
}

View File

@ -14,7 +14,7 @@ record RoleProperties {
* Display name of the IAM Role in the external system
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}

View File

@ -11,7 +11,7 @@ record TagProperties {
* Display name of the tag
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}

View File

@ -11,4 +11,5 @@ public class SearchConfiguration {
private PartialConfiguration partial;
private CustomConfiguration custom;
private GraphQueryConfiguration graph;
private WordGramConfiguration wordGram;
}

View File

@ -0,0 +1,11 @@
package com.linkedin.metadata.config.search;
import lombok.Data;
@Data
public class WordGramConfiguration {
private float twoGramFactor;
private float threeGramFactor;
private float fourGramFactor;
}
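A minimal sketch of wiring this configuration up by hand, mirroring the test configurations added in this commit (SearchConfiguration is assumed to expose the Lombok-generated setters used there); in production the values come from the wordGram block in the application YAML shown in a later file:

WordGramConfiguration wordGram = new WordGramConfiguration();
wordGram.setTwoGramFactor(1.2f);
wordGram.setThreeGramFactor(1.5f);
wordGram.setFourGramFactor(1.8f);

SearchConfiguration searchConfiguration = new SearchConfiguration();
searchConfiguration.setWordGram(wordGram);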

View File

@ -198,6 +198,10 @@ elasticsearch:
prefixFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_PREFIX_FACTOR:1.6} # boost multiplier when exact prefix
caseSensitivityFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_CASE_FACTOR:0.7} # stacked boost multiplier when case mismatch
enableStructured: ${ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED:true} # enable exact match on structured search
wordGram:
twoGramFactor: ${ELASTICSEARCH_QUERY_TWO_GRAM_FACTOR:1.2} # boost multiplier when match on 2-gram tokens
threeGramFactor: ${ELASTICSEARCH_QUERY_THREE_GRAM_FACTOR:1.5} # boost multiplier when match on 3-gram tokens
fourGramFactor: ${ELASTICSEARCH_QUERY_FOUR_GRAM_FACTOR:1.8} # boost multiplier when match on 4-gram tokens
# Field weight annotations are typically calibrated for exact match, if partial match is possible on the field use these adjustments
partial:
urnFactor: ${ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR:0.5} # multiplier on Urn token match, a partial match on Urn > non-Urn is assumed
@ -318,4 +322,4 @@ cache:
search:
lineage:
ttlSeconds: ${CACHE_SEARCH_LINEAGE_TTL_SECONDS:86400} # 1 day
lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300}
lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300}

View File

@ -25,6 +25,11 @@ record TestEntityInfo includes CustomProperties {
}
textArrayField: optional array[string]
@Searchable = {
"fieldType": "WORD_GRAM"
}
wordGramField: optional string
@Relationship = {
"name": "foreignKey",
"entityTypes": []