feat(elasticsearch): advanced query, identity autocomplete, exact match weight (#7354)

This commit is contained in:
david-leifker 2023-02-20 09:42:17 -06:00 committed by GitHub
parent 8fd2cc5f20
commit 7bbabe0762
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 385 additions and 65 deletions

View File

@ -5,7 +5,7 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability';
<!-- All Feature Guides should begin with `About DataHub ` to improve SEO -->
<!--
Update feature availability; by default, feature availabilty is Self-Hosted and Managed DataHub
Update feature availability; by default, feature availability is Self-Hosted and Managed DataHub
Add in `saasOnly` for Managed DataHub-only features
-->
@ -84,36 +84,47 @@ With better metadata comes better results. Learn more about ingestion technical
The search bar supports advanced queries with pattern matching, logical expressions and filtering by specific field matches.
The following examples are in the format of
X: *typical question* :
```what to key in search bar```. [sample url](https://example.com)
Wildcard characters can be added to the search terms as well. These examples are non exhaustive and using Datasets as a reference.
The following are use cases with example search phrases. Additionally, an example link is provided for our demo instance.
These examples are non-exhaustive and use Datasets as a reference.
If you want to:
1. Find a dataset with the word **mask** in the name:
```name: *mask*``` [Sample results](https://demo.datahubproject.io/search?page=1&query=name%3A%20%2Amask%2A)
This will return entities with **mask** in the name.
Names tends to be connected by other symbols, hence the wildcard symbols before and after the word.
If you want to:
2. Find a dataset with a property, **encoding**
```customProperties: encoding*``` [Sample results](https://demo.datahubproject.io/search?page=1&query=customProperties%3A%20encoding%2A)
Dataset Properties are indexed in ElasticSearch the manner of key=value. Hence if you know the precise key-value pair, you can search using ```key=value```. However, if you only know the key, you can use wildcards to replace the value and that is what is being done here.
- Exact match on term or phrase
- ```"datahub_schema"``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%22datahub_schema%22)
- ```datahub_schema``` [Sample results](https://demo.datahubproject.io/search?page=1&query=datahub_schema)
- Enclosing one or more terms with double quotes will enforce exact matching on these terms, preventing further tokenization.
3. Find a dataset with a column name, **latitude**
```fieldPaths: latitude``` [Sample results](https://demo.datahubproject.io/search?page=1&query=fieldPaths%3A%20latitude)
fieldPaths is the name of the attribute that holds the column name in Datasets.
- Exclude terms
- ```logging -snowflake``` [Sample results](https://demo.datahubproject.io/search?page=1&query=logging%20-snowflake)
- Results can be excluded by term using `-` to negate the term.
4. Find a dataset with the term **latitude** in the field description
```editedFieldDescriptions: latitude OR fieldDescriptions: latitude``` [Sample results](https://demo.datahubproject.io/search?page=1&query=editedFieldDescriptions%3A%20latitude%20OR%20fieldDescriptions%3A%20latitude)
Datasets has 2 attributes that contains field description. fieldDescription comes from the SchemaMetadata aspect, while editedFieldDescriptions comes from the EditableSchemaMetadata aspect. EditableSchemaMetadata holds information that comes from UI edits, while SchemaMetadata holds data from ingestion of the dataset.
- Term boolean logic with precedence
- ```logging + (-snowflake | os_audit_log)``` [Sample results](https://demo.datahubproject.io/search?page=1&query=logging%20%2B%20%28-snowflake%20%7C%20os_audit_log%29)
- `(` `)` can be used to set precedence of boolean term expressions
5. Find a dataset with the term **logical** in the dataset description
```editedDescription: *logical* OR description: *logical*``` [Sample results](https://demo.datahubproject.io/search?page=1&query=editedDescription%3A%20%2Alogical%2A%20OR%20description%3A%20%2Alogical%2A)
Similar to field descriptions, dataset descriptions can be found in 2 aspects, hence the need to search 2 attributes.
- Find a dataset with the word **mask** in the name:
- ```/q name: *mask*``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%2Fq%20name%3A%20%2Amask%2A)
- This will return entities with **mask** in the name. Names tend to be connected by other symbols, hence the wildcard symbols before and after the word.
6. Find a dataset which reside in one of the browsing folders, for instance, the **hive** folder
```browsePaths: *hive*``` [Sample results](https://demo.datahubproject.io/search?page=1&query=browsePaths%3A%20%2Ahive%2A)
BrowsePath is stored as a complete string, for instance ```/datasets/prod/hive/SampleKafkaDataset```, hence the need for wildcards on both ends of the term to return a result.
- Find a dataset with a property, **encoding**
- ```/q customProperties: encoding*``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%2Fq%20customProperties%3A%20encoding%2A)
- Dataset Properties are indexed in ElasticSearch in the manner of key=value. Hence, if you know the precise key-value pair, you can search using ```"key=value"```. However, if you only know the key, you can use wildcards to replace the value, and that is what is being done here.
- Find a dataset with a column name, **latitude**
- ```/q fieldPaths: latitude``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%2Fq%20fieldPaths%3A%20latitude)
- fieldPaths is the name of the attribute that holds the column name in Datasets.
- Find a dataset with the term **latitude** in the field description
- ```/q editedFieldDescriptions: latitude OR fieldDescriptions: latitude``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%2Fq%20editedFieldDescriptions%3A%20latitude%20OR%20fieldDescriptions%3A%20latitude)
- Datasets have 2 attributes that contain field descriptions. fieldDescriptions comes from the SchemaMetadata aspect, while editedFieldDescriptions comes from the EditableSchemaMetadata aspect. EditableSchemaMetadata holds information that comes from UI edits, while SchemaMetadata holds data from ingestion of the dataset.
- Find a dataset with the term **logical** in the dataset description
- ```/q editedDescription: *logical* OR description: *logical*``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%2Fq%20editedDescription%3A%20%2Alogical%2A%20OR%20description%3A%20%2Alogical%2A)
- Similar to field descriptions, dataset descriptions can be found in 2 aspects, hence the need to search 2 attributes.
- Find a dataset which resides in one of the browsing folders, for instance, the **hive** folder
- ```/q browsePaths: *hive*``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%2Fq%20browsePaths%3A%20%2Ahive%2A)
- BrowsePath is stored as a complete string, for instance ```/datasets/prod/hive/SampleKafkaDataset```, hence the need for wildcards on both ends of the term to return a result.
<!--
## Additional Resources

View File

@ -19,7 +19,8 @@ import org.springframework.cache.CacheManager;
import java.util.Optional;
import static com.datahub.util.RecordUtils.*;
import static com.datahub.util.RecordUtils.toJsonString;
import static com.datahub.util.RecordUtils.toRecordTemplate;
@RequiredArgsConstructor

View File

@ -148,8 +148,7 @@ public class SettingsBuilder {
public static final List<String> PARTIAL_AUTOCOMPLETE_TOKEN_FILTERS = ImmutableList.of(
ASCII_FOLDING,
AUTOCOMPLETE_CUSTOM_DELIMITER,
LOWERCASE,
DATAHUB_STOP_WORDS);
LOWERCASE);
public final Map<String, Object> settings;

View File

@ -32,6 +32,9 @@ import static com.linkedin.metadata.models.SearchableFieldSpecExtractor.PRIMARY_
public class SearchQueryBuilder {
public static final String STRUCTURED_QUERY_PREFIX = "\\\\/q ";
public static final float EXACT_MATCH_BOOST_FACTOR = 10.0f;
private static final Set<FieldType> TYPES_WITH_DELIMITED_SUBFIELD =
new HashSet<>(Arrays.asList(FieldType.TEXT, FieldType.TEXT_PARTIAL));
@ -56,13 +59,15 @@ public class SearchQueryBuilder {
private static QueryBuilder buildInternalQuery(@Nonnull EntitySpec entitySpec, @Nonnull String query, boolean fulltext) {
BoolQueryBuilder finalQuery = QueryBuilders.boolQuery();
if (fulltext) {
if (fulltext && !query.startsWith(STRUCTURED_QUERY_PREFIX)) {
SimpleQueryStringBuilder simpleBuilder = QueryBuilders.simpleQueryStringQuery(query.replaceFirst("^:+", ""));
simpleBuilder.defaultOperator(Operator.AND);
getStandardFields(entitySpec).forEach(fieldBoost -> simpleBuilder.field(fieldBoost.getFirst(), fieldBoost.getSecond()));
finalQuery.should(simpleBuilder);
} else {
QueryStringQueryBuilder queryBuilder = QueryBuilders.queryStringQuery(query);
final String withoutQueryPrefix = query.startsWith(STRUCTURED_QUERY_PREFIX) ? query.substring(STRUCTURED_QUERY_PREFIX.length()) : query;
QueryStringQueryBuilder queryBuilder = QueryBuilders.queryStringQuery(withoutQueryPrefix);
queryBuilder.defaultOperator(Operator.AND);
getStandardFields(entitySpec).forEach(fieldBoost -> queryBuilder.field(fieldBoost.getFirst(), fieldBoost.getSecond()));
finalQuery.should(queryBuilder);
@ -89,14 +94,14 @@ public class SearchQueryBuilder {
String fieldName = fieldSpec.getSearchableAnnotation().getFieldName();
double boostScore = fieldSpec.getSearchableAnnotation().getBoostScore();
fields.add(Pair.of(fieldName, (float) (boostScore)));
fields.add(Pair.of(fieldName, (float) boostScore));
FieldType fieldType = fieldSpec.getSearchableAnnotation().getFieldType();
if (TYPES_WITH_DELIMITED_SUBFIELD.contains(fieldType)) {
fields.add(Pair.of(fieldName + ".delimited", (float) (boostScore * 0.4)));
fields.add(Pair.of(fieldName + ".delimited", ((float) boostScore) * 0.4f));
}
if (FieldType.URN_PARTIAL.equals(fieldType)) {
fields.add(Pair.of(fieldName + ".delimited", (float) (boostScore * 0.4)));
fields.add(Pair.of(fieldName + ".delimited", ((float) boostScore) * 0.4f));
}
}
@ -104,26 +109,31 @@ public class SearchQueryBuilder {
}
private static Optional<QueryBuilder> getPrefixQuery(@Nonnull EntitySpec entitySpec, String query) {
BoolQueryBuilder finalQuery = QueryBuilders.boolQuery();
BoolQueryBuilder finalQuery = QueryBuilders.boolQuery();
if (query.contains("\"")) {
finalQuery.should(QueryBuilders.termQuery("urn", query.replaceAll("\"", ""))
.boost(Float.parseFloat((String) PRIMARY_URN_SEARCH_PROPERTIES.get("boostScore")) * EXACT_MATCH_BOOST_FACTOR)
.queryName("urn"));
}
entitySpec.getSearchableFieldSpecs().stream()
.map(SearchableFieldSpec::getSearchableAnnotation)
.filter(SearchableAnnotation::isQueryByDefault)
.filter(SearchableAnnotation::isEnableAutocomplete)
.filter(SearchableAnnotation::isEnableAutocomplete) // Proxy for identifying likely exact match fields
.filter(e -> TYPES_WITH_DELIMITED_SUBFIELD.contains(e.getFieldType()))
.forEach(fieldSpec -> finalQuery.should(
QueryBuilders.matchPhrasePrefixQuery(fieldSpec.getFieldName() + ".delimited", query)
.boost((float) fieldSpec.getBoostScore())));
.forEach(fieldSpec -> {
finalQuery.should(QueryBuilders.matchPhrasePrefixQuery(fieldSpec.getFieldName() + ".delimited", query)
.boost((float) fieldSpec.getBoostScore())
.queryName(fieldSpec.getFieldName())); // less than exact
if (query.contains("\"")) {
finalQuery.should(QueryBuilders.termQuery(fieldSpec.getFieldName() + ".keyword", query.replaceAll("\"", ""))
.boost(Float.parseFloat((String) PRIMARY_URN_SEARCH_PROPERTIES.get("boostScore")) * EXACT_MATCH_BOOST_FACTOR)
.queryName(fieldSpec.getFieldName() + ".keyword"));
}
});
return finalQuery.should().size() > 0 ? Optional.of(finalQuery) : Optional.empty();
}
private static QueryBuilder getPhraseQuery(@Nonnull EntitySpec entitySpec, String query) {
BoolQueryBuilder finalQuery = QueryBuilders.boolQuery();
getStandardFields(entitySpec).stream()
.filter(p -> p.getFirst().endsWith(".delimited"))
.forEach(p -> finalQuery.should(QueryBuilders.matchPhraseQuery(p.getFirst(), query).boost(p.getSecond())));
return finalQuery;
}
private static FunctionScoreQueryBuilder.FilterFunctionBuilder[] buildScoreFunctions(@Nonnull EntitySpec entitySpec) {
List<FunctionScoreQueryBuilder.FilterFunctionBuilder> finalScoreFunctions = new ArrayList<>();
// Add a default weight of 1.0 to make sure the score function is larger than 1

View File

@ -76,6 +76,7 @@ public class SearchRequestHandler {
private final EntitySpec _entitySpec;
private final Set<String> _facetFields;
private final Set<String> _defaultQueryFieldNames;
private final HighlightBuilder _highlights;
private final Map<String, String> _filtersToDisplayName;
private final Configs _configs;
@ -95,6 +96,7 @@ public class SearchRequestHandler {
_entitySpec = entitySpec;
_facetFields = getFacetFields();
_defaultQueryFieldNames = getDefaultQueryFieldNames();
_highlights = getHighlights();
_filtersToDisplayName = _entitySpec.getSearchableFieldSpecs()
.stream()
.filter(spec -> spec.getSearchableAnnotation().isAddToFilters())
@ -175,7 +177,7 @@ public class SearchRequestHandler {
.must(getQuery(input, fulltext))
.must(filterQuery));
getAggregations().forEach(searchSourceBuilder::aggregation);
searchSourceBuilder.highlighter(getHighlights());
searchSourceBuilder.highlighter(_highlights);
ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion);
searchRequest.source(searchSourceBuilder);
log.debug("Search request is: " + searchRequest.toString());
@ -247,14 +249,16 @@ public class SearchRequestHandler {
private HighlightBuilder getHighlights() {
HighlightBuilder highlightBuilder = new HighlightBuilder();
// Don't set tags to get the original field value
highlightBuilder.preTags("");
highlightBuilder.postTags("");
// Check for each field name and any subfields
_defaultQueryFieldNames.forEach(fieldName -> highlightBuilder
.field(fieldName)
.field(fieldName + ".*"));
highlightBuilder.field("urn.delimited");
_defaultQueryFieldNames.stream()
.flatMap(fieldName -> Stream.of(fieldName, fieldName + ".*")).distinct()
.forEach(highlightBuilder::field);
return highlightBuilder;
}
@ -272,7 +276,8 @@ public class SearchRequestHandler {
}
@Nonnull
private List<MatchedField> extractMatchedFields(@Nonnull Map<String, HighlightField> highlightedFields) {
private List<MatchedField> extractMatchedFields(@Nonnull SearchHit hit) {
Map<String, HighlightField> highlightedFields = hit.getHighlightFields();
// Keep track of unique field values that matched for a given field name
Map<String, Set<String>> highlightedFieldNamesAndValues = new HashMap<>();
for (Map.Entry<String, HighlightField> entry : highlightedFields.entrySet()) {
@ -288,6 +293,12 @@ public class SearchRequestHandler {
highlightedFieldNamesAndValues.get(fieldName.get()).add(fieldValue.string());
}
}
// fallback matched query, non-analyzed field
for (String queryName : hit.getMatchedQueries()) {
if (!highlightedFieldNamesAndValues.containsKey(queryName)) {
highlightedFieldNamesAndValues.put(queryName, Set.of(""));
}
}
return highlightedFieldNamesAndValues.entrySet()
.stream()
.flatMap(
@ -306,7 +317,7 @@ public class SearchRequestHandler {
private SearchEntity getResult(@Nonnull SearchHit hit) {
return new SearchEntity().setEntity(getUrnFromSearchHit(hit))
.setMatchedFields(new MatchedFieldArray(extractMatchedFields(hit.getHighlightFields())))
.setMatchedFields(new MatchedFieldArray(extractMatchedFields(hit)))
.setScore(hit.getScore())
.setFeatures(new DoubleMap(extractFeatures(hit)));
}

View File

@ -5,6 +5,8 @@ import com.linkedin.common.urn.Urn;
import com.linkedin.datahub.graphql.generated.AutoCompleteResults;
import com.linkedin.datahub.graphql.types.chart.ChartType;
import com.linkedin.datahub.graphql.types.container.ContainerType;
import com.linkedin.datahub.graphql.types.corpgroup.CorpGroupType;
import com.linkedin.datahub.graphql.types.corpuser.CorpUserType;
import com.linkedin.datahub.graphql.types.dataset.DatasetType;
import com.linkedin.entity.client.EntityClient;
import com.linkedin.metadata.ESSampleDataFixture;
@ -35,6 +37,7 @@ import java.util.stream.Stream;
import static com.linkedin.metadata.ESTestUtils.autocomplete;
import static com.linkedin.metadata.ESTestUtils.search;
import static com.linkedin.metadata.ESTestUtils.searchStructured;
import static com.linkedin.metadata.search.elasticsearch.query.request.SearchQueryBuilder.STRUCTURED_QUERY_PREFIX;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertNotNull;
import static org.testng.Assert.assertSame;
@ -444,31 +447,68 @@ public class SampleDataFixtureTests extends AbstractTestNGSpringContextTests {
});
}
/**
 * Checks corp-group autocomplete for a series of growing prefixes of "Test Group "
 * (several of them ending in whitespace). Each prefix must yield exactly one entity.
 */
@Test
public void testGroupAutoComplete() {
  List.of("T", "Te", "Tes", "Test ", "Test G", "Test Gro", "Test Group ")
      .forEach(query -> {
        try {
          AutoCompleteResults result = autocomplete(new CorpGroupType(entityClient), query);
          // assertEquals (actual, expected, message) reports both values on failure,
          // unlike assertTrue on a pre-evaluated boolean comparison.
          assertEquals(result.getEntities().size(), 1,
              String.format("Expected 1 results for `%s` found %s", query, result.getEntities().size()));
        } catch (Exception e) {
          // The lambda cannot throw checked exceptions, so rethrow unchecked.
          throw new RuntimeException(e);
        }
      });
}
/**
 * Checks corp-user autocomplete for a series of growing prefixes of "Data Hub "
 * (several of them ending in whitespace). Each prefix must yield at least one entity.
 */
@Test
public void testUserAutoComplete() {
  final List<String> prefixes =
      List.of("D", "Da", "Dat", "Data ", "Data H", "Data Hu", "Data Hub", "Data Hub ");
  for (final String prefix : prefixes) {
    try {
      final AutoCompleteResults completions = autocomplete(new CorpUserType(entityClient, null), prefix);
      final int matchCount = completions.getEntities().size();
      assertTrue(matchCount >= 1,
          String.format("Expected at least 1 results for `%s` found %s", prefix, matchCount));
    } catch (Exception failure) {
      // Surface any failure from the autocomplete call as an unchecked exception.
      throw new RuntimeException(failure);
    }
  }
}
@Test
public void testSmokeTestQueries() {
Map<String, Integer> expectedMinimums = Map.of(
Map<String, Integer> expectedFulltextMinimums = Map.of(
"sample", 3,
"covid", 2,
"\"raw_orders\"", 1
"\"raw_orders\"", 1,
STRUCTURED_QUERY_PREFIX + "sample", 1,
STRUCTURED_QUERY_PREFIX + "covid", 0,
STRUCTURED_QUERY_PREFIX + "\"raw_orders\"", 1
);
Map<String, SearchResult> results = expectedMinimums.entrySet().stream()
Map<String, SearchResult> results = expectedFulltextMinimums.entrySet().stream()
.collect(Collectors.toMap(Map.Entry::getKey, entry -> search(searchService, entry.getKey())));
results.forEach((key, value) -> {
Integer actualCount = value.getEntities().size();
Integer expectedCount = expectedMinimums.get(key);
Integer expectedCount = expectedFulltextMinimums.get(key);
assertSame(actualCount, expectedCount,
String.format("Search term `%s` has %s fulltext results, expected %s results.", key, actualCount,
expectedCount));
});
results = expectedMinimums.entrySet().stream()
Map<String, Integer> expectedStructuredMinimums = Map.of(
"sample", 3,
"covid", 2,
"\"raw_orders\"", 1
);
results = expectedStructuredMinimums.entrySet().stream()
.collect(Collectors.toMap(Map.Entry::getKey, entry -> searchStructured(searchService, entry.getKey())));
results.forEach((key, value) -> {
Integer actualCount = value.getEntities().size();
Integer expectedCount = expectedMinimums.get(key);
Integer expectedCount = expectedStructuredMinimums.get(key);
assertSame(actualCount, expectedCount,
String.format("Search term `%s` has %s structured results, expected %s results.", key, actualCount,
expectedCount));
@ -624,6 +664,241 @@ public class SampleDataFixtureTests extends AbstractTestNGSpringContextTests {
});
}
/**
 * Structured query on the {@code name} field: "name: customers" must return exactly
 * one entity, and every returned entity must report matched fields.
 */
@Test
public void testStructQueryFieldMatch() {
  final String structuredQuery = STRUCTURED_QUERY_PREFIX + "name: customers";
  final SearchResult found = search(searchService, structuredQuery);

  final boolean hasHits = found.hasEntities() && !found.getEntities().isEmpty();
  assertTrue(hasHits, String.format("%s - Expected search results", structuredQuery));

  final boolean allHitsHaveMatchedFields =
      found.getEntities().stream().allMatch(hit -> !hit.getMatchedFields().isEmpty());
  assertTrue(allHitsHaveMatchedFields,
      String.format("%s - Expected search results to include matched fields", structuredQuery));

  assertEquals(found.getEntities().size(), 1);
}
/**
 * Structured wildcard query on the {@code name} field: "name: customers*" must return
 * exactly two entities, each reporting matched fields.
 */
@Test
public void testStructQueryFieldPrefixMatch() {
  final String structuredQuery = STRUCTURED_QUERY_PREFIX + "name: customers*";
  final SearchResult found = search(searchService, structuredQuery);

  final boolean hasHits = found.hasEntities() && !found.getEntities().isEmpty();
  assertTrue(hasHits, String.format("%s - Expected search results", structuredQuery));

  final boolean allHitsHaveMatchedFields =
      found.getEntities().stream().allMatch(hit -> !hit.getMatchedFields().isEmpty());
  assertTrue(allHitsHaveMatchedFields,
      String.format("%s - Expected search results to include matched fields", structuredQuery));

  assertEquals(found.getEntities().size(), 2);
}
/**
 * Structured query on custom properties by key only: "customProperties: node_type=*"
 * must return exactly nine entities, each reporting matched fields.
 */
@Test
public void testStructQueryCustomPropertiesKeyPrefix() {
  final String structuredQuery = STRUCTURED_QUERY_PREFIX + "customProperties: node_type=*";
  final SearchResult found = search(searchService, structuredQuery);

  final boolean hasHits = found.hasEntities() && !found.getEntities().isEmpty();
  assertTrue(hasHits, String.format("%s - Expected search results", structuredQuery));

  final boolean allHitsHaveMatchedFields =
      found.getEntities().stream().allMatch(hit -> !hit.getMatchedFields().isEmpty());
  assertTrue(allHitsHaveMatchedFields,
      String.format("%s - Expected search results to include matched fields", structuredQuery));

  assertEquals(found.getEntities().size(), 9);
}
/**
 * Structured query on a full custom-property pair: "customProperties: node_type=model"
 * must return exactly five entities, each reporting matched fields.
 */
@Test
public void testStructQueryCustomPropertiesMatch() {
  final String structuredQuery = STRUCTURED_QUERY_PREFIX + "customProperties: node_type=model";
  final SearchResult found = search(searchService, structuredQuery);

  final boolean hasHits = found.hasEntities() && !found.getEntities().isEmpty();
  assertTrue(hasHits, String.format("%s - Expected search results", structuredQuery));

  final boolean allHitsHaveMatchedFields =
      found.getEntities().stream().allMatch(hit -> !hit.getMatchedFields().isEmpty());
  assertTrue(allHitsHaveMatchedFields,
      String.format("%s - Expected search results to include matched fields", structuredQuery));

  assertEquals(found.getEntities().size(), 5);
}
/**
 * Structured query on the {@code fieldPaths} field: "fieldPaths: customer_id" must
 * return exactly three entities, each reporting matched fields.
 */
@Test
public void testStructQueryFieldPaths() {
  final String structuredQuery = STRUCTURED_QUERY_PREFIX + "fieldPaths: customer_id";
  final SearchResult found = search(searchService, structuredQuery);

  final boolean hasHits = found.hasEntities() && !found.getEntities().isEmpty();
  assertTrue(hasHits, String.format("%s - Expected search results", structuredQuery));

  final boolean allHitsHaveMatchedFields =
      found.getEntities().stream().allMatch(hit -> !hit.getMatchedFields().isEmpty());
  assertTrue(allHitsHaveMatchedFields,
      String.format("%s - Expected search results to include matched fields", structuredQuery));

  assertEquals(found.getEntities().size(), 3);
}
/**
 * Structured boolean query across two tag fields. The OR query is run first and must
 * return two entities; each operand is then run on its own and must return one entity.
 * Every query must produce results whose entities all report matched fields.
 */
@Test
public void testStructQueryBoolean() {
  // Queries and their expected result counts, in execution order.
  final String[] structuredQueries = {
      STRUCTURED_QUERY_PREFIX + "editedFieldTags:urn\\:li\\:tag\\:Legacy OR tags:urn\\:li\\:tag\\:testTag",
      STRUCTURED_QUERY_PREFIX + "editedFieldTags:urn\\:li\\:tag\\:Legacy",
      STRUCTURED_QUERY_PREFIX + "tags:urn\\:li\\:tag\\:testTag"
  };
  final int[] expectedCounts = {2, 1, 1};

  for (int i = 0; i < structuredQueries.length; i++) {
    final String structuredQuery = structuredQueries[i];
    final SearchResult found = search(searchService, structuredQuery);

    final boolean hasHits = found.hasEntities() && !found.getEntities().isEmpty();
    assertTrue(hasHits, String.format("%s - Expected search results", structuredQuery));

    final boolean allHitsHaveMatchedFields =
        found.getEntities().stream().allMatch(hit -> !hit.getMatchedFields().isEmpty());
    assertTrue(allHitsHaveMatchedFields,
        String.format("%s - Expected search results to include matched fields", structuredQuery));

    assertEquals(found.getEntities().size(), expectedCounts[i]);
  }
}
/**
 * Structured wildcard query on {@code browsePaths}: "browsePaths:*&#47;dbt&#47;*" must
 * return exactly nine entities, each reporting matched fields.
 */
@Test
public void testStructQueryBrowsePaths() {
  final String structuredQuery = STRUCTURED_QUERY_PREFIX + "browsePaths:*/dbt/*";
  final SearchResult found = search(searchService, structuredQuery);

  final boolean hasHits = found.hasEntities() && !found.getEntities().isEmpty();
  assertTrue(hasHits, String.format("%s - Expected search results", structuredQuery));

  final boolean allHitsHaveMatchedFields =
      found.getEntities().stream().allMatch(hit -> !hit.getMatchedFields().isEmpty());
  assertTrue(allHitsHaveMatchedFields,
      String.format("%s - Expected search results to include matched fields", structuredQuery));

  assertEquals(found.getEntities().size(), 9);
}
/**
 * Fulltext OR operator ({@code |}). The combined query must return nine entities;
 * the two operands are then run individually and must return one and eight entities.
 * Every query must produce results whose entities all report matched fields.
 */
@Test
public void testOr() {
  // Queries and their expected result counts, in execution order.
  final String[] queries = {"stg_customers | logging_events", "stg_customers", "logging_events"};
  final int[] expectedCounts = {9, 1, 8};

  for (int i = 0; i < queries.length; i++) {
    final String term = queries[i];
    final SearchResult found = search(searchService, term);

    final boolean hasHits = found.hasEntities() && !found.getEntities().isEmpty();
    assertTrue(hasHits, String.format("%s - Expected search results", term));

    final boolean allHitsHaveMatchedFields =
        found.getEntities().stream().allMatch(hit -> !hit.getMatchedFields().isEmpty());
    assertTrue(allHitsHaveMatchedFields,
        String.format("%s - Expected search results to include matched fields", term));

    assertEquals(found.getEntities().size(), expectedCounts[i]);
  }
}
/**
 * Fulltext negation operator ({@code -}). "logging_events -bckp" must return seven
 * entities, while the un-negated "logging_events" must return eight.
 * Every query must produce results whose entities all report matched fields.
 */
@Test
public void testNegate() {
  // Queries and their expected result counts, in execution order.
  final String[] queries = {"logging_events -bckp", "logging_events"};
  final int[] expectedCounts = {7, 8};

  for (int i = 0; i < queries.length; i++) {
    final String term = queries[i];
    final SearchResult found = search(searchService, term);

    final boolean hasHits = found.hasEntities() && !found.getEntities().isEmpty();
    assertTrue(hasHits, String.format("%s - Expected search results", term));

    final boolean allHitsHaveMatchedFields =
        found.getEntities().stream().allMatch(hit -> !hit.getMatchedFields().isEmpty());
    assertTrue(allHitsHaveMatchedFields,
        String.format("%s - Expected search results to include matched fields", term));

    assertEquals(found.getEntities().size(), expectedCounts[i]);
  }
}
/**
 * Fulltext prefix wildcard. The full term "bigquery" and the wildcard "big*" must each
 * return eight entities, and every returned entity must report matched fields.
 */
@Test
public void testPrefix() {
  // Queries and their expected result counts, in execution order.
  final String[] queries = {"bigquery", "big*"};
  final int[] expectedCounts = {8, 8};

  for (int i = 0; i < queries.length; i++) {
    final String term = queries[i];
    final SearchResult found = search(searchService, term);

    final boolean hasHits = found.hasEntities() && !found.getEntities().isEmpty();
    assertTrue(hasHits, String.format("%s - Expected search results", term));

    final boolean allHitsHaveMatchedFields =
        found.getEntities().stream().allMatch(hit -> !hit.getMatchedFields().isEmpty());
    assertTrue(allHitsHaveMatchedFields,
        String.format("%s - Expected search results to include matched fields", term));

    assertEquals(found.getEntities().size(), expectedCounts[i]);
  }
}
/**
 * Fulltext precedence with parentheses. "dbt | (bigquery + covid19)" must return eleven
 * entities. Its sub-expressions are then run on their own with these expected counts:
 * "dbt" 9, "bigquery + covid19" 2, "bigquery" 8, "covid19" 2.
 * Every query must produce results whose entities all report matched fields.
 */
@Test
public void testParens() {
  // Queries and their expected result counts, in execution order.
  final String[] queries = {
      "dbt | (bigquery + covid19)",
      "dbt",
      "bigquery + covid19",
      "bigquery",
      "covid19"
  };
  final int[] expectedCounts = {11, 9, 2, 8, 2};

  for (int i = 0; i < queries.length; i++) {
    final String term = queries[i];
    final SearchResult found = search(searchService, term);

    final boolean hasHits = found.hasEntities() && !found.getEntities().isEmpty();
    assertTrue(hasHits, String.format("%s - Expected search results", term));

    final boolean allHitsHaveMatchedFields =
        found.getEntities().stream().allMatch(hit -> !hit.getMatchedFields().isEmpty());
    assertTrue(allHitsHaveMatchedFields,
        String.format("%s - Expected search results to include matched fields", term));

    assertEquals(found.getEntities().size(), expectedCounts[i]);
  }
}
/**
 * Quoted (exact) query ranking. Searching for "customers" in double quotes must return
 * two entities, each reporting matched fields, and the top-ranked entity must be the
 * dbt {@code cypress_project.jaffle_shop.customers} dataset.
 */
@Test
public void testPrefixVsExact() {
  final String quotedTerm = "\"customers\"";
  final SearchResult found = search(searchService, quotedTerm);

  final boolean hasHits = found.hasEntities() && !found.getEntities().isEmpty();
  assertTrue(hasHits, String.format("%s - Expected search results", quotedTerm));

  final boolean allHitsHaveMatchedFields =
      found.getEntities().stream().allMatch(hit -> !hit.getMatchedFields().isEmpty());
  assertTrue(allHitsHaveMatchedFields,
      String.format("%s - Expected search results to include matched fields", quotedTerm));

  assertEquals(found.getEntities().size(), 2);

  // The first hit is the highest-ranked result.
  final String topUrn = found.getEntities().get(0).getEntity().toString();
  assertEquals(topUrn,
      "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers,PROD)",
      "Expected exact match");
}
/**
 * Runs the given analyze request through the search client's indices API with default
 * request options and streams the tokens from the response.
 *
 * @param request the analyze request to execute
 * @return a stream over the tokens in the analyze response
 * @throws IOException if the analyze call fails
 */
private Stream<AnalyzeResponse.AnalyzeToken> getTokens(AnalyzeRequest request) throws IOException {
  final AnalyzeResponse analysis = _searchClient.indices().analyze(request, RequestOptions.DEFAULT);
  return analysis.getTokens().stream();
}

View File

@ -11,6 +11,7 @@ import org.elasticsearch.index.query.MatchPhrasePrefixQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryStringQueryBuilder;
import org.elasticsearch.index.query.SimpleQueryStringBuilder;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.index.query.functionscore.FunctionScoreQueryBuilder;
import org.testng.annotations.Test;
@ -41,12 +42,18 @@ public class SearchQueryBuilderTest {
BoolQueryBuilder boolPrefixQuery = (BoolQueryBuilder) shouldQueries.get(1);
assertTrue(boolPrefixQuery.should().size() > 0);
List<Pair<String, Float>> fieldWeights = boolPrefixQuery.should().stream().map(prefixQuery -> {
MatchPhrasePrefixQueryBuilder builder = (MatchPhrasePrefixQueryBuilder) prefixQuery;
return Pair.of(builder.fieldName(), builder.boost());
List<Pair<String, Float>> prefixFieldWeights = boolPrefixQuery.should().stream().map(prefixQuery -> {
if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) {
MatchPhrasePrefixQueryBuilder builder = (MatchPhrasePrefixQueryBuilder) prefixQuery;
return Pair.of(builder.fieldName(), builder.boost());
} else {
// exact
TermQueryBuilder builder = (TermQueryBuilder) prefixQuery;
return Pair.of(builder.fieldName(), builder.boost());
}
}).collect(Collectors.toList());
assertEquals(fieldWeights, List.of(
assertEquals(prefixFieldWeights, List.of(
Pair.of("keyPart1.delimited", 10.0f)
));

View File

@ -51,7 +51,7 @@ public class SearchRequestHandlerTest {
HighlightBuilder highlightBuilder = sourceBuilder.highlighter();
List<String> fields =
highlightBuilder.fields().stream().map(HighlightBuilder.Field::name).collect(Collectors.toList());
assertEquals(fields.size(), 27);
assertEquals(fields.size(), 26);
List<String> highlightableFields =
ImmutableList.of("keyPart1", "textArrayField", "textFieldOverride", "foreignKey", "nestedForeignKey",
"nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField", "keyPart2",

View File

@ -20,6 +20,7 @@ record CorpGroupInfo {
@Searchable = {
"fieldType": "TEXT_PARTIAL"
"queryByDefault": true,
"enableAutocomplete": true,
"boostScore": 10.0
}
displayName: optional string

View File

@ -28,6 +28,7 @@ record CorpUserInfo includes CustomProperties {
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"queryByDefault": true,
"enableAutocomplete": true,
"boostScore": 10.0
}
displayName: optional string
@ -90,6 +91,7 @@ record CorpUserInfo includes CustomProperties {
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"queryByDefault": true,
"enableAutocomplete": true,
"boostScore": 10.0
}
fullName: optional string

View File

@ -11,7 +11,10 @@ record CorpGroupKey {
* The URL-encoded name of the AD/LDAP group. Serves as a globally unique identifier within DataHub.
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL"
"fieldType": "TEXT_PARTIAL",
"queryByDefault": true,
"enableAutocomplete": true,
"boostScore": 10.0
}
name: string
}