feat(config): add configurable search filter min length (#13499)

This commit is contained in:
RyanHolstien 2025-05-13 12:37:59 -05:00 committed by GitHub
parent bc860181d8
commit dd377b33dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 36 additions and 11 deletions

View File

@ -44,6 +44,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
- SYNC_PRIMARY: Synchronously updates the primary storage (SQL) but asynchronously updates search storage (Elasticsearch). Provides a balance between consistency and performance. Suitable for updates that need to be immediately reflected in direct entity retrievals but where search index consistency can be slightly delayed.
- ASYNC: Queues the metadata change for asynchronous processing and returns immediately. The client continues execution without waiting for the change to be fully processed. Best for high-throughput scenarios where eventual consistency is acceptable.
- ASYNC_WAIT: Queues the metadata change asynchronously but blocks until confirmation that the write has been fully persisted. More efficient than fully synchronous operations due to backend parallelization and batching while still providing strong consistency guarantees. Useful when you need confirmation of successful persistence without sacrificing performance.
- #13499 - Added ELASTICSEARCH_MIN_SEARCH_FILTER_LENGTH configuration for ElasticSearch index config. Changing this value from its default can have a significant impact on search performance and will trigger reindexing, which can cause large delays in updates. Most users will not want to modify this.
## 1.0.0

View File

@ -2,6 +2,7 @@ package com.linkedin.metadata.search.elasticsearch.indexbuilder;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.linkedin.metadata.config.search.IndexConfiguration;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
@ -160,9 +161,11 @@ public class SettingsBuilder {
ImmutableList.of(ASCII_FOLDING, LOWERCASE, TRIM, REMOVE_QUOTES);
public final Map<String, Object> settings;
private final IndexConfiguration indexConfiguration;
public SettingsBuilder(String mainTokenizer) {
public SettingsBuilder(String mainTokenizer, IndexConfiguration indexConfiguration) {
try {
this.indexConfiguration = indexConfiguration;
settings = buildSettings(mainTokenizer);
} catch (IOException e) {
throw new RuntimeException(e);
@ -173,7 +176,7 @@ public class SettingsBuilder {
return settings;
}
private static Map<String, Object> buildSettings(String mainTokenizer) throws IOException {
private Map<String, Object> buildSettings(String mainTokenizer) throws IOException {
ImmutableMap.Builder<String, Object> settings = ImmutableMap.builder();
settings.put(MAX_NGRAM_DIFF, 17);
settings.put(
@ -187,7 +190,7 @@ public class SettingsBuilder {
return settings.build();
}
private static Map<String, Object> buildFilters() throws IOException {
private Map<String, Object> buildFilters() throws IOException {
PathMatchingResourcePatternResolver resourceResolver =
new PathMatchingResourcePatternResolver();
@ -225,7 +228,10 @@ public class SettingsBuilder {
filters.put(
MIN_LENGTH,
ImmutableMap.<String, Object>builder().put(TYPE, "length").put("min", "3").build());
ImmutableMap.<String, Object>builder()
.put(TYPE, "length")
.put("min", this.indexConfiguration.getMinSearchFilterLength())
.build());
Resource stemOverride =
resourceResolver.getResource("classpath:elasticsearch/stem_override.txt");

View File

@ -34,6 +34,7 @@ import com.linkedin.metadata.config.cache.CacheConfiguration;
import com.linkedin.metadata.config.cache.EntityDocCountCacheConfiguration;
import com.linkedin.metadata.config.cache.SearchCacheConfiguration;
import com.linkedin.metadata.config.cache.SearchLineageCacheConfiguration;
import com.linkedin.metadata.config.search.IndexConfiguration;
import com.linkedin.metadata.config.search.SearchConfiguration;
import com.linkedin.metadata.graph.EntityLineageResult;
import com.linkedin.metadata.graph.GraphService;
@ -129,7 +130,9 @@ public abstract class LineageServiceTestBase extends AbstractTestNGSpringContext
.hashIdAlgo("MD5")
.build()))
.asSession(RequestContext.TEST, Authorizer.EMPTY, TestOperationContexts.TEST_USER_AUTH);
settingsBuilder = new SettingsBuilder(null);
IndexConfiguration indexConfiguration = new IndexConfiguration();
indexConfiguration.setMinSearchFilterLength(3);
settingsBuilder = new SettingsBuilder(null, indexConfiguration);
elasticSearchService = buildEntitySearchService();
elasticSearchService.reindexAll(Collections.emptySet());
cacheManager = new ConcurrentMapCacheManager();

View File

@ -13,6 +13,7 @@ import com.google.common.collect.ImmutableList;
import com.linkedin.common.urn.TestEntityUrn;
import com.linkedin.common.urn.Urn;
import com.linkedin.metadata.config.cache.EntityDocCountCacheConfiguration;
import com.linkedin.metadata.config.search.IndexConfiguration;
import com.linkedin.metadata.config.search.SearchConfiguration;
import com.linkedin.metadata.models.registry.SnapshotEntityRegistry;
import com.linkedin.metadata.query.filter.Condition;
@ -83,7 +84,9 @@ public abstract class SearchServiceTestBase extends AbstractTestNGSpringContextT
.build()))
.asSession(RequestContext.TEST, Authorizer.EMPTY, TestOperationContexts.TEST_USER_AUTH);
settingsBuilder = new SettingsBuilder(null);
IndexConfiguration indexConfiguration = new IndexConfiguration();
indexConfiguration.setMinSearchFilterLength(3);
settingsBuilder = new SettingsBuilder(null, indexConfiguration);
elasticSearchService = buildEntitySearchService();
elasticSearchService.reindexAll(Collections.emptySet());
cacheManager = new ConcurrentMapCacheManager();

View File

@ -12,6 +12,7 @@ import com.linkedin.common.urn.TestEntityUrn;
import com.linkedin.common.urn.Urn;
import com.linkedin.metadata.browse.BrowseResult;
import com.linkedin.metadata.browse.BrowseResultV2;
import com.linkedin.metadata.config.search.IndexConfiguration;
import com.linkedin.metadata.config.search.SearchConfiguration;
import com.linkedin.metadata.models.registry.SnapshotEntityRegistry;
import com.linkedin.metadata.search.elasticsearch.ElasticSearchService;
@ -68,7 +69,9 @@ public abstract class TestEntityTestBase extends AbstractTestNGSpringContextTest
.prefix("es_service_test")
.hashIdAlgo("MD5")
.build()));
settingsBuilder = new SettingsBuilder(null);
IndexConfiguration indexConfiguration = new IndexConfiguration();
indexConfiguration.setMinSearchFilterLength(3);
settingsBuilder = new SettingsBuilder(null, indexConfiguration);
elasticSearchService = buildService();
elasticSearchService.reindexAll(Collections.emptySet());
}

View File

@ -14,6 +14,7 @@ import com.linkedin.metadata.client.JavaEntityClient;
import com.linkedin.metadata.config.PreProcessHooks;
import com.linkedin.metadata.config.cache.EntityDocCountCacheConfiguration;
import com.linkedin.metadata.config.search.ElasticSearchConfiguration;
import com.linkedin.metadata.config.search.IndexConfiguration;
import com.linkedin.metadata.config.search.SearchConfiguration;
import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration;
import com.linkedin.metadata.entity.AspectDao;
@ -166,7 +167,9 @@ public class SampleDataFixtureConfiguration {
false,
new ElasticSearchConfiguration(),
gitVersion);
SettingsBuilder settingsBuilder = new SettingsBuilder(null);
IndexConfiguration indexConfiguration = new IndexConfiguration();
indexConfiguration.setMinSearchFilterLength(3);
SettingsBuilder settingsBuilder = new SettingsBuilder(null, indexConfiguration);
return new EntityIndexBuilders(
indexBuilder,
opContext.getEntityRegistry(),

View File

@ -14,6 +14,7 @@ import com.linkedin.metadata.config.cache.EntityDocCountCacheConfiguration;
import com.linkedin.metadata.config.cache.SearchCacheConfiguration;
import com.linkedin.metadata.config.cache.SearchLineageCacheConfiguration;
import com.linkedin.metadata.config.search.ElasticSearchConfiguration;
import com.linkedin.metadata.config.search.IndexConfiguration;
import com.linkedin.metadata.config.search.SearchConfiguration;
import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration;
import com.linkedin.metadata.entity.EntityServiceImpl;
@ -126,7 +127,9 @@ public class SearchLineageFixtureConfiguration {
false,
new ElasticSearchConfiguration(),
gitVersion);
SettingsBuilder settingsBuilder = new SettingsBuilder(null);
IndexConfiguration indexConfiguration = new IndexConfiguration();
indexConfiguration.setMinSearchFilterLength(3);
SettingsBuilder settingsBuilder = new SettingsBuilder(null, indexConfiguration);
return new EntityIndexBuilders(
indexBuilder,
opContext.getEntityRegistry(),

View File

@ -6,4 +6,5 @@ import lombok.Data;
public class IndexConfiguration {
// Index-level search configuration. SettingsBuilderFactory obtains an instance through
// configProvider.getElasticSearch().getIndex() and hands it to SettingsBuilder.

/** Index prefix. Presumably prepended to index names; TODO confirm against the consumers of this field. */
private String prefix;
/** Document-id related settings; the individual options are declared in DocIdsConfiguration. */
private DocIdsConfiguration docIds;
/**
 * Value SettingsBuilder writes as "min" on the "length"-type token filter registered under
 * MIN_LENGTH. The application YAML binds it to ELASTICSEARCH_MIN_SEARCH_FILTER_LENGTH with a
 * default of 3.
 *
 * <p>NOTE(review): this is a boxed Integer, so it is null until set. SettingsBuilder puts it into
 * an ImmutableMap builder, which is expected to reject null values; confirm every construction
 * path (including tests that instantiate this class directly) sets it.
 */
private Integer minSearchFilterLength;
}

View File

@ -243,6 +243,7 @@ elasticsearch:
enableSettingsReindex: ${ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX:false}
maxReindexHours: ${ELASTICSEARCH_INDEX_BUILDER_MAX_REINDEX_HOURS:0} # <= 0 - no timeout
settingsOverrides: ${ELASTICSEARCH_INDEX_BUILDER_SETTINGS_OVERRIDES:#{null}}
minSearchFilterLength: ${ELASTICSEARCH_MIN_SEARCH_FILTER_LENGTH:3}
entitySettingsOverrides: ${ELASTICSEARCH_INDEX_BUILDER_ENTITY_SETTINGS_OVERRIDES:#{null}}
docIds:
schemaField:

View File

@ -1,5 +1,6 @@
package com.linkedin.gms.factory.search;
import com.linkedin.gms.factory.config.ConfigurationProvider;
import com.linkedin.gms.factory.entityregistry.EntityRegistryFactory;
import com.linkedin.metadata.models.registry.EntityRegistry;
import com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder;
@ -21,7 +22,7 @@ public class SettingsBuilderFactory {
private String mainTokenizer;
@Bean("settingsBuilder")
protected SettingsBuilder getInstance() {
return new SettingsBuilder(mainTokenizer);
protected SettingsBuilder getInstance(ConfigurationProvider configProvider) {
return new SettingsBuilder(mainTokenizer, configProvider.getElasticSearch().getIndex());
}
}