From b17d7764e433a2adb309386893323e5c413154f1 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Fri, 20 Sep 2024 14:07:29 -0500 Subject: [PATCH] fix(search): restore prefix phrase match on quoted search (#11444) --- docker/profiles/docker-compose.gms.yml | 2 +- docs/how/search.md | 4 +- .../fixtures/SampleDataFixtureSetupTest.java | 47 +++++++++++ .../fixtures/SampleDataFixtureTestBase.java | 47 ++++++++--- .../resources/search_config_fixture_test.yml | 84 +++++++++++++++---- .../custom/CustomSearchConfiguration.java | 2 + .../src/main/resources/search_config.yaml | 2 +- 7 files changed, 153 insertions(+), 35 deletions(-) create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureSetupTest.java diff --git a/docker/profiles/docker-compose.gms.yml b/docker/profiles/docker-compose.gms.yml index c9448fa34c..6e3e578050 100644 --- a/docker/profiles/docker-compose.gms.yml +++ b/docker/profiles/docker-compose.gms.yml @@ -125,7 +125,7 @@ x-datahub-gms-service-dev: &datahub-gms-service-dev - ${DATAHUB_LOCAL_GMS_ENV:-empty2.env} environment: &datahub-gms-dev-env <<: [*datahub-dev-telemetry-env, *datahub-gms-env] - ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-/etc/datahub/search/search_config.yaml} + ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-search_config.yaml} SKIP_ELASTICSEARCH_CHECK: false JAVA_TOOL_OPTIONS: '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5001' BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE: false diff --git a/docs/how/search.md b/docs/how/search.md index c809ab1efb..5c1ba266ee 100644 --- a/docs/how/search.md +++ b/docs/how/search.md @@ -85,8 +85,8 @@ These examples are non exhaustive and using Datasets as a reference. If you want to: - Exact match on term or phrase - - ```"datahub_schema"``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%22datahub_schema%22) - - ```datahub_schema``` [Sample results](https://demo.datahubproject.io/search?page=1&query=datahub_schema) + - ```"pet profile"``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%22pet%20profile%22) + - ```pet profile``` [Sample results](https://demo.datahubproject.io/search?page=1&query=pet%20profile) - Enclosing one or more terms with double quotes will enforce exact matching on these terms, preventing further tokenization. - Exclude terms diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureSetupTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureSetupTest.java new file mode 100644 index 0000000000..b908933fcc --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureSetupTest.java @@ -0,0 +1,47 @@ +package com.linkedin.metadata.search.fixtures; + +import static org.testng.AssertJUnit.assertEquals; + +import com.fasterxml.jackson.dataformat.yaml.YAMLMapper; +import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.Map; +import org.springframework.core.io.ClassPathResource; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.Test; + +public class SampleDataFixtureSetupTest extends AbstractTestNGSpringContextTests { + private static final String DEFAULT_CONFIG = "search_config.yaml"; + private static final String TEST_FIXTURE_CONFIG = "search_config_fixture_test.yml"; + private static final YAMLMapper MAPPER = new YAMLMapper(); + + /** + * Ensure default search configuration matches the test fixture configuration (allowing for some + * differences) + */ + @Test + public void testConfig() throws IOException { + final CustomSearchConfiguration defaultConfig; + final CustomSearchConfiguration fixtureConfig; + + try (InputStream stream = new ClassPathResource(DEFAULT_CONFIG).getInputStream()) { + defaultConfig = MAPPER.readValue(stream, CustomSearchConfiguration.class); + } + try (InputStream stream = new ClassPathResource(TEST_FIXTURE_CONFIG).getInputStream()) { + fixtureConfig = MAPPER.readValue(stream, CustomSearchConfiguration.class); + + // test specifics + ((List>) + fixtureConfig.getQueryConfigurations().get(1).getFunctionScore().get("functions")) + .remove(1); + + ((List>) + fixtureConfig.getQueryConfigurations().get(2).getFunctionScore().get("functions")) + .remove(1); + } + + assertEquals(fixtureConfig, defaultConfig); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java index 7e434bf933..6a48dc19b0 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java @@ -944,7 +944,7 @@ public abstract class SampleDataFixtureTestBase extends AbstractTestNGSpringCont "covid", 2, "\"raw_orders\"", - 6, + 1, STRUCTURED_QUERY_PREFIX + "sample", 3, STRUCTURED_QUERY_PREFIX + "\"sample\"", @@ -1327,24 +1327,24 @@ public abstract class SampleDataFixtureTestBase extends AbstractTestNGSpringCont totalResults += numResults; scrollId = result.getScrollId(); } while (scrollId != null); - // expect 8 total matching results - assertEquals(totalResults, 8); + // expect 2 total matching results + assertEquals(totalResults, 2); } @Test public void testSearchAcrossMultipleEntities() { - String query = "logging_events"; + String query = "logging events"; SearchResult result = search(getOperationContext(), getSearchService(), query); - assertEquals((int) result.getNumEntities(), 8); + assertEquals((int) result.getNumEntities(), 6); result = search( getOperationContext(), getSearchService(), List.of(DATASET_ENTITY_NAME, DATA_JOB_ENTITY_NAME), query); - assertEquals((int) result.getNumEntities(), 8); + assertEquals((int) result.getNumEntities(), 6); result = search(getOperationContext(), getSearchService(), List.of(DATASET_ENTITY_NAME), query); - assertEquals((int) result.getNumEntities(), 4); + assertEquals((int) result.getNumEntities(), 2); result = search(getOperationContext(), getSearchService(), List.of(DATA_JOB_ENTITY_NAME), query); assertEquals((int) result.getNumEntities(), 4); @@ -1706,7 +1706,7 @@ public abstract class SampleDataFixtureTestBase extends AbstractTestNGSpringCont assertTrue( result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), String.format("%s - Expected search results to include matched fields", query)); - assertEquals(result.getEntities().size(), 8); + assertEquals(result.getEntities().size(), 2); } @Test @@ -1729,7 +1729,7 @@ public abstract class SampleDataFixtureTestBase extends AbstractTestNGSpringCont assertTrue( result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), String.format("%s - Expected search results to include matched fields", query)); - assertEquals(result.getEntities().size(), 8); + assertEquals(result.getEntities().size(), 2); } @Test @@ -1755,6 +1755,27 @@ public abstract class SampleDataFixtureTestBase extends AbstractTestNGSpringCont assertEquals(result.getEntities().size(), 8); } + @Test + public void testQuotedPrefixDescriptionField() { + String query = "\"Constructs the fct_users_deleted\""; + SearchResult result = searchAcrossEntities(getOperationContext(), getSearchService(), query); + assertTrue( + result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertTrue( + result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), + String.format("%s - Expected search results to include matched fields", query)); + assertEquals(result.getEntities().size(), 4); + + assertTrue( + result.getEntities().stream() + .allMatch( + e -> + e.getMatchedFields().stream().anyMatch(m -> m.getName().equals("description"))), + "%s - Expected search results to match on description field based on prefix match"); + } + @Test public void testParens() { String query = "dbt | (bigquery + covid19)"; @@ -1878,7 +1899,7 @@ public abstract class SampleDataFixtureTestBase extends AbstractTestNGSpringCont result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), String.format("%s - Expected search results to include matched fields", query)); - assertEquals(result.getEntities().size(), 10); + assertEquals(result.getEntities().size(), 2); assertEquals( result.getEntities().get(0).getEntity().toString(), "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers,PROD)", @@ -1937,9 +1958,9 @@ public abstract class SampleDataFixtureTestBase extends AbstractTestNGSpringCont result.getEntities().stream().noneMatch(e -> e.getMatchedFields().isEmpty()), String.format("%s - Expected search results to include matched fields", query)); - assertTrue( - result.getEntities().size() > 2, - String.format("%s - Expected search results to have at least two results", query)); + assertFalse( + result.getEntities().isEmpty(), + String.format("%s - Expected search results to have at least 1 result.", query)); assertEquals( result.getEntities().get(0).getEntity().toString(), "urn:li:dataset:(urn:li:dataPlatform:testOnly," + "important_units" + ",PROD)", diff --git a/metadata-io/src/test/resources/search_config_fixture_test.yml b/metadata-io/src/test/resources/search_config_fixture_test.yml index 606ce6a2f6..08e713c6b1 100644 --- a/metadata-io/src/test/resources/search_config_fixture_test.yml +++ b/metadata-io/src/test/resources/search_config_fixture_test.yml @@ -1,6 +1,7 @@ # Use for testing with search fixtures queryConfigurations: - # Select * + # Select */explore all + # Attempt to rank active incidents at the top followed by enrichment factors - queryRegex: '[*]|' simpleQuery: false prefixMatchQuery: false @@ -8,44 +9,91 @@ queryConfigurations: functionScore: functions: - filter: - match_all: {} - weight: 1 + term: + hasActiveIncidents: + value: true + weight: 2.0 - filter: term: - materialized: + hasDescription: value: true - weight: 0.5 + weight: 1.25 + - filter: + term: + hasOwners: + value: true + weight: 1.25 + - filter: + term: + hasDomain: + value: true + weight: 1.1 + - filter: + term: + hasGlossaryTerms: + value: true + weight: 1.1 + - filter: + term: + hasTags: + value: true + weight: 1.1 + - filter: + term: + hasRowCount: + value: true + weight: 1.05 + - filter: + term: + hasColumnCount: + value: true + weight: 1.05 - filter: term: deprecated: value: true - weight: 0.5 - score_mode: avg + weight: 0.25 + score_mode: multiply + boost_mode: replace + + # Criteria for exact-match only + # Contains quotes, is a single term with `_`, `.`, or `-` (normally consider for tokenization) then use exact match query + - queryRegex: >- + ^["'].+["']$|^[a-zA-Z0-9]\S+[_.-]\S+[a-zA-Z0-9]$ + simpleQuery: false + prefixMatchQuery: true + exactMatchQuery: true + functionScore: + functions: + - filter: + term: + deprecated: + value: true + weight: 0.25 + - filter: + terms: + tags: + - urn:li:tag:pii + weight: 1.25 + score_mode: multiply boost_mode: multiply + # default - queryRegex: .* simpleQuery: true prefixMatchQuery: true exactMatchQuery: true functionScore: functions: - - filter: - match_all: {} - weight: 1 - - filter: - term: - materialized: - value: true - weight: 0.5 - filter: term: deprecated: value: true - weight: 0.5 + weight: 0.25 - filter: terms: tags: - urn:li:tag:pii weight: 1.25 - score_mode: avg - boost_mode: multiply \ No newline at end of file + score_mode: multiply + boost_mode: multiply diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/custom/CustomSearchConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/custom/CustomSearchConfiguration.java index d2a908050c..4ab144f2e7 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/custom/CustomSearchConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/custom/CustomSearchConfiguration.java @@ -7,10 +7,12 @@ import java.util.List; import lombok.Builder; import lombok.EqualsAndHashCode; import lombok.Getter; +import lombok.ToString; @Builder(toBuilder = true) @Getter @EqualsAndHashCode +@ToString @JsonDeserialize(builder = CustomSearchConfiguration.CustomSearchConfigurationBuilder.class) public class CustomSearchConfiguration { diff --git a/metadata-service/configuration/src/main/resources/search_config.yaml b/metadata-service/configuration/src/main/resources/search_config.yaml index 2ffe962d39..e93f8af8b1 100644 --- a/metadata-service/configuration/src/main/resources/search_config.yaml +++ b/metadata-service/configuration/src/main/resources/search_config.yaml @@ -69,7 +69,7 @@ queryConfigurations: - queryRegex: >- ^["'].+["']$|^[a-zA-Z0-9]\S+[_.-]\S+[a-zA-Z0-9]$ simpleQuery: false - prefixMatchQuery: false + prefixMatchQuery: true exactMatchQuery: true functionScore: functions: