feat(dataset): Enable search of datasets by field names (#2001)

* feat(dataset): Enable search of datasets by field names
Nagarjuna Kanamarlapudi 2020-11-20 12:01:07 -08:00 committed by GitHub
parent 6f59a91865
commit 5d083143db
5 changed files with 77 additions and 10 deletions

File: dataset search field configuration (TypeScript)

@@ -57,6 +57,15 @@ export const fields: Array<ISearchEntityRenderProps> = [
desc: 'The platform of the dataset',
example: 'platform:kafka'
},
+{
+showInAutoCompletion: true,
+fieldName: 'fieldPaths',
+showInResultsPreview: false,
+displayName: 'fieldPaths',
+showInFacets: true,
+desc: 'Fields of the dataset',
+example: 'fieldPaths:field1'
+},
{
showInAutoCompletion: false,
fieldName: 'healthScore',
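
This entry makes field names first-class in the dataset search UI: type-ahead suggestions are enabled (showInAutoCompletion: true) and the value is offered as a facet (showInFacets: true), but matches are not echoed in the results preview. Queries use the same fieldName:value syntax as the platform entry above; combining the two clauses in the second line below is an illustrative assumption, not something this diff itself exercises.

fieldPaths:field1
fieldPaths:field1 platform:kafka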

File: dataset index settings and mappings (JSON)

@@ -52,6 +52,13 @@
"type": "custom",
"tokenizer": "dataset_pattern"
},
"field_pattern": {
"filter": [
"lowercase"
],
"type": "custom",
"tokenizer": "field_pattern"
},
"comma_pattern": {
"filter": [
"lowercase"
@@ -103,6 +110,14 @@
"type": "custom",
"tokenizer": "dataset_pattern"
},
"field_pattern_ngram": {
"filter": [
"lowercase",
"autocomplete_filter"
],
"type": "custom",
"tokenizer": "field_pattern"
},
"custom_browse_slash": {
"filter": [
"lowercase"
@@ -133,6 +148,10 @@
"dataset_pattern": {
"pattern": "[./]",
"type": "pattern"
-}
+},
+"field_pattern": {
+"pattern": "[./]",
+"type": "pattern"
+}
}
}
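
Together these settings make field paths searchable by segment: the field_pattern tokenizer splits on "." or "/" (pattern "[./]") and lowercases, while field_pattern_ngram additionally runs the segments through the index's existing autocomplete_filter for type-ahead matching. A quick sanity check via Elasticsearch's _analyze API; the index name is a placeholder, and the expected output assumes the tokenizer defined above:

POST /<dataset-index>/_analyze
{
  "analyzer": "field_pattern",
  "text": "foo.bar.baz"
}

Expected tokens: foo, bar, baz.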
@@ -171,7 +190,8 @@
"keyword": {
"type": "keyword"
}
}
},
},
"hasOwners": {
"type": "boolean"
@@ -201,6 +221,28 @@
},
"normalizer": "my_normalizer"
},
"fieldPaths": {
"type": "keyword",
"fields": {
"field_pattern_ngram": {
"type": "text",
"analyzer": "field_pattern_ngram"
},
"delimited": {
"type": "text",
"analyzer": "delimit"
},
"ngram": {
"type": "text",
"analyzer": "custom_ngram"
},
"pattern": {
"type": "text",
"analyzer": "field_pattern"
}
},
"normalizer": "my_normalizer"
},
"num_downstream_datasets": {
"type": "long"
},
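
The multi-field mapping gives queries several angles on the same value: exact whole-path matches via the keyword root (lowercased by my_normalizer), per-segment matches via pattern, word-delimited matches via delimited, and partial or type-ahead matches via ngram and field_pattern_ngram. A sketch of a query fanning out across the sub-fields; the index name is a placeholder and the field list is an assumption about how the search layer might use this mapping:

GET /<dataset-index>/_search
{
  "query": {
    "multi_match": {
      "query": "bar",
      "fields": [
        "fieldPaths",
        "fieldPaths.pattern",
        "fieldPaths.delimited",
        "fieldPaths.ngram",
        "fieldPaths.field_pattern_ngram"
      ]
    }
  }
}

A document with fieldPaths ["foo.bar.baz"] matches through fieldPaths.pattern, since field_pattern indexes the segments foo, bar, and baz.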

File: DatasetIndexBuilder.java

@@ -11,14 +11,14 @@ import com.linkedin.dataset.DatasetProperties;
import com.linkedin.dataset.UpstreamLineage;
import com.linkedin.metadata.search.DatasetDocument;
import com.linkedin.metadata.snapshot.DatasetSnapshot;
+import com.linkedin.schema.SchemaField;
import com.linkedin.schema.SchemaMetadata;
-import lombok.extern.slf4j.Slf4j;
-import javax.annotation.Nonnull;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
+import javax.annotation.Nonnull;
+import lombok.extern.slf4j.Slf4j;
@Slf4j
public class DatasetIndexBuilder extends BaseIndexBuilder<DatasetDocument> {
@@ -79,10 +79,11 @@ public class DatasetIndexBuilder extends BaseIndexBuilder<DatasetDocument> {
}
@Nonnull
-private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn, @Nonnull SchemaMetadata schemaMetadata) {
-return new DatasetDocument()
-.setUrn(urn)
-.setHasSchema(true);
-}
+private DatasetDocument getDocumentToUpdateFromAspect(@Nonnull DatasetUrn urn,
+@Nonnull SchemaMetadata schemaMetadata) {
+final StringArray fieldPaths = new StringArray(
+schemaMetadata.getFields().stream().map(SchemaField::getFieldPath).collect(Collectors.toList()));
+return new DatasetDocument().setUrn(urn).setHasSchema(true).setFieldPaths(fieldPaths);
+}
@Nonnull

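Net effect of the builder change, sketched as JSON using the fixture from the test below; the exact serialized shape of the partial document is an assumption:

SchemaMetadata aspect in:
{ "fields": [ { "fieldPath": "foo.bar.baz", "nativeDataType": "boolean" } ] }

Partial DatasetDocument out:
{ "urn": "urn:li:dataset:(urn:li:dataPlatform:foo,bar,PROD)", "hasSchema": true, "fieldPaths": ["foo.bar.baz"] }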
File: DatasetIndexBuilderTest.java

@@ -23,6 +23,10 @@ import com.linkedin.metadata.aspect.DatasetAspect;
import com.linkedin.metadata.dao.utils.ModelUtils;
import com.linkedin.metadata.search.DatasetDocument;
import com.linkedin.metadata.snapshot.DatasetSnapshot;
+import com.linkedin.schema.BooleanType;
+import com.linkedin.schema.SchemaField;
+import com.linkedin.schema.SchemaFieldArray;
+import com.linkedin.schema.SchemaFieldDataType;
import com.linkedin.schema.SchemaMetadata;
import java.util.Collections;
import java.util.List;
@@ -178,11 +182,17 @@ public class DatasetIndexBuilderTest {
@Test
public void schemaMetadata() {
// given
+final SchemaFieldArray schemaFieldArray = new SchemaFieldArray(new SchemaField().setFieldPath("foo.bar.baz")
+.setType(new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BooleanType())))
+.setNullable(false)
+.setNativeDataType("boolean")
+.setRecursive(false));
final DatasetUrn datasetUrn = new DatasetUrn(new DataPlatformUrn("foo"), "bar", FabricType.PROD);
-final SchemaMetadata schemaMetadata = new SchemaMetadata();
+final SchemaMetadata schemaMetadata = new SchemaMetadata().setFields(schemaFieldArray);
final DatasetSnapshot datasetSnapshot = ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn,
Collections.singletonList(ModelUtils.newAspectUnion(DatasetAspect.class, schemaMetadata)));
-final DatasetDocument expectedDocument1 = new DatasetDocument().setUrn(datasetUrn).setHasSchema(true);
+final DatasetDocument expectedDocument1 =
+new DatasetDocument().setUrn(datasetUrn).setHasSchema(true).setFieldPaths(new StringArray("foo.bar.baz"));
final DatasetDocument expectedDocument2 = new DatasetDocument().setUrn(datasetUrn)
.setBrowsePaths(new StringArray("/prod/foo/bar"))
.setOrigin(FabricType.PROD)

File: DatasetDocument.pdl

@@ -43,6 +43,11 @@ record DatasetDocument includes BaseDocument {
*/
description: optional string
+/**
+* Field paths of the dataset
+*/
+fieldPaths: optional array[string]
/**
* Flag to indicate if the dataset has non empty corp users as owners or not.
*/
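
Because fieldPaths is optional, documents built from aspects other than SchemaMetadata simply omit it. A minimal sketch of the two document shapes produced in DatasetIndexBuilderTest, with serialization details assumed:

{ "urn": "urn:li:dataset:(urn:li:dataPlatform:foo,bar,PROD)", "hasSchema": true, "fieldPaths": ["foo.bar.baz"] }
{ "urn": "urn:li:dataset:(urn:li:dataPlatform:foo,bar,PROD)", "browsePaths": ["/prod/foo/bar"], "origin": "PROD" }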