fix(search): fix regression from #10932 (#11309)

This commit is contained in:
david-leifker 2024-09-05 14:09:13 -05:00 committed by GitHub
parent d788cd753c
commit d795a5357d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 260 additions and 202 deletions

View File

@ -19,10 +19,6 @@ import com.linkedin.schema.StringType;
import datahub.protobuf.ProtobufUtils; import datahub.protobuf.ProtobufUtils;
import datahub.protobuf.visitors.ProtobufModelVisitor; import datahub.protobuf.visitors.ProtobufModelVisitor;
import datahub.protobuf.visitors.VisitContext; import datahub.protobuf.visitors.VisitContext;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
@ -31,7 +27,9 @@ import java.util.Map;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
@Builder(toBuilder = true) @Builder(toBuilder = true)
@Getter @Getter
@ -87,7 +85,9 @@ public class ProtobufField implements ProtobufElement {
@Override @Override
public String nativeType() { public String nativeType() {
return Optional.ofNullable(nativeType).orElseGet(() -> { return Optional.ofNullable(nativeType)
.orElseGet(
() -> {
if (fieldProto.getTypeName().isEmpty()) { if (fieldProto.getTypeName().isEmpty()) {
return fieldProto.getType().name().split("_")[1].toLowerCase(); return fieldProto.getType().name().split("_")[1].toLowerCase();
} else { } else {
@ -98,7 +98,9 @@ public class ProtobufField implements ProtobufElement {
@Override @Override
public String fieldPathType() { public String fieldPathType() {
return Optional.ofNullable(fieldPathType).orElseGet(() -> { return Optional.ofNullable(fieldPathType)
.orElseGet(
() -> {
final String pathType; final String pathType;
switch (fieldProto.getType()) { switch (fieldProto.getType()) {
@ -140,7 +142,9 @@ public class ProtobufField implements ProtobufElement {
break; break;
default: default:
throw new IllegalStateException( throw new IllegalStateException(
String.format("Unexpected FieldDescriptorProto => FieldPathType %s", fieldProto.getType())); String.format(
"Unexpected FieldDescriptorProto => FieldPathType %s",
fieldProto.getType()));
} }
StringArray fieldPath = new StringArray(); StringArray fieldPath = new StringArray();
@ -165,7 +169,9 @@ public class ProtobufField implements ProtobufElement {
} }
public SchemaFieldDataType schemaFieldDataType() throws IllegalStateException { public SchemaFieldDataType schemaFieldDataType() throws IllegalStateException {
return Optional.ofNullable(schemaFieldDataType).orElseGet(() -> { return Optional.ofNullable(schemaFieldDataType)
.orElseGet(
() -> {
final SchemaFieldDataType.Type fieldType; final SchemaFieldDataType.Type fieldType;
switch (fieldProto.getType()) { switch (fieldProto.getType()) {
@ -203,12 +209,16 @@ public class ProtobufField implements ProtobufElement {
break; break;
default: default:
throw new IllegalStateException( throw new IllegalStateException(
String.format("Unexpected FieldDescriptorProto => SchemaFieldDataType: %s", fieldProto.getType())); String.format(
"Unexpected FieldDescriptorProto => SchemaFieldDataType: %s",
fieldProto.getType()));
} }
if (fieldProto.getLabel().equals(FieldDescriptorProto.Label.LABEL_REPEATED)) { if (fieldProto.getLabel().equals(FieldDescriptorProto.Label.LABEL_REPEATED)) {
return new SchemaFieldDataType().setType( return new SchemaFieldDataType()
SchemaFieldDataType.Type.create(new ArrayType().setNestedType(new StringArray()))); .setType(
SchemaFieldDataType.Type.create(
new ArrayType().setNestedType(new StringArray())));
} }
return new SchemaFieldDataType().setType(fieldType); return new SchemaFieldDataType().setType(fieldType);
@ -219,22 +229,31 @@ public class ProtobufField implements ProtobufElement {
public Stream<SourceCodeInfo.Location> messageLocations() { public Stream<SourceCodeInfo.Location> messageLocations() {
List<SourceCodeInfo.Location> fileLocations = fileProto().getSourceCodeInfo().getLocationList(); List<SourceCodeInfo.Location> fileLocations = fileProto().getSourceCodeInfo().getLocationList();
return fileLocations.stream() return fileLocations.stream()
.filter(loc -> loc.getPathCount() > 1 && loc.getPath(0) == FileDescriptorProto.MESSAGE_TYPE_FIELD_NUMBER); .filter(
loc ->
loc.getPathCount() > 1
&& loc.getPath(0) == FileDescriptorProto.MESSAGE_TYPE_FIELD_NUMBER);
} }
@Override @Override
public String comment() { public String comment() {
return messageLocations().filter(location -> location.getPathCount() > 3) return messageLocations()
.filter(location -> !ProtobufUtils.collapseLocationComments(location).isEmpty() && !isEnumType( .filter(location -> location.getPathCount() > 3)
location.getPathList())) .filter(
.filter(location -> { location ->
!ProtobufUtils.collapseLocationComments(location).isEmpty()
&& !isEnumType(location.getPathList()))
.filter(
location -> {
List<Integer> pathList = location.getPathList(); List<Integer> pathList = location.getPathList();
DescriptorProto messageType = fileProto().getMessageType(pathList.get(1)); DescriptorProto messageType = fileProto().getMessageType(pathList.get(1));
if (!isNestedType && location.getPath(2) == DescriptorProto.FIELD_FIELD_NUMBER if (!isNestedType
&& location.getPath(2) == DescriptorProto.FIELD_FIELD_NUMBER
&& fieldProto == messageType.getField(location.getPath(3))) { && fieldProto == messageType.getField(location.getPath(3))) {
return true; return true;
} else if (isNestedType && location.getPath(2) == DescriptorProto.NESTED_TYPE_FIELD_NUMBER } else if (isNestedType
&& location.getPath(2) == DescriptorProto.NESTED_TYPE_FIELD_NUMBER
&& fieldProto == getNestedTypeFields(pathList, messageType)) { && fieldProto == getNestedTypeFields(pathList, messageType)) {
return true; return true;
} }
@ -245,12 +264,15 @@ public class ProtobufField implements ProtobufElement {
.trim(); .trim();
} }
private FieldDescriptorProto getNestedTypeFields(List<Integer> pathList, DescriptorProto messageType) { private FieldDescriptorProto getNestedTypeFields(
List<Integer> pathList, DescriptorProto messageType) {
int pathSize = pathList.size(); int pathSize = pathList.size();
List<Integer> nestedValues = new ArrayList<>(pathSize); List<Integer> nestedValues = new ArrayList<>(pathSize);
for (int index = 0; index < pathSize; index++) { for (int index = 0; index < pathSize; index++) {
if (index > 1 && index % 2 == 0 && pathList.get(index) == DescriptorProto.NESTED_TYPE_FIELD_NUMBER) { if (index > 1
&& index % 2 == 0
&& pathList.get(index) == DescriptorProto.NESTED_TYPE_FIELD_NUMBER) {
nestedValues.add(pathList.get(index + 1)); nestedValues.add(pathList.get(index + 1));
} }
} }
@ -260,7 +282,9 @@ public class ProtobufField implements ProtobufElement {
} }
int fieldIndex = pathList.get(pathList.size() - 1); int fieldIndex = pathList.get(pathList.size() - 1);
if (isFieldPath(pathList) && pathSize % 2 == 0 && fieldIndex < messageType.getFieldList().size()) { if (isFieldPath(pathList)
&& pathSize % 2 == 0
&& fieldIndex < messageType.getFieldList().size()) {
return messageType.getField(fieldIndex); return messageType.getField(fieldIndex);
} }
@ -273,7 +297,9 @@ public class ProtobufField implements ProtobufElement {
private boolean isEnumType(List<Integer> pathList) { private boolean isEnumType(List<Integer> pathList) {
for (int index = 0; index < pathList.size(); index++) { for (int index = 0; index < pathList.size(); index++) {
if (index > 1 && index % 2 == 0 && pathList.get(index) == DescriptorProto.ENUM_TYPE_FIELD_NUMBER) { if (index > 1
&& index % 2 == 0
&& pathList.get(index) == DescriptorProto.ENUM_TYPE_FIELD_NUMBER) {
return true; return true;
} }
} }
@ -327,7 +353,9 @@ public class ProtobufField implements ProtobufElement {
} }
public List<DescriptorProtos.EnumValueDescriptorProto> getEnumValues() { public List<DescriptorProtos.EnumValueDescriptorProto> getEnumValues() {
return getEnumDescriptor().map(DescriptorProtos.EnumDescriptorProto::getValueList).orElse(Collections.emptyList()); return getEnumDescriptor()
.map(DescriptorProtos.EnumDescriptorProto::getValueList)
.orElse(Collections.emptyList());
} }
public Map<String, String> getEnumValuesWithComments() { public Map<String, String> getEnumValuesWithComments() {
@ -347,7 +375,8 @@ public class ProtobufField implements ProtobufElement {
for (int i = 0; i < values.size(); i++) { for (int i = 0; i < values.size(); i++) {
DescriptorProtos.EnumValueDescriptorProto value = values.get(i); DescriptorProtos.EnumValueDescriptorProto value = values.get(i);
int finalI = i; int finalI = i;
String comment = locations.stream() String comment =
locations.stream()
.filter(loc -> isEnumValueLocation(loc, enumIndex, finalI)) .filter(loc -> isEnumValueLocation(loc, enumIndex, finalI))
.findFirst() .findFirst()
.map(ProtobufUtils::collapseLocationComments) .map(ProtobufUtils::collapseLocationComments)
@ -359,8 +388,8 @@ public class ProtobufField implements ProtobufElement {
return valueComments; return valueComments;
} }
private boolean isEnumValueLocation(DescriptorProtos.SourceCodeInfo.Location location, int enumIndex, private boolean isEnumValueLocation(
int valueIndex) { DescriptorProtos.SourceCodeInfo.Location location, int enumIndex, int valueIndex) {
return location.getPathCount() > 3 return location.getPathCount() > 3
&& location.getPath(0) == DescriptorProtos.FileDescriptorProto.ENUM_TYPE_FIELD_NUMBER && location.getPath(0) == DescriptorProtos.FileDescriptorProto.ENUM_TYPE_FIELD_NUMBER
&& location.getPath(1) == enumIndex && location.getPath(1) == enumIndex

View File

@ -1,5 +1,8 @@
package datahub.protobuf.visitors.field; package datahub.protobuf.visitors.field;
import static datahub.protobuf.ProtobufUtils.getFieldOptions;
import static datahub.protobuf.ProtobufUtils.getMessageOptions;
import com.linkedin.common.GlobalTags; import com.linkedin.common.GlobalTags;
import com.linkedin.common.GlossaryTermAssociation; import com.linkedin.common.GlossaryTermAssociation;
import com.linkedin.common.GlossaryTermAssociationArray; import com.linkedin.common.GlossaryTermAssociationArray;
@ -15,47 +18,54 @@ import datahub.protobuf.model.ProtobufElement;
import datahub.protobuf.model.ProtobufField; import datahub.protobuf.model.ProtobufField;
import datahub.protobuf.visitors.ProtobufExtensionUtil; import datahub.protobuf.visitors.ProtobufExtensionUtil;
import datahub.protobuf.visitors.VisitContext; import datahub.protobuf.visitors.VisitContext;
import org.jgrapht.GraphPath;
import java.util.Comparator; import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.jgrapht.GraphPath;
import static datahub.protobuf.ProtobufUtils.getFieldOptions;
import static datahub.protobuf.ProtobufUtils.getMessageOptions;
public class ProtobufExtensionFieldVisitor extends SchemaFieldVisitor { public class ProtobufExtensionFieldVisitor extends SchemaFieldVisitor {
@Override @Override
public Stream<Pair<SchemaField, Double>> visitField(ProtobufField field, VisitContext context) { public Stream<Pair<SchemaField, Double>> visitField(ProtobufField field, VisitContext context) {
boolean isPrimaryKey = getFieldOptions(field.getFieldProto()).stream() boolean isPrimaryKey =
getFieldOptions(field.getFieldProto()).stream()
.map(Pair::getKey) .map(Pair::getKey)
.anyMatch(fieldDesc -> fieldDesc.getName().matches("(?i).*primary_?key")); .anyMatch(fieldDesc -> fieldDesc.getName().matches("(?i).*primary_?key"));
List<TagAssociation> tags = getTagAssociations(field, context); List<TagAssociation> tags = getTagAssociations(field, context);
List<GlossaryTermAssociation> terms = getGlossaryTermAssociations(field, context); List<GlossaryTermAssociation> terms = getGlossaryTermAssociations(field, context);
return context.streamAllPaths(field) return context
.map(path -> Pair.of(createSchemaField(field, context, path, isPrimaryKey, tags, terms), .streamAllPaths(field)
.map(
path ->
Pair.of(
createSchemaField(field, context, path, isPrimaryKey, tags, terms),
context.calculateSortOrder(path, field))); context.calculateSortOrder(path, field)));
} }
private SchemaField createSchemaField(ProtobufField field, VisitContext context, private SchemaField createSchemaField(
GraphPath<ProtobufElement, FieldTypeEdge> path, boolean isPrimaryKey, List<TagAssociation> tags, ProtobufField field,
VisitContext context,
GraphPath<ProtobufElement, FieldTypeEdge> path,
boolean isPrimaryKey,
List<TagAssociation> tags,
List<GlossaryTermAssociation> terms) { List<GlossaryTermAssociation> terms) {
String description = createFieldDescription(field); String description = createFieldDescription(field);
return new SchemaField().setFieldPath(context.getFieldPath(path)) return new SchemaField()
.setFieldPath(context.getFieldPath(path))
.setNullable(!isPrimaryKey) .setNullable(!isPrimaryKey)
.setIsPartOfKey(isPrimaryKey) .setIsPartOfKey(isPrimaryKey)
.setDescription(description) .setDescription(description)
.setNativeDataType(field.nativeType()) .setNativeDataType(field.nativeType())
.setType(field.schemaFieldDataType()) .setType(field.schemaFieldDataType())
.setGlobalTags(new GlobalTags().setTags(new TagAssociationArray(tags))) .setGlobalTags(new GlobalTags().setTags(new TagAssociationArray(tags)))
.setGlossaryTerms(new GlossaryTerms().setTerms(new GlossaryTermAssociationArray(terms)) .setGlossaryTerms(
new GlossaryTerms()
.setTerms(new GlossaryTermAssociationArray(terms))
.setAuditStamp(context.getAuditStamp())); .setAuditStamp(context.getAuditStamp()));
} }
@ -73,10 +83,15 @@ public class ProtobufExtensionFieldVisitor extends SchemaFieldVisitor {
return description.toString(); return description.toString();
} }
private void appendEnumValues(StringBuilder description, ProtobufField field, private void appendEnumValues(
Map<String, String> enumValuesWithComments) { StringBuilder description, ProtobufField field, Map<String, String> enumValuesWithComments) {
enumValuesWithComments.forEach((name, comment) -> { enumValuesWithComments.forEach(
field.getEnumValues().stream().filter(v -> v.getName().equals(name)).findFirst().ifPresent(value -> { (name, comment) -> {
field.getEnumValues().stream()
.filter(v -> v.getName().equals(name))
.findFirst()
.ifPresent(
value -> {
description.append(String.format("%d: %s", value.getNumber(), name)); description.append(String.format("%d: %s", value.getNumber(), name));
if (!comment.isEmpty()) { if (!comment.isEmpty()) {
description.append(" - ").append(comment); description.append(" - ").append(comment);
@ -88,11 +103,13 @@ public class ProtobufExtensionFieldVisitor extends SchemaFieldVisitor {
private List<TagAssociation> getTagAssociations(ProtobufField field, VisitContext context) { private List<TagAssociation> getTagAssociations(ProtobufField field, VisitContext context) {
Stream<TagAssociation> fieldTags = Stream<TagAssociation> fieldTags =
ProtobufExtensionUtil.extractTagPropertiesFromOptions(getFieldOptions(field.getFieldProto()), ProtobufExtensionUtil.extractTagPropertiesFromOptions(
context.getGraph().getRegistry()).map(tag -> new TagAssociation().setTag(new TagUrn(tag.getName()))); getFieldOptions(field.getFieldProto()), context.getGraph().getRegistry())
.map(tag -> new TagAssociation().setTag(new TagUrn(tag.getName())));
Stream<TagAssociation> promotedTags = Stream<TagAssociation> promotedTags =
promotedTags(field, context).map(tag -> new TagAssociation().setTag(new TagUrn(tag.getName()))); promotedTags(field, context)
.map(tag -> new TagAssociation().setTag(new TagUrn(tag.getName())));
return Stream.concat(fieldTags, promotedTags) return Stream.concat(fieldTags, promotedTags)
.distinct() .distinct()
@ -100,10 +117,11 @@ public class ProtobufExtensionFieldVisitor extends SchemaFieldVisitor {
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
private List<GlossaryTermAssociation> getGlossaryTermAssociations(ProtobufField field, VisitContext context) { private List<GlossaryTermAssociation> getGlossaryTermAssociations(
ProtobufField field, VisitContext context) {
Stream<GlossaryTermAssociation> fieldTerms = Stream<GlossaryTermAssociation> fieldTerms =
ProtobufExtensionUtil.extractTermAssociationsFromOptions(getFieldOptions(field.getFieldProto()), ProtobufExtensionUtil.extractTermAssociationsFromOptions(
context.getGraph().getRegistry()); getFieldOptions(field.getFieldProto()), context.getGraph().getRegistry());
Stream<GlossaryTermAssociation> promotedTerms = promotedTerms(field, context); Stream<GlossaryTermAssociation> promotedTerms = promotedTerms(field, context);
@ -120,11 +138,12 @@ public class ProtobufExtensionFieldVisitor extends SchemaFieldVisitor {
*/ */
private Stream<TagProperties> promotedTags(ProtobufField field, VisitContext context) { private Stream<TagProperties> promotedTags(ProtobufField field, VisitContext context) {
if (field.isMessage()) { if (field.isMessage()) {
return context.getGraph() return context.getGraph().outgoingEdgesOf(field).stream()
.outgoingEdgesOf(field) .flatMap(
.stream() e ->
.flatMap(e -> ProtobufExtensionUtil.extractTagPropertiesFromOptions( ProtobufExtensionUtil.extractTagPropertiesFromOptions(
getMessageOptions(e.getEdgeTarget().messageProto()), context.getGraph().getRegistry())) getMessageOptions(e.getEdgeTarget().messageProto()),
context.getGraph().getRegistry()))
.distinct(); .distinct();
} else { } else {
return Stream.of(); return Stream.of();
@ -138,11 +157,12 @@ public class ProtobufExtensionFieldVisitor extends SchemaFieldVisitor {
*/ */
private Stream<GlossaryTermAssociation> promotedTerms(ProtobufField field, VisitContext context) { private Stream<GlossaryTermAssociation> promotedTerms(ProtobufField field, VisitContext context) {
if (field.isMessage()) { if (field.isMessage()) {
return context.getGraph() return context.getGraph().outgoingEdgesOf(field).stream()
.outgoingEdgesOf(field) .flatMap(
.stream() e ->
.flatMap(e -> ProtobufExtensionUtil.extractTermAssociationsFromOptions( ProtobufExtensionUtil.extractTermAssociationsFromOptions(
getMessageOptions(e.getEdgeTarget().messageProto()), context.getGraph().getRegistry())) getMessageOptions(e.getEdgeTarget().messageProto()),
context.getGraph().getRegistry()))
.distinct(); .distinct();
} else { } else {
return Stream.of(); return Stream.of();

View File

@ -48,7 +48,8 @@ public class ProtobufUtilsTest {
@Test @Test
public void testCollapseLocationCommentsWithUTF8() { public void testCollapseLocationCommentsWithUTF8() {
DescriptorProtos.SourceCodeInfo.Location location = DescriptorProtos.SourceCodeInfo.Location.newBuilder() DescriptorProtos.SourceCodeInfo.Location location =
DescriptorProtos.SourceCodeInfo.Location.newBuilder()
.addAllLeadingDetachedComments(Arrays.asList("/* Emoji 😊 */", "/* Accented é */")) .addAllLeadingDetachedComments(Arrays.asList("/* Emoji 😊 */", "/* Accented é */"))
.setLeadingComments("/* Chinese 你好 */\n// Russian Привет") .setLeadingComments("/* Chinese 你好 */\n// Russian Привет")
.setTrailingComments("// Korean 안녕") .setTrailingComments("// Korean 안녕")

View File

@ -361,14 +361,22 @@ public class ProtobufFieldTest {
ProtobufDataset test = getTestProtobufDataset("extended_protobuf", "messageE"); ProtobufDataset test = getTestProtobufDataset("extended_protobuf", "messageE");
SchemaMetadata testMetadata = test.getSchemaMetadata(); SchemaMetadata testMetadata = test.getSchemaMetadata();
SchemaField timestampField = testMetadata.getFields() SchemaField timestampField =
.stream() testMetadata.getFields().stream()
.filter(v -> v.getFieldPath() .filter(
.equals("[version=2.0].[type=extended_protobuf_TimestampUnitMessage].[type=enum].timestamp_unit_type")) v ->
v.getFieldPath()
.equals(
"[version=2.0].[type=extended_protobuf_TimestampUnitMessage].[type=enum].timestamp_unit_type"))
.findFirst() .findFirst()
.orElseThrow(); .orElseThrow();
assertEquals("timestamp unit\n" + "\n" + "0: MILLISECOND - 10^-3 seconds\n" + "1: MICROSECOND - 10^-6 seconds\n" assertEquals(
+ "2: NANOSECOND - 10^-9 seconds\n", timestampField.getDescription()); "timestamp unit\n"
+ "\n"
+ "0: MILLISECOND - 10^-3 seconds\n"
+ "1: MICROSECOND - 10^-6 seconds\n"
+ "2: NANOSECOND - 10^-9 seconds\n",
timestampField.getDescription());
} }
} }

View File

@ -52,7 +52,7 @@ import org.opensearch.index.query.functionscore.ScoreFunctionBuilders;
@Slf4j @Slf4j
public class SearchQueryBuilder { public class SearchQueryBuilder {
public static final String STRUCTURED_QUERY_PREFIX = "\\\\/q "; public static final String STRUCTURED_QUERY_PREFIX = "\\/q ";
private final ExactMatchConfiguration exactMatchConfiguration; private final ExactMatchConfiguration exactMatchConfiguration;
private final PartialConfiguration partialConfiguration; private final PartialConfiguration partialConfiguration;
private final WordGramConfiguration wordGramConfiguration; private final WordGramConfiguration wordGramConfiguration;