mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-27 01:48:24 +00:00
add aspects to VALUE model of datasets (#1940)
This commit is contained in:
parent
e936e2b856
commit
4bfcb4b508
@ -56,8 +56,8 @@ public class DatasetUtil {
|
||||
if (dataset.hasPlatformNativeType()) {
|
||||
view.setNativeType(dataset.getPlatformNativeType().name());
|
||||
}
|
||||
if (dataset.hasRemoved()) {
|
||||
view.setRemoved(dataset.isRemoved());
|
||||
if (dataset.getStatus() != null) {
|
||||
view.setRemoved(dataset.getStatus().isRemoved());
|
||||
}
|
||||
if (dataset.hasDeprecation()) {
|
||||
view.setDeprecated(dataset.getDeprecation().isDeprecated());
|
||||
|
||||
@ -2,8 +2,12 @@ namespace com.linkedin.dataset
|
||||
|
||||
import com.linkedin.common.ChangeAuditStamps
|
||||
import com.linkedin.common.DatasetUrn
|
||||
import com.linkedin.common.InstitutionalMemory
|
||||
import com.linkedin.common.Ownership
|
||||
import com.linkedin.common.Status
|
||||
import com.linkedin.common.Uri
|
||||
import com.linkedin.common.VersionTag
|
||||
import com.linkedin.schema.SchemaMetadata
|
||||
|
||||
/**
|
||||
* Dataset spec for a data store. A collection of data conforming to a single schema that can evolve over time. This is equivalent to a Table in most data platforms. Espresso dataset: Identity.Profile; oracle dataset: member2.member_profile; hdfs dataset: /data/databases/JOBS/JOB_APPLICATIONS; kafka: PageViewEvent
|
||||
@ -74,10 +78,37 @@ record Dataset includes DatasetKey, ChangeAuditStamps, VersionTag {
|
||||
/**
|
||||
* whether the dataset is removed or not
|
||||
*/
|
||||
@deprecated
|
||||
removed: boolean = false
|
||||
|
||||
/**
|
||||
* The dataset deprecation status
|
||||
*/
|
||||
deprecation: optional DatasetDeprecation
|
||||
|
||||
/**
|
||||
* Institutional memory metadata of the dataset
|
||||
*/
|
||||
institutionalMemory: optional InstitutionalMemory
|
||||
|
||||
/**
|
||||
* Ownership metadata of the dataset
|
||||
*/
|
||||
ownership: optional Ownership
|
||||
|
||||
/**
|
||||
* Schema metadata of the dataset
|
||||
*/
|
||||
schemaMetadata: optional SchemaMetadata
|
||||
|
||||
/**
|
||||
* Status metadata of the dataset
|
||||
*/
|
||||
status: optional Status
|
||||
|
||||
/**
|
||||
* Upstream lineage metadata of the dataset
|
||||
*/
|
||||
upstreamLineage: optional UpstreamLineage
|
||||
|
||||
}
|
||||
@ -356,7 +356,8 @@
|
||||
"name" : "removed",
|
||||
"type" : "boolean",
|
||||
"doc" : "whether the dataset is removed or not",
|
||||
"default" : false
|
||||
"default" : false,
|
||||
"deprecated" : true
|
||||
}, {
|
||||
"name" : "deprecation",
|
||||
"type" : {
|
||||
@ -385,19 +386,408 @@
|
||||
},
|
||||
"doc" : "The dataset deprecation status",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "institutionalMemory",
|
||||
"type" : "com.linkedin.common.InstitutionalMemory",
|
||||
"doc" : "Institutional memory metadata of the dataset",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "ownership",
|
||||
"type" : "com.linkedin.common.Ownership",
|
||||
"doc" : "Ownership metadata of the dataset",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "schemaMetadata",
|
||||
"type" : {
|
||||
"type" : "record",
|
||||
"name" : "SchemaMetadata",
|
||||
"namespace" : "com.linkedin.schema",
|
||||
"doc" : "SchemaMetadata to describe metadata related to store schema",
|
||||
"include" : [ {
|
||||
"type" : "record",
|
||||
"name" : "SchemaMetadataKey",
|
||||
"doc" : "Key to retrieve schema metadata.",
|
||||
"fields" : [ {
|
||||
"name" : "schemaName",
|
||||
"type" : "string",
|
||||
"doc" : "Schema name e.g. PageViewEvent, identity.Profile, ams.account_management_tracking",
|
||||
"validate" : {
|
||||
"strlen" : {
|
||||
"max" : 500,
|
||||
"min" : 1
|
||||
}
|
||||
}
|
||||
}, {
|
||||
"name" : "platform",
|
||||
"type" : "com.linkedin.common.DataPlatformUrn",
|
||||
"doc" : "Standardized platform urn where schema is defined. The data platform Urn (urn:li:platform:{platform_name})"
|
||||
}, {
|
||||
"name" : "version",
|
||||
"type" : "long",
|
||||
"doc" : "Every change to SchemaMetadata in the resource results in a new version. Version is server assigned. This version differs from the platform native schema version."
|
||||
} ]
|
||||
}, "com.linkedin.common.ChangeAuditStamps" ],
|
||||
"fields" : [ {
|
||||
"name" : "dataset",
|
||||
"type" : "com.linkedin.common.DatasetUrn",
|
||||
"doc" : "Dataset this schema metadata is associated with.",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "cluster",
|
||||
"type" : "string",
|
||||
"doc" : "The cluster this schema metadata resides from",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "hash",
|
||||
"type" : "string",
|
||||
"doc" : "the SHA1 hash of the schema content"
|
||||
}, {
|
||||
"name" : "platformSchema",
|
||||
"type" : [ {
|
||||
"type" : "record",
|
||||
"name" : "EspressoSchema",
|
||||
"doc" : "Schema text of an espresso table schema.",
|
||||
"fields" : [ {
|
||||
"name" : "documentSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The native espresso document schema."
|
||||
}, {
|
||||
"name" : "tableSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The espresso table schema definition."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "OracleDDL",
|
||||
"doc" : "Schema holder for oracle data definition language that describes an oracle table.",
|
||||
"fields" : [ {
|
||||
"name" : "tableSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The native schema in the dataset's platform. This is a human readable (json blob) table schema."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "MySqlDDL",
|
||||
"doc" : "Schema holder for MySql data definition language that describes an MySql table.",
|
||||
"fields" : [ {
|
||||
"name" : "tableSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The native schema in the dataset's platform. This is a human readable (json blob) table schema."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "PrestoDDL",
|
||||
"doc" : "Schema holder for presto data definition language that describes a presto view.",
|
||||
"fields" : [ {
|
||||
"name" : "rawSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The raw schema in the dataset's platform. This includes the DDL and the columns extracted from DDL."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "KafkaSchema",
|
||||
"doc" : "Schema holder for kafka schema.",
|
||||
"fields" : [ {
|
||||
"name" : "documentSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The native kafka document schema. This is a human readable avro document schema."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "BinaryJsonSchema",
|
||||
"doc" : "Schema text of binary JSON schema.",
|
||||
"fields" : [ {
|
||||
"name" : "schema",
|
||||
"type" : "string",
|
||||
"doc" : "The native schema text for binary JSON file format."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "OrcSchema",
|
||||
"doc" : "Schema text of an ORC schema.",
|
||||
"fields" : [ {
|
||||
"name" : "schema",
|
||||
"type" : "string",
|
||||
"doc" : "The native schema for ORC file format."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "Schemaless",
|
||||
"doc" : "The dataset has no specific schema associated with it",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "KeyValueSchema",
|
||||
"doc" : "Schema text of a key-value store schema.",
|
||||
"fields" : [ {
|
||||
"name" : "keySchema",
|
||||
"type" : "string",
|
||||
"doc" : "The raw schema for the key in the key-value store."
|
||||
}, {
|
||||
"name" : "valueSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The raw schema for the value in the key-value store."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "OtherSchema",
|
||||
"doc" : "Schema holder for undefined schema types.",
|
||||
"fields" : [ {
|
||||
"name" : "rawSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The native schema in the dataset's platform."
|
||||
} ]
|
||||
} ],
|
||||
"doc" : "The native schema in the dataset's platform."
|
||||
}, {
|
||||
"name" : "fields",
|
||||
"type" : {
|
||||
"type" : "array",
|
||||
"items" : {
|
||||
"type" : "record",
|
||||
"name" : "SchemaField",
|
||||
"doc" : "SchemaField to describe metadata related to dataset schema. Schema normalization rules: http://go/tms-schema",
|
||||
"fields" : [ {
|
||||
"name" : "fieldPath",
|
||||
"type" : {
|
||||
"type" : "typeref",
|
||||
"name" : "SchemaFieldPath",
|
||||
"namespace" : "com.linkedin.dataset",
|
||||
"doc" : "Schema field path as described by schema normalizations rules: http://go/tms-schema",
|
||||
"ref" : "string"
|
||||
},
|
||||
"doc" : "Flattened name of the field. Field is computed from jsonPath field. For data translation rules refer to wiki page above."
|
||||
}, {
|
||||
"name" : "jsonPath",
|
||||
"type" : "string",
|
||||
"doc" : "Flattened name of a field in JSON Path notation.",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "nullable",
|
||||
"type" : "boolean",
|
||||
"doc" : "Indicates if this field is optional or nullable",
|
||||
"default" : false
|
||||
}, {
|
||||
"name" : "description",
|
||||
"type" : "string",
|
||||
"doc" : "Description",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "type",
|
||||
"type" : {
|
||||
"type" : "record",
|
||||
"name" : "SchemaFieldDataType",
|
||||
"doc" : "Schema field data types",
|
||||
"fields" : [ {
|
||||
"name" : "type",
|
||||
"type" : [ {
|
||||
"type" : "record",
|
||||
"name" : "BooleanType",
|
||||
"doc" : "Boolean field type.",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "FixedType",
|
||||
"doc" : "Fixed field type.",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "StringType",
|
||||
"doc" : "String field type.",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "BytesType",
|
||||
"doc" : "Bytes field type.",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "NumberType",
|
||||
"doc" : "Number data type: long, integer, short, etc..",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "EnumType",
|
||||
"doc" : "Enum field type.",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "NullType",
|
||||
"doc" : "Null field type.",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "MapType",
|
||||
"doc" : "Map field type.",
|
||||
"fields" : [ {
|
||||
"name" : "keyType",
|
||||
"type" : "string",
|
||||
"doc" : "Key type in a map",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "valueType",
|
||||
"type" : "string",
|
||||
"doc" : "Type of the value in a map",
|
||||
"optional" : true
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "ArrayType",
|
||||
"doc" : "Array field type.",
|
||||
"fields" : [ {
|
||||
"name" : "nestedType",
|
||||
"type" : {
|
||||
"type" : "array",
|
||||
"items" : "string"
|
||||
},
|
||||
"doc" : "List of types this array holds.",
|
||||
"optional" : true
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "UnionType",
|
||||
"doc" : "Union field type.",
|
||||
"fields" : [ {
|
||||
"name" : "nestedTypes",
|
||||
"type" : {
|
||||
"type" : "array",
|
||||
"items" : "string"
|
||||
},
|
||||
"doc" : "List of types in union type.",
|
||||
"optional" : true
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "RecordType",
|
||||
"doc" : "Record field type.",
|
||||
"fields" : [ ]
|
||||
} ],
|
||||
"doc" : "Data platform specific types"
|
||||
} ]
|
||||
},
|
||||
"doc" : "Platform independent field type of the field."
|
||||
}, {
|
||||
"name" : "nativeDataType",
|
||||
"type" : "string",
|
||||
"doc" : "The native type of the field in the dataset's platform as declared by platform schema."
|
||||
}, {
|
||||
"name" : "recursive",
|
||||
"type" : "boolean",
|
||||
"doc" : "There are use cases when a field in type B references type A. A field in A references field of type B. In such cases, we will mark the first field as recursive.",
|
||||
"default" : false
|
||||
} ]
|
||||
}
|
||||
},
|
||||
"doc" : "Client provided a list of fields from document schema."
|
||||
}, {
|
||||
"name" : "primaryKeys",
|
||||
"type" : {
|
||||
"type" : "array",
|
||||
"items" : "com.linkedin.dataset.SchemaFieldPath"
|
||||
},
|
||||
"doc" : "Client provided list of fields that define primary keys to access record. Field order defines hierarchical espresso keys. Empty lists indicate absence of primary key access pattern. Value is a SchemaField@fieldPath.",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "foreignKeysSpecs",
|
||||
"type" : {
|
||||
"type" : "map",
|
||||
"values" : {
|
||||
"type" : "record",
|
||||
"name" : "ForeignKeySpec",
|
||||
"doc" : "Description of a foreign key in a schema.",
|
||||
"fields" : [ {
|
||||
"name" : "foreignKey",
|
||||
"type" : [ {
|
||||
"type" : "record",
|
||||
"name" : "DatasetFieldForeignKey",
|
||||
"doc" : "For non-urn based foreign keys.",
|
||||
"fields" : [ {
|
||||
"name" : "parentDataset",
|
||||
"type" : "com.linkedin.common.DatasetUrn",
|
||||
"doc" : "dataset that stores the resource."
|
||||
}, {
|
||||
"name" : "currentFieldPaths",
|
||||
"type" : {
|
||||
"type" : "array",
|
||||
"items" : "com.linkedin.dataset.SchemaFieldPath"
|
||||
},
|
||||
"doc" : "List of fields in hosting(current) SchemaMetadata that conform a foreign key. List can contain a single entry or multiple entries if several entries in hosting schema conform a foreign key in a single parent dataset."
|
||||
}, {
|
||||
"name" : "parentField",
|
||||
"type" : "com.linkedin.dataset.SchemaFieldPath",
|
||||
"doc" : "SchemaField@fieldPath that uniquely identify field in parent dataset that this field references."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "UrnForeignKey",
|
||||
"doc" : "If SchemaMetadata fields make any external references and references are of type com.linkedin.common.Urn or any children, this model can be used to mark it.",
|
||||
"fields" : [ {
|
||||
"name" : "currentFieldPath",
|
||||
"type" : "com.linkedin.dataset.SchemaFieldPath",
|
||||
"doc" : "Field in hosting(current) SchemaMetadata."
|
||||
} ]
|
||||
} ],
|
||||
"doc" : "Foreign key definition in metadata schema."
|
||||
} ]
|
||||
}
|
||||
},
|
||||
"doc" : "Map captures all the references schema makes to external datasets. Map key is ForeignKeySpecName typeref.",
|
||||
"optional" : true
|
||||
} ]
|
||||
},
|
||||
"doc" : "Schema metadata of the dataset",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "status",
|
||||
"type" : "com.linkedin.common.Status",
|
||||
"doc" : "Status metadata of the dataset",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "upstreamLineage",
|
||||
"type" : {
|
||||
"type" : "record",
|
||||
"name" : "UpstreamLineage",
|
||||
"doc" : "Upstream lineage of a dataset",
|
||||
"fields" : [ {
|
||||
"name" : "upstreams",
|
||||
"type" : {
|
||||
"type" : "array",
|
||||
"items" : {
|
||||
"type" : "record",
|
||||
"name" : "Upstream",
|
||||
"doc" : "Upstream lineage information about a dataset including the source reporting the lineage",
|
||||
"fields" : [ {
|
||||
"name" : "auditStamp",
|
||||
"type" : "com.linkedin.common.AuditStamp",
|
||||
"doc" : "Audit stamp containing who reported the lineage and when"
|
||||
}, {
|
||||
"name" : "dataset",
|
||||
"type" : "com.linkedin.common.DatasetUrn",
|
||||
"doc" : "The upstream dataset the lineage points to"
|
||||
}, {
|
||||
"name" : "type",
|
||||
"type" : {
|
||||
"type" : "enum",
|
||||
"name" : "DatasetLineageType",
|
||||
"doc" : "The various types of supported dataset lineage",
|
||||
"symbols" : [ "COPY", "TRANSFORMED", "VIEW" ],
|
||||
"symbolDocs" : {
|
||||
"COPY" : "Direct copy without modification",
|
||||
"TRANSFORMED" : "Transformed data with modification (format or content change)",
|
||||
"VIEW" : "Represents a view defined on the sources e.g. Hive view defined on underlying hive tables or a Hive table pointing to a HDFS dataset or DALI view defined on multiple sources"
|
||||
}
|
||||
},
|
||||
"doc" : "The type of the lineage"
|
||||
} ]
|
||||
}
|
||||
},
|
||||
"doc" : "List of upstream dataset lineage information"
|
||||
} ]
|
||||
},
|
||||
"doc" : "Upstream lineage metadata of the dataset",
|
||||
"optional" : true
|
||||
} ]
|
||||
}, "com.linkedin.dataset.DatasetDeprecation", "com.linkedin.dataset.DatasetKey", {
|
||||
"type" : "enum",
|
||||
"name" : "DatasetLineageType",
|
||||
"namespace" : "com.linkedin.dataset",
|
||||
"doc" : "The various types of supported dataset lineage",
|
||||
"symbols" : [ "COPY", "TRANSFORMED", "VIEW" ],
|
||||
"symbolDocs" : {
|
||||
"COPY" : "Direct copy without modification",
|
||||
"TRANSFORMED" : "Transformed data with modification (format or content change)",
|
||||
"VIEW" : "Represents a view defined on the sources e.g. Hive view defined on underlying hive tables or a Hive table pointing to a HDFS dataset or DALI view defined on multiple sources"
|
||||
}
|
||||
}, {
|
||||
}, "com.linkedin.dataset.DatasetDeprecation", "com.linkedin.dataset.DatasetKey", "com.linkedin.dataset.DatasetLineageType", {
|
||||
"type" : "record",
|
||||
"name" : "DatasetProperties",
|
||||
"namespace" : "com.linkedin.dataset",
|
||||
@ -460,44 +850,7 @@
|
||||
},
|
||||
"doc" : "List of downstream dataset lineage information"
|
||||
} ]
|
||||
}, "com.linkedin.dataset.PlatformNativeType", {
|
||||
"type" : "typeref",
|
||||
"name" : "SchemaFieldPath",
|
||||
"namespace" : "com.linkedin.dataset",
|
||||
"doc" : "Schema field path as described by schema normalizations rules: http://go/tms-schema",
|
||||
"ref" : "string"
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "Upstream",
|
||||
"namespace" : "com.linkedin.dataset",
|
||||
"doc" : "Upstream lineage information about a dataset including the source reporting the lineage",
|
||||
"fields" : [ {
|
||||
"name" : "auditStamp",
|
||||
"type" : "com.linkedin.common.AuditStamp",
|
||||
"doc" : "Audit stamp containing who reported the lineage and when"
|
||||
}, {
|
||||
"name" : "dataset",
|
||||
"type" : "com.linkedin.common.DatasetUrn",
|
||||
"doc" : "The upstream dataset the lineage points to"
|
||||
}, {
|
||||
"name" : "type",
|
||||
"type" : "DatasetLineageType",
|
||||
"doc" : "The type of the lineage"
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "UpstreamLineage",
|
||||
"namespace" : "com.linkedin.dataset",
|
||||
"doc" : "Upstream lineage of a dataset",
|
||||
"fields" : [ {
|
||||
"name" : "upstreams",
|
||||
"type" : {
|
||||
"type" : "array",
|
||||
"items" : "Upstream"
|
||||
},
|
||||
"doc" : "List of upstream dataset lineage information"
|
||||
} ]
|
||||
}, {
|
||||
}, "com.linkedin.dataset.PlatformNativeType", "com.linkedin.dataset.SchemaFieldPath", "com.linkedin.dataset.Upstream", "com.linkedin.dataset.UpstreamLineage", {
|
||||
"type" : "record",
|
||||
"name" : "UpstreamLineageDelta",
|
||||
"namespace" : "com.linkedin.dataset",
|
||||
@ -515,337 +868,7 @@
|
||||
"name" : "DatasetAspect",
|
||||
"namespace" : "com.linkedin.metadata.aspect",
|
||||
"doc" : "A union of all supported metadata aspects for a Dataset",
|
||||
"ref" : [ "com.linkedin.dataset.DatasetProperties", "com.linkedin.dataset.DatasetDeprecation", "com.linkedin.dataset.UpstreamLineage", "com.linkedin.common.InstitutionalMemory", "com.linkedin.common.Ownership", "com.linkedin.common.Status", {
|
||||
"type" : "record",
|
||||
"name" : "SchemaMetadata",
|
||||
"namespace" : "com.linkedin.schema",
|
||||
"doc" : "SchemaMetadata to describe metadata related to store schema",
|
||||
"include" : [ {
|
||||
"type" : "record",
|
||||
"name" : "SchemaMetadataKey",
|
||||
"doc" : "Key to retrieve schema metadata.",
|
||||
"fields" : [ {
|
||||
"name" : "schemaName",
|
||||
"type" : "string",
|
||||
"doc" : "Schema name e.g. PageViewEvent, identity.Profile, ams.account_management_tracking",
|
||||
"validate" : {
|
||||
"strlen" : {
|
||||
"max" : 500,
|
||||
"min" : 1
|
||||
}
|
||||
}
|
||||
}, {
|
||||
"name" : "platform",
|
||||
"type" : "com.linkedin.common.DataPlatformUrn",
|
||||
"doc" : "Standardized platform urn where schema is defined. The data platform Urn (urn:li:platform:{platform_name})"
|
||||
}, {
|
||||
"name" : "version",
|
||||
"type" : "long",
|
||||
"doc" : "Every change to SchemaMetadata in the resource results in a new version. Version is server assigned. This version differs from the platform native schema version."
|
||||
} ]
|
||||
}, "com.linkedin.common.ChangeAuditStamps" ],
|
||||
"fields" : [ {
|
||||
"name" : "dataset",
|
||||
"type" : "com.linkedin.common.DatasetUrn",
|
||||
"doc" : "Dataset this schema metadata is associated with.",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "cluster",
|
||||
"type" : "string",
|
||||
"doc" : "The cluster this schema metadata resides from",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "hash",
|
||||
"type" : "string",
|
||||
"doc" : "the SHA1 hash of the schema content"
|
||||
}, {
|
||||
"name" : "platformSchema",
|
||||
"type" : [ {
|
||||
"type" : "record",
|
||||
"name" : "EspressoSchema",
|
||||
"doc" : "Schema text of an espresso table schema.",
|
||||
"fields" : [ {
|
||||
"name" : "documentSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The native espresso document schema."
|
||||
}, {
|
||||
"name" : "tableSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The espresso table schema definition."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "OracleDDL",
|
||||
"doc" : "Schema holder for oracle data definition language that describes an oracle table.",
|
||||
"fields" : [ {
|
||||
"name" : "tableSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The native schema in the dataset's platform. This is a human readable (json blob) table schema."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "MySqlDDL",
|
||||
"doc" : "Schema holder for MySql data definition language that describes an MySql table.",
|
||||
"fields" : [ {
|
||||
"name" : "tableSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The native schema in the dataset's platform. This is a human readable (json blob) table schema."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "PrestoDDL",
|
||||
"doc" : "Schema holder for presto data definition language that describes a presto view.",
|
||||
"fields" : [ {
|
||||
"name" : "rawSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The raw schema in the dataset's platform. This includes the DDL and the columns extracted from DDL."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "KafkaSchema",
|
||||
"doc" : "Schema holder for kafka schema.",
|
||||
"fields" : [ {
|
||||
"name" : "documentSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The native kafka document schema. This is a human readable avro document schema."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "BinaryJsonSchema",
|
||||
"doc" : "Schema text of binary JSON schema.",
|
||||
"fields" : [ {
|
||||
"name" : "schema",
|
||||
"type" : "string",
|
||||
"doc" : "The native schema text for binary JSON file format."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "OrcSchema",
|
||||
"doc" : "Schema text of an ORC schema.",
|
||||
"fields" : [ {
|
||||
"name" : "schema",
|
||||
"type" : "string",
|
||||
"doc" : "The native schema for ORC file format."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "Schemaless",
|
||||
"doc" : "The dataset has no specific schema associated with it",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "KeyValueSchema",
|
||||
"doc" : "Schema text of a key-value store schema.",
|
||||
"fields" : [ {
|
||||
"name" : "keySchema",
|
||||
"type" : "string",
|
||||
"doc" : "The raw schema for the key in the key-value store."
|
||||
}, {
|
||||
"name" : "valueSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The raw schema for the value in the key-value store."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "OtherSchema",
|
||||
"doc" : "Schema holder for undefined schema types.",
|
||||
"fields" : [ {
|
||||
"name" : "rawSchema",
|
||||
"type" : "string",
|
||||
"doc" : "The native schema in the dataset's platform."
|
||||
} ]
|
||||
} ],
|
||||
"doc" : "The native schema in the dataset's platform."
|
||||
}, {
|
||||
"name" : "fields",
|
||||
"type" : {
|
||||
"type" : "array",
|
||||
"items" : {
|
||||
"type" : "record",
|
||||
"name" : "SchemaField",
|
||||
"doc" : "SchemaField to describe metadata related to dataset schema. Schema normalization rules: http://go/tms-schema",
|
||||
"fields" : [ {
|
||||
"name" : "fieldPath",
|
||||
"type" : "com.linkedin.dataset.SchemaFieldPath",
|
||||
"doc" : "Flattened name of the field. Field is computed from jsonPath field. For data translation rules refer to wiki page above."
|
||||
}, {
|
||||
"name" : "jsonPath",
|
||||
"type" : "string",
|
||||
"doc" : "Flattened name of a field in JSON Path notation.",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "nullable",
|
||||
"type" : "boolean",
|
||||
"doc" : "Indicates if this field is optional or nullable",
|
||||
"default" : false
|
||||
}, {
|
||||
"name" : "description",
|
||||
"type" : "string",
|
||||
"doc" : "Description",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "type",
|
||||
"type" : {
|
||||
"type" : "record",
|
||||
"name" : "SchemaFieldDataType",
|
||||
"doc" : "Schema field data types",
|
||||
"fields" : [ {
|
||||
"name" : "type",
|
||||
"type" : [ {
|
||||
"type" : "record",
|
||||
"name" : "BooleanType",
|
||||
"doc" : "Boolean field type.",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "FixedType",
|
||||
"doc" : "Fixed field type.",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "StringType",
|
||||
"doc" : "String field type.",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "BytesType",
|
||||
"doc" : "Bytes field type.",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "NumberType",
|
||||
"doc" : "Number data type: long, integer, short, etc..",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "EnumType",
|
||||
"doc" : "Enum field type.",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "NullType",
|
||||
"doc" : "Null field type.",
|
||||
"fields" : [ ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "MapType",
|
||||
"doc" : "Map field type.",
|
||||
"fields" : [ {
|
||||
"name" : "keyType",
|
||||
"type" : "string",
|
||||
"doc" : "Key type in a map",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "valueType",
|
||||
"type" : "string",
|
||||
"doc" : "Type of the value in a map",
|
||||
"optional" : true
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "ArrayType",
|
||||
"doc" : "Array field type.",
|
||||
"fields" : [ {
|
||||
"name" : "nestedType",
|
||||
"type" : {
|
||||
"type" : "array",
|
||||
"items" : "string"
|
||||
},
|
||||
"doc" : "List of types this array holds.",
|
||||
"optional" : true
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "UnionType",
|
||||
"doc" : "Union field type.",
|
||||
"fields" : [ {
|
||||
"name" : "nestedTypes",
|
||||
"type" : {
|
||||
"type" : "array",
|
||||
"items" : "string"
|
||||
},
|
||||
"doc" : "List of types in union type.",
|
||||
"optional" : true
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "RecordType",
|
||||
"doc" : "Record field type.",
|
||||
"fields" : [ ]
|
||||
} ],
|
||||
"doc" : "Data platform specific types"
|
||||
} ]
|
||||
},
|
||||
"doc" : "Platform independent field type of the field."
|
||||
}, {
|
||||
"name" : "nativeDataType",
|
||||
"type" : "string",
|
||||
"doc" : "The native type of the field in the dataset's platform as declared by platform schema."
|
||||
}, {
|
||||
"name" : "recursive",
|
||||
"type" : "boolean",
|
||||
"doc" : "There are use cases when a field in type B references type A. A field in A references field of type B. In such cases, we will mark the first field as recursive.",
|
||||
"default" : false
|
||||
} ]
|
||||
}
|
||||
},
|
||||
"doc" : "Client provided a list of fields from document schema."
|
||||
}, {
|
||||
"name" : "primaryKeys",
|
||||
"type" : {
|
||||
"type" : "array",
|
||||
"items" : "com.linkedin.dataset.SchemaFieldPath"
|
||||
},
|
||||
"doc" : "Client provided list of fields that define primary keys to access record. Field order defines hierarchical espresso keys. Empty lists indicate absence of primary key access pattern. Value is a SchemaField@fieldPath.",
|
||||
"optional" : true
|
||||
}, {
|
||||
"name" : "foreignKeysSpecs",
|
||||
"type" : {
|
||||
"type" : "map",
|
||||
"values" : {
|
||||
"type" : "record",
|
||||
"name" : "ForeignKeySpec",
|
||||
"doc" : "Description of a foreign key in a schema.",
|
||||
"fields" : [ {
|
||||
"name" : "foreignKey",
|
||||
"type" : [ {
|
||||
"type" : "record",
|
||||
"name" : "DatasetFieldForeignKey",
|
||||
"doc" : "For non-urn based foreign keys.",
|
||||
"fields" : [ {
|
||||
"name" : "parentDataset",
|
||||
"type" : "com.linkedin.common.DatasetUrn",
|
||||
"doc" : "dataset that stores the resource."
|
||||
}, {
|
||||
"name" : "currentFieldPaths",
|
||||
"type" : {
|
||||
"type" : "array",
|
||||
"items" : "com.linkedin.dataset.SchemaFieldPath"
|
||||
},
|
||||
"doc" : "List of fields in hosting(current) SchemaMetadata that conform a foreign key. List can contain a single entry or multiple entries if several entries in hosting schema conform a foreign key in a single parent dataset."
|
||||
}, {
|
||||
"name" : "parentField",
|
||||
"type" : "com.linkedin.dataset.SchemaFieldPath",
|
||||
"doc" : "SchemaField@fieldPath that uniquely identify field in parent dataset that this field references."
|
||||
} ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "UrnForeignKey",
|
||||
"doc" : "If SchemaMetadata fields make any external references and references are of type com.linkedin.common.Urn or any children, this model can be used to mark it.",
|
||||
"fields" : [ {
|
||||
"name" : "currentFieldPath",
|
||||
"type" : "com.linkedin.dataset.SchemaFieldPath",
|
||||
"doc" : "Field in hosting(current) SchemaMetadata."
|
||||
} ]
|
||||
} ],
|
||||
"doc" : "Foreign key definition in metadata schema."
|
||||
} ]
|
||||
}
|
||||
},
|
||||
"doc" : "Map captures all the references schema makes to external datasets. Map key is ForeignKeySpecName typeref.",
|
||||
"optional" : true
|
||||
} ]
|
||||
} ]
|
||||
"ref" : [ "com.linkedin.dataset.DatasetProperties", "com.linkedin.dataset.DatasetDeprecation", "com.linkedin.dataset.UpstreamLineage", "com.linkedin.common.InstitutionalMemory", "com.linkedin.common.Ownership", "com.linkedin.common.Status", "com.linkedin.schema.SchemaMetadata" ]
|
||||
}, {
|
||||
"type" : "record",
|
||||
"name" : "AggregationMetadata",
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
package com.linkedin.metadata.resources.dataset;
|
||||
|
||||
import com.linkedin.common.InstitutionalMemory;
|
||||
import com.linkedin.common.Ownership;
|
||||
import com.linkedin.common.Status;
|
||||
import com.linkedin.common.urn.DatasetUrn;
|
||||
import com.linkedin.common.urn.Urn;
|
||||
@ -8,6 +10,7 @@ import com.linkedin.dataset.Dataset;
|
||||
import com.linkedin.dataset.DatasetDeprecation;
|
||||
import com.linkedin.dataset.DatasetKey;
|
||||
import com.linkedin.dataset.DatasetProperties;
|
||||
import com.linkedin.dataset.UpstreamLineage;
|
||||
import com.linkedin.metadata.aspect.DatasetAspect;
|
||||
import com.linkedin.metadata.dao.BaseBrowseDAO;
|
||||
import com.linkedin.metadata.dao.BaseLocalDAO;
|
||||
@ -35,6 +38,7 @@ import com.linkedin.restli.server.annotations.PagingContextParam;
|
||||
import com.linkedin.restli.server.annotations.QueryParam;
|
||||
import com.linkedin.restli.server.annotations.RestLiCollection;
|
||||
import com.linkedin.restli.server.annotations.RestMethod;
|
||||
import com.linkedin.schema.SchemaMetadata;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -123,19 +127,28 @@ public final class Datasets extends BaseBrowsableEntityResource<
|
||||
|
||||
ModelUtils.getAspectsFromSnapshot(snapshot).forEach(aspect -> {
|
||||
if (aspect instanceof DatasetProperties) {
|
||||
DatasetProperties datasetProperties = DatasetProperties.class.cast(aspect);
|
||||
final DatasetProperties datasetProperties = (DatasetProperties) aspect;
|
||||
value.setProperties(datasetProperties.getCustomProperties());
|
||||
value.setTags(datasetProperties.getTags());
|
||||
if (datasetProperties.hasUri()) {
|
||||
if (datasetProperties.getUri() != null) {
|
||||
value.setUri(datasetProperties.getUri());
|
||||
}
|
||||
if (datasetProperties.hasDescription()) {
|
||||
if (datasetProperties.getDescription() != null) {
|
||||
value.setDescription(datasetProperties.getDescription());
|
||||
}
|
||||
} else if (aspect instanceof DatasetDeprecation) {
|
||||
value.setDeprecation(DatasetDeprecation.class.cast(aspect));
|
||||
value.setDeprecation((DatasetDeprecation) aspect);
|
||||
} else if (aspect instanceof InstitutionalMemory) {
|
||||
value.setInstitutionalMemory((InstitutionalMemory) aspect);
|
||||
} else if (aspect instanceof Ownership) {
|
||||
value.setOwnership((Ownership) aspect);
|
||||
} else if (aspect instanceof SchemaMetadata) {
|
||||
value.setSchemaMetadata((SchemaMetadata) aspect);
|
||||
} else if (aspect instanceof Status) {
|
||||
value.setRemoved(Status.class.cast(aspect).isRemoved());
|
||||
value.setStatus((Status) aspect);
|
||||
value.setRemoved(((Status) aspect).isRemoved());
|
||||
} else if (aspect instanceof UpstreamLineage) {
|
||||
value.setUpstreamLineage((UpstreamLineage) aspect);
|
||||
}
|
||||
});
|
||||
return value;
|
||||
@ -145,14 +158,30 @@ public final class Datasets extends BaseBrowsableEntityResource<
|
||||
@Nonnull
|
||||
protected DatasetSnapshot toSnapshot(@Nonnull Dataset dataset, @Nonnull DatasetUrn datasetUrn) {
|
||||
final List<DatasetAspect> aspects = new ArrayList<>();
|
||||
if (dataset.hasProperties()) {
|
||||
if (dataset.getProperties() != null) {
|
||||
aspects.add(ModelUtils.newAspectUnion(DatasetAspect.class, getDatasetPropertiesAspect(dataset)));
|
||||
}
|
||||
if (dataset.hasDeprecation()) {
|
||||
if (dataset.getDeprecation() != null) {
|
||||
aspects.add(ModelUtils.newAspectUnion(DatasetAspect.class, dataset.getDeprecation()));
|
||||
}
|
||||
|
||||
aspects.add(ModelUtils.newAspectUnion(DatasetAspect.class, new Status().setRemoved(dataset.isRemoved())));
|
||||
if (dataset.getInstitutionalMemory() != null) {
|
||||
aspects.add(ModelUtils.newAspectUnion(DatasetAspect.class, dataset.getInstitutionalMemory()));
|
||||
}
|
||||
if (dataset.getOwnership() != null) {
|
||||
aspects.add(ModelUtils.newAspectUnion(DatasetAspect.class, dataset.getOwnership()));
|
||||
}
|
||||
if (dataset.getSchemaMetadata() != null) {
|
||||
aspects.add(ModelUtils.newAspectUnion(DatasetAspect.class, dataset.getSchemaMetadata()));
|
||||
}
|
||||
if (dataset.getStatus() != null) {
|
||||
aspects.add(ModelUtils.newAspectUnion(DatasetAspect.class, dataset.getStatus()));
|
||||
}
|
||||
if (dataset.getUpstreamLineage() != null) {
|
||||
aspects.add(ModelUtils.newAspectUnion(DatasetAspect.class, dataset.getUpstreamLineage()));
|
||||
}
|
||||
if (dataset.hasRemoved()) {
|
||||
aspects.add(DatasetAspect.create(new Status().setRemoved(dataset.isRemoved())));
|
||||
}
|
||||
return ModelUtils.newSnapshot(DatasetSnapshot.class, datasetUrn, aspects);
|
||||
}
|
||||
|
||||
@ -161,10 +190,10 @@ public final class Datasets extends BaseBrowsableEntityResource<
|
||||
final DatasetProperties datasetProperties = new DatasetProperties();
|
||||
datasetProperties.setDescription(dataset.getDescription());
|
||||
datasetProperties.setTags(dataset.getTags());
|
||||
if (dataset.hasUri()) {
|
||||
if (dataset.getUri() != null) {
|
||||
datasetProperties.setUri(dataset.getUri());
|
||||
}
|
||||
if (dataset.hasPlatform()) {
|
||||
if (dataset.getProperties() != null) {
|
||||
datasetProperties.setCustomProperties(dataset.getProperties());
|
||||
}
|
||||
return datasetProperties;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user