feat(models): Introducing Dataset Partitions Aspect (#10997)

Co-authored-by: John Joyce <john@Johns-MBP.lan>
Co-authored-by: John Joyce <john@ip-192-168-1-200.us-west-2.compute.internal>
This commit is contained in:
John Joyce 2024-07-30 15:24:18 -07:00 committed by GitHub
parent 9321e94247
commit 6f09b96b1d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 60 additions and 10 deletions

View File

@ -0,0 +1,24 @@
namespace com.linkedin.dataset
import com.linkedin.common.AuditStamp
/**
* Summary information about a single partition: its unique id / value, plus
* optional audit stamps for when the partition was created and last modified.
*/
record PartitionSummary {
/**
* A unique id / value identifying the partition,
* generated by applying the key definition to a given row.
* Required.
*/
partition: string
/**
* The created time for a given partition, recorded as an AuditStamp
* (com.linkedin.common). Optional; may be absent.
*/
created: optional AuditStamp
/**
* The last modified / touched time for a given partition, recorded as an
* AuditStamp (com.linkedin.common). Optional; may be absent.
*/
lastModified: optional AuditStamp
}

View File

@ -0,0 +1,19 @@
namespace com.linkedin.dataset
/**
* Summarizes the partitions of a dataset by recording its minimum and maximum partition.
* Intended for Data Lake tables (e.g. Hive, S3, Iceberg, Delta, Hudi, etc).
* Exposed as the aspect named "partitionsSummary".
*/
@Aspect = {
"name": "partitionsSummary"
}
record PartitionsSummary {
/**
* The minimum partition as ordered. Optional; may be absent.
* NOTE(review): the ordering used to choose the minimum is not defined in this
* schema - confirm with the producer of this aspect.
*/
minPartition: optional PartitionSummary
/**
* The maximum partition as ordered. Optional; may be absent.
* NOTE(review): the ordering used to choose the maximum is not defined in this
* schema - confirm with the producer of this aspect.
*/
maxPartition: optional PartitionSummary
}

View File

@ -150,6 +150,8 @@ record SchemaField {
/**
* For Datasets which are partitioned, this determines the partitioning key.
* Note that multiple columns can be part of a partitioning key, but currently we do not support
* rendering the ordered partitioning key.
*/
isPartitioningKey: optional boolean

View File

@ -1,24 +1,28 @@
namespace com.linkedin.timeseries
/**
* A reference to a specific partition in a dataset.
*/
record PartitionSpec {
/**
* A unique id / value for the partition for which statistics were collected,
* generated by applying the key definition to a given row.
* Required; annotated as a timeseries field.
*/
@TimeseriesField = {}
partition: string
/**
* Time window of the partition, if we are able to extract it from the partition key.
* Optional; may be absent.
*/
timePartition: optional TimeWindow
/**
* Unused!
* Deprecated. Defaults to "PARTITION" when not supplied.
*/
@deprecated
type: enum PartitionType {
FULL_TABLE,
QUERY,
PARTITION
} = "PARTITION"
}

View File

@ -45,6 +45,7 @@ entities:
- access
- structuredProperties
- forms
- partitionsSummary
- name: dataHubPolicy
doc: DataHub Policies represent access policies granted to users or groups on metadata operations like edit, view etc.
category: internal