From 6f09b96b1d21bfb6fe38cd17f47a06c99bf9b7dc Mon Sep 17 00:00:00 2001 From: John Joyce Date: Tue, 30 Jul 2024 15:24:18 -0700 Subject: [PATCH] feat(models): Introducing Dataset Partitions Aspect (#10997) Co-authored-by: John Joyce Co-authored-by: John Joyce --- .../com/linkedin/dataset/PartitionSummary.pdl | 24 +++++++++++++++++++ .../linkedin/dataset/PartitionsSummary.pdl | 19 +++++++++++++++ .../com/linkedin/schema/SchemaField.pdl | 2 ++ .../com/linkedin/timeseries/PartitionSpec.pdl | 24 +++++++++++-------- .../src/main/resources/entity-registry.yml | 1 + 5 files changed, 60 insertions(+), 10 deletions(-) create mode 100644 metadata-models/src/main/pegasus/com/linkedin/dataset/PartitionSummary.pdl create mode 100644 metadata-models/src/main/pegasus/com/linkedin/dataset/PartitionsSummary.pdl diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/PartitionSummary.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/PartitionSummary.pdl new file mode 100644 index 0000000000..3984277a31 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/PartitionSummary.pdl @@ -0,0 +1,24 @@ +namespace com.linkedin.dataset + +import com.linkedin.common.AuditStamp + +/** + * Summary of a single dataset partition: its id plus optional created / last-modified audit stamps. + */ +record PartitionSummary { + /** + * A unique id / value for the partition for which statistics were collected, + * generated by applying the key definition to a given row. + */ + partition: string + + /** + * Audit stamp recording when this partition was created, if known. + */ + created: optional AuditStamp + + /** + * Audit stamp recording when this partition was last modified / touched, if known. 
+ */ + lastModified: optional AuditStamp +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/PartitionsSummary.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/PartitionsSummary.pdl new file mode 100644 index 0000000000..34e696890d --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/PartitionsSummary.pdl @@ -0,0 +1,19 @@ +namespace com.linkedin.dataset + +/** + * Summarizes the partitions of a dataset via its minimum and maximum partition; intended for Data Lake tables (e.g. Hive, S3, Iceberg, Delta, Hudi, etc). + */ +@Aspect = { + "name": "partitionsSummary" +} +record PartitionsSummary { + /** + * The minimum partition according to the partition ordering, if available. NOTE(review): the ordering is not specified in this schema - confirm with producers. + */ + minPartition: optional PartitionSummary + + /** + * The maximum partition according to the partition ordering, if available. NOTE(review): the ordering is not specified in this schema - confirm with producers. + */ + maxPartition: optional PartitionSummary +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaField.pdl b/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaField.pdl index afb0263057..f91e200440 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaField.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaField.pdl @@ -150,6 +150,8 @@ record SchemaField { /** * For Datasets which are partitioned, this determines the partitioning key. + * Note that multiple columns can be part of a partitioning key, but currently we do not support + * rendering the ordered partitioning key. */ isPartitioningKey: optional boolean diff --git a/metadata-models/src/main/pegasus/com/linkedin/timeseries/PartitionSpec.pdl b/metadata-models/src/main/pegasus/com/linkedin/timeseries/PartitionSpec.pdl index 084af1513e..146a285e24 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/timeseries/PartitionSpec.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/timeseries/PartitionSpec.pdl @@ -1,24 +1,28 @@ namespace com.linkedin.timeseries /** - * Defines how the data is partitioned + * A reference to a specific partition in a dataset. 
*/ record PartitionSpec { - - type: enum PartitionType { - FULL_TABLE, - QUERY, - PARTITION - } = "PARTITION" - /** - * String representation of the partition + * A unique id / value for the partition for which statistics were collected, + * generated by applying the key definition to a given row. */ @TimeseriesField = {} partition: string /** - * Time window of the partition if applicable + * Time window of the partition, if we are able to extract it from the partition key. */ timePartition: optional TimeWindow + + /** + * Unused! Deprecated field; its default value is PARTITION. + */ + @deprecated + type: enum PartitionType { + FULL_TABLE, + QUERY, + PARTITION + } = "PARTITION" } \ No newline at end of file diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 3af4af5e47..c9f9a851cc 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -45,6 +45,7 @@ entities: - access - structuredProperties - forms + - partitionsSummary - name: dataHubPolicy doc: DataHub Policies represent access policies granted to users or groups on metadata operations like edit, view etc. category: internal