feat(models): Introducing Dataset Partitions Aspect (#10997)

Co-authored-by: John Joyce <john@Johns-MBP.lan> Co-authored-by: John Joyce <john@ip-192-168-1-200.us-west-2.compute.internal>
2025-08-18 14:16:48 +00:00 · 2024-07-30 15:24:18 -07:00 · 2024-07-30 15:24:18 -07:00 · 6f09b96b1d
commit 6f09b96b1d
parent 9321e94247
5 changed files with 60 additions and 10 deletions
--- a/metadata-models/src/main/pegasus/com/linkedin/dataset/PartitionSummary.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/PartitionSummary.pdl
@ -0,0 +1,24 @@
 namespace com.linkedin.dataset
 import com.linkedin.common.AuditStamp
 /**
 * Summary information about a single partition of a dataset: its identifying
 * value and, when available, audit stamps for its creation and last modification.
 */
 record PartitionSummary {
    /**
    * A unique id / value identifying the partition,
    * generated by applying the key definition to a given row.
    */
    partition: string
    /**
     * Audit stamp for the creation of this partition. Optional; absent when not known.
     */
    created: optional AuditStamp
    /**
     * Audit stamp for the last modification / touch of this partition. Optional; absent when not known.
     */
    lastModified: optional AuditStamp
 }
--- a/metadata-models/src/main/pegasus/com/linkedin/dataset/PartitionsSummary.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/PartitionsSummary.pdl
@ -0,0 +1,19 @@
 namespace com.linkedin.dataset
 /**
 * Aspect summarizing the partitions of a dataset by its minimum and maximum partition.
 * Intended for Data Lake tables (e.g. Hive, S3, Iceberg, Delta, Hudi, etc).
 */
@Aspect = {
  "name": "partitionsSummary"
 }
 record PartitionsSummary {
    /**
    * Summary of the minimum (first) partition under the partition ordering.
    * NOTE(review): the ordering used is not defined in this schema - confirm with the producer.
    */
    minPartition: optional PartitionSummary
    /**
    * Summary of the maximum (last) partition under the partition ordering.
    * NOTE(review): the ordering used is not defined in this schema - confirm with the producer.
    */
    maxPartition: optional PartitionSummary
 }
--- a/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaField.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaField.pdl
@ -150,6 +150,8 @@ record SchemaField {
  /**
   * For Datasets which are partitioned, this determines the partitioning key.
   * Note that multiple columns can be part of a partitioning key, but currently we do not support
   * rendering the ordered partitioning key.
   */
  isPartitioningKey: optional boolean
--- a/metadata-models/src/main/pegasus/com/linkedin/timeseries/PartitionSpec.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/timeseries/PartitionSpec.pdl
@ -1,24 +1,28 @@
 namespace com.linkedin.timeseries
 /**
- * Defines how the data is partitioned
+ * A reference to a specific partition in a dataset.
 */
 record PartitionSpec {
  // NOTE(review): `type` is declared a second time at the end of this record under @deprecated.
  // This first declaration looks like the pre-change (removed) position of the field - confirm.
  type: enum PartitionType {
          FULL_TABLE,
          QUERY,
          PARTITION            
      } = "PARTITION"
  /**
-   * String representation of the partition
+   * A unique id / value for the partition for which statistics were collected,
   * generated by applying the key definition to a given row.
   */
  @TimeseriesField = {}
  partition: string
  /**
-   * Time window of the partition if applicable
+   * Time window of the partition, if we are able to extract it from the partition key.
   */
  timePartition: optional TimeWindow
  /**
   * Deprecated: this field is unused. It defaults to "PARTITION".
   */
  @deprecated
  type: enum PartitionType {
          FULL_TABLE,
          QUERY,
          PARTITION
      } = "PARTITION"
 }
--- a/metadata-models/src/main/resources/entity-registry.yml
+++ b/metadata-models/src/main/resources/entity-registry.yml
@ -45,6 +45,7 @@ entities:
      - access
      - structuredProperties
      - forms
      - partitionsSummary
  - name: dataHubPolicy
    doc: DataHub Policies represent access policies granted to users or groups on metadata operations like edit, view etc.
    category: internal