2022-09-11 11:27:46 -07:00
|
|
|
from typing import Iterable
|
|
|
|
|
2024-01-31 14:42:40 +05:30
|
|
|
from datahub.metadata.schema_classes import (
|
|
|
|
DatasetFieldProfileClass,
|
|
|
|
DatasetProfileClass,
|
|
|
|
TimeWindowSizeClass,
|
|
|
|
)
|
2022-09-11 11:27:46 -07:00
|
|
|
from tests.utils import get_timestampmillis_at_start_of_day
|
|
|
|
|
|
|
|
|
|
|
|
def gen_dataset_profiles(
    num_days: int = 30,
) -> Iterable[DatasetProfileClass]:
    """
    Generates `num_days` daily test dataset profiles, oldest first.

    One profile is yielded for each relative day offset in
    [-num_days + 1, ..., 0]; each offset is converted to a timestamp with
    `get_timestampmillis_at_start_of_day`, so the last profile corresponds to
    offset 0 (today). The row count starts at 100 and grows by 100 per
    profile. Every profile carries a single field profile for "test_column"
    whose unique/null counts are derived from that same profile's row count,
    so they agree with the reported proportions (0.5 and 0.1).

    :param num_days: number of consecutive daily profiles to generate.
        A value <= 0 produces an empty iterable.
    :return: a lazy iterable of `DatasetProfileClass` objects.
    """
    num_rows: int = 100
    num_columns: int = 1
    unique_proportion: float = 0.5
    null_proportion: float = 0.1

    # [-num_days + 1, -num_days + 2, ..., 0]
    for relative_day_num in range(-num_days + 1, 1):
        timestampMillis: int = get_timestampmillis_at_start_of_day(relative_day_num)
        profile = DatasetProfileClass(
            timestampMillis=timestampMillis,
            eventGranularity=TimeWindowSizeClass(unit="DAY", multiple=1),
        )
        profile.rowCount = num_rows
        profile.columnCount = num_columns

        field_profile = DatasetFieldProfileClass(fieldPath="test_column")
        # Derive the counts from this profile's own row count; bumping
        # num_rows before this point would make them disagree with the
        # proportions reported alongside them.
        field_profile.uniqueCount = num_rows // 2
        field_profile.uniqueProportion = unique_proportion
        field_profile.nullCount = num_rows // 10
        field_profile.nullProportion = null_proportion
        field_profile.min = "10"
        field_profile.max = "20"
        field_profile.mean = "15"
        field_profile.median = "12"
        field_profile.stdev = "3"
        profile.fieldProfiles = [field_profile]

        yield profile

        # The next day's profile has 100 more rows.
        num_rows += 100
|