44 lines
1.5 KiB
Python
Raw Permalink Normal View History

from typing import Iterable
from datahub.metadata.schema_classes import (
DatasetFieldProfileClass,
DatasetProfileClass,
TimeWindowSizeClass,
)
from tests.utils import get_timestampmillis_at_start_of_day
def gen_dataset_profiles(
    num_days: int = 30,
) -> Iterable[DatasetProfileClass]:
    """
    Lazily yield `num_days` daily test dataset profiles.

    One profile is produced per relative day offset in
    [-num_days + 1, -num_days + 2, ..., 0], in ascending order. Each profile
    is stamped with `get_timestampmillis_at_start_of_day(offset)` and a
    one-day event granularity. The row count is 100 for the first profile
    and grows by 100 for each subsequent one; the column count is always 1,
    and every profile carries exactly one field profile for "test_column".

    Note: the field-level counts (`uniqueCount`, `nullCount`) are derived
    from the row counter *after* it has been advanced for the next day,
    i.e. from `rowCount + 100`, not from the profile's own `rowCount`.

    A non-positive `num_days` yields nothing.
    """
    row_step = 100
    column_count = 1

    day_offsets = range(-num_days + 1, 1)
    for index, day_offset in enumerate(day_offsets):
        rows_today = row_step * (index + 1)
        # Value of the running row counter once it has been bumped for the
        # following day; the field statistics below are based on this value.
        rows_after_increment = rows_today + row_step

        start_of_day_millis = get_timestampmillis_at_start_of_day(day_offset)
        daily_profile = DatasetProfileClass(
            timestampMillis=start_of_day_millis,
            eventGranularity=TimeWindowSizeClass(unit="DAY", multiple=1),
        )
        daily_profile.rowCount = rows_today
        daily_profile.columnCount = column_count

        column_profile = DatasetFieldProfileClass(fieldPath="test_column")
        column_profile.uniqueCount = int(rows_after_increment / 2)
        column_profile.uniqueProportion = 0.5
        column_profile.nullCount = int(rows_after_increment / 10)
        column_profile.nullProportion = 0.1
        column_profile.min = "10"
        column_profile.max = "20"
        column_profile.mean = "15"
        column_profile.median = "12"
        column_profile.stdev = "3"

        daily_profile.fieldProfiles = [column_profile]
        yield daily_profile