"""
|
||
|
Generates data for performance testing of warehouse sources.
|
||
|
|
||
|
In the future, we could try to create a more realistic dataset
|
||
|
by anonymizing and reduplicating a production datahub instance's data.
|
||
|
We could also get more human data by using Faker.
|
||
|
|
||
|
This is a work in progress, built piecemeal as needed.
|
||
|
"""
|
||
|
import random
import uuid
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Iterable, List, TypeVar

from tests.performance.data_model import (
    Container,
    FieldAccess,
    Query,
    StatementType,
    Table,
    View,
)

T = TypeVar("T")

OPERATION_TYPES: List[StatementType] = [
    "INSERT",
    "UPDATE",
    "DELETE",
    "CREATE",
    "ALTER",
    "DROP",
    "CUSTOM",
    "UNKNOWN",
]


@dataclass(frozen=True)
class NormalDistribution:
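    """A normal distribution that yields integer samples.

    sample_with_floor clamps the sample to a minimum value, which keeps
    generated counts (columns, tables, query lengths) strictly positive.
    """
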
    mu: float
    sigma: float

    def sample(self) -> int:
        return int(random.gauss(mu=self.mu, sigma=self.sigma))

    def sample_with_floor(self, floor: int = 1) -> int:
        return max(int(random.gauss(mu=self.mu, sigma=self.sigma)), floor)


@dataclass
class SeedMetadata:
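    """The metadata produced by generate_data, plus the time window it covers."""
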
    containers: List[Container]
    tables: List[Table]
    views: List[View]
    start_time: datetime
    end_time: datetime


def generate_data(
    num_containers: int,
    num_tables: int,
    num_views: int,
    columns_per_table: NormalDistribution = NormalDistribution(5, 2),
    parents_per_view: NormalDistribution = NormalDistribution(2, 1),
    view_definition_length: NormalDistribution = NormalDistribution(150, 50),
    time_range: timedelta = timedelta(days=14),
) -> SeedMetadata:
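    """Generate random containers, tables, and views for seeding performance tests.

    Tables and views are assigned to random containers, view parents are
    sampled from the generated tables, and all sizes are drawn from the
    given normal distributions.
    """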
    containers = [Container(f"container-{i}") for i in range(num_containers)]
    tables = [
        Table(
            f"table-{i}",
            container=random.choice(containers),
            columns=[
                f"column-{j}-{uuid.uuid4()}"
                for j in range(columns_per_table.sample_with_floor())
            ],
        )
        for i in range(num_tables)
    ]
    views = [
        View(
            f"view-{i}",
            container=random.choice(containers),
            columns=[
                f"column-{j}-{uuid.uuid4()}"
                for j in range(columns_per_table.sample_with_floor())
            ],
            definition=f"{uuid.uuid4()}-{'*' * view_definition_length.sample_with_floor(10)}",
            parents=random.sample(tables, parents_per_view.sample_with_floor()),
        )
        for i in range(num_views)
    ]

    now = datetime.now(tz=timezone.utc)
    return SeedMetadata(
        containers=containers,
        tables=tables,
        views=views,
        start_time=now - time_range,
        end_time=now,
    )


def generate_queries(
    seed_metadata: SeedMetadata,
    num_selects: int,
    num_operations: int,
    num_users: int,
    tables_per_select: NormalDistribution = NormalDistribution(3, 5),
    columns_per_select: NormalDistribution = NormalDistribution(10, 5),
    upstream_tables_per_operation: NormalDistribution = NormalDistribution(2, 2),
    query_length: NormalDistribution = NormalDistribution(100, 50),
) -> Iterable[Query]:
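    """Lazily generate random queries against the seeded metadata.

    Yields num_selects pure SELECT queries, then num_operations write
    operations, each issued by a random user at a random time within the
    seed metadata's time window.
    """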
    all_tables = seed_metadata.tables + seed_metadata.views
    users = [f"user-{i}@xyz.com" for i in range(num_users)]
    for i in range(num_selects):  # Pure SELECT statements
        tables = _sample_list(all_tables, tables_per_select)
        all_columns = [
            FieldAccess(column, table) for table in tables for column in table.columns
        ]
        yield Query(
            text=f"{uuid.uuid4()}-{'*' * query_length.sample_with_floor(10)}",
            type="SELECT",
            actor=random.choice(users),
            timestamp=_random_time_between(
                seed_metadata.start_time, seed_metadata.end_time
            ),
            fields_accessed=_sample_list(all_columns, columns_per_select),
        )

    for i in range(num_operations):  # Write operations (INSERT, UPDATE, etc.)
        modified_table = random.choice(seed_metadata.tables)
        n_col = len(modified_table.columns)
        num_columns_modified = NormalDistribution(n_col, n_col / 2)
        upstream_tables = _sample_list(all_tables, upstream_tables_per_operation)

        all_columns = [
            FieldAccess(column, table)
            for table in upstream_tables
            for column in table.columns
        ]
        yield Query(
            text=f"{uuid.uuid4()}-{'*' * query_length.sample_with_floor(10)}",
            type=random.choice(OPERATION_TYPES),
            actor=random.choice(users),
            timestamp=_random_time_between(
                seed_metadata.start_time, seed_metadata.end_time
            ),
            fields_accessed=_sample_list(all_columns, num_columns_modified),
            object_modified=modified_table,
        )


def _sample_list(lst: List[T], dist: NormalDistribution) -> List[T]:
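    """Sample dist-many items from lst, capped at the list's length."""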
    return random.sample(lst, min(dist.sample_with_floor(), len(lst)))


def _random_time_between(start: datetime, end: datetime) -> datetime:
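    """Return a uniformly distributed random datetime between start and end."""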
    return start + timedelta(seconds=(end - start).total_seconds() * random.random())
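

# A minimal usage sketch (an illustrative example, not part of the module's
# API): seed a small warehouse and materialize a handful of queries. The
# sizes below are arbitrary example values.
if __name__ == "__main__":
    seed = generate_data(num_containers=2, num_tables=10, num_views=3)
    queries = list(
        generate_queries(seed, num_selects=5, num_operations=5, num_users=2)
    )
    print(
        f"Generated {len(seed.tables)} tables, {len(seed.views)} views, "
        f"and {len(queries)} queries between {seed.start_time} and {seed.end_time}"
    )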