"""
Generates data for performance testing of warehouse sources.

In the future, we could try to create a more realistic dataset
by anonymizing and reduplicating a production datahub instance's data.
We could also get more human data by using Faker.

This is a work in progress, built piecemeal as needed.
"""

import random
from abc import ABCMeta, abstractmethod
from collections import OrderedDict
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Collection, Iterable, List, Optional, TypeVar, Union, cast

from faker import Faker

from tests.performance.data_model import (
    Column,
    ColumnType,
    Container,
    FieldAccess,
    Query,
    StatementType,
    Table,
    View,
)

T = TypeVar("T")

OPERATION_TYPES: List[StatementType] = [
    "INSERT",
    "UPDATE",
    "DELETE",
    "CREATE",
    "ALTER",
    "DROP",
    "CUSTOM",
    "UNKNOWN",
]

ID_COLUMN = "id"  # Used to allow joins between all tables


class Distribution(metaclass=ABCMeta):
    @abstractmethod
    def _sample(self) -> int:
        raise NotImplementedError

    def sample(
        self, *, floor: Optional[int] = None, ceiling: Optional[int] = None
    ) -> int:
        value = self._sample()
        if floor is not None:
            value = max(value, floor)
        if ceiling is not None:
            value = min(value, ceiling)
        return value
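

# Example usage (illustrative values, not canonical): NormalDistribution(5, 2)
# .sample(floor=1) draws from a normal distribution with mu=5 and sigma=2,
# truncates the draw to an int, and clamps the result to at least 1.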


@dataclass(frozen=True)
class NormalDistribution(Distribution):
    mu: float
    sigma: float

    def _sample(self) -> int:
        return int(random.gauss(mu=self.mu, sigma=self.sigma))


@dataclass(frozen=True)
class LomaxDistribution(Distribution):
    """See https://en.wikipedia.org/wiki/Lomax_distribution.

    Equivalent to pareto(scale, shape) - scale, i.e. scale * beta_prime(1, shape).
    """

    scale: float
    shape: float

    def _sample(self) -> int:
        return int(self.scale * (random.paretovariate(self.shape) - 1))
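

# Note (derivation added here as a sanity check, not part of the original
# module): the Lomax CDF is 1 - (1 + x/scale)**-shape, so the p-th percentile
# is scale * ((1 - p) ** (-1 / shape) - 1). For scale=3, shape=5 this gives
# roughly 0.96, 2.46, 4.54, and 15.9 at p = 0.75, 0.95, 0.99, and 0.9999,
# which truncate to the integer percentiles quoted on upstream_distribution
# in generate_lineage below.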


@dataclass
class SeedMetadata:
    # Each list is a layer of containers, e.g. [[databases], [schemas]]
    containers: List[List[Container]]

    tables: List[Table]
    views: List[View]
    start_time: datetime
    end_time: datetime

    @property
    def all_tables(self) -> List[Table]:
        return self.tables + cast(List[Table], self.views)


def generate_data(
    num_containers: Union[List[int], int],
    num_tables: int,
    num_views: int,
    columns_per_table: Distribution = NormalDistribution(5, 2),
    parents_per_view: Distribution = NormalDistribution(2, 1),
    view_definition_length: Distribution = NormalDistribution(150, 50),
    time_range: timedelta = timedelta(days=14),
) -> SeedMetadata:
    # Assemble containers
    if isinstance(num_containers, int):
        num_containers = [num_containers]

    containers: List[List[Container]] = []
    for i, num_in_layer in enumerate(num_containers):
        layer = [
            Container(
                f"{_container_type(i)}_{j}",
                parent=random.choice(containers[-1]) if containers else None,
            )
            for j in range(num_in_layer)
        ]
        containers.append(layer)

    # Assemble tables and views, lineage, and definitions
    tables = [
        _generate_table(i, containers[-1], columns_per_table) for i in range(num_tables)
    ]
    views = [
        View(
            **{  # type: ignore
                **_generate_table(i, containers[-1], columns_per_table).__dict__,
                "name": f"view_{i}",
                "definition": f"--{'*' * view_definition_length.sample(floor=0)}",
            },
        )
        for i in range(num_views)
    ]

    for view in views:
        view.upstreams = random.sample(tables, k=parents_per_view.sample(floor=1))

    generate_lineage(tables, views)

    now = datetime.now(tz=timezone.utc)
    return SeedMetadata(
        containers=containers,
        tables=tables,
        views=views,
        start_time=now - time_range,
        end_time=now,
    )
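

# Example (illustrative arguments): generate_data(num_containers=[3, 10],
# num_tables=100, num_views=10) builds 3 databases, then 10 schemas each
# parented to a random database, and assigns every table and view to a
# random container in that last (schema) layer.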


def generate_lineage(
    tables: Collection[Table],
    views: Collection[Table],
    # Percentiles: 75th=0, 80th=1, 95th=2, 99th=4, 99.99th=15
    upstream_distribution: Distribution = LomaxDistribution(scale=3, shape=5),
) -> None:
    num_upstreams = [upstream_distribution.sample(ceiling=100) for _ in tables]
    # Prioritize tables with a lot of upstreams themselves
    factor = 1 + len(tables) // 10
    table_weights = [1 + (num_upstreams[i] * factor) for i in range(len(tables))]
    view_weights = [1] * len(views)

    # TODO: Once Python 3.9+ is required, use random.sample() with its counts parameter
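    # A sketch of that counts-based form (our illustration; equivalent in
    # distribution to the explicit loops below):
    #   population = [*tables, *views]
    #   counts = [*table_weights, *view_weights]
    #   upstreams_i = random.sample(population, counts=counts, k=num_upstreams[i])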
    sample = []
    for table, weight in zip(tables, table_weights):
        for _ in range(weight):
            sample.append(table)
    for view, weight in zip(views, view_weights):
        for _ in range(weight):
            sample.append(view)
    for i, table in enumerate(tables):
        table.upstreams = random.sample(  # type: ignore
            sample,
            k=num_upstreams[i],
        )


def generate_queries(
    seed_metadata: SeedMetadata,
    num_selects: int,
    num_operations: int,
    num_unique_queries: int,
    num_users: int,
    tables_per_select: NormalDistribution = NormalDistribution(3, 5),
    columns_per_select: NormalDistribution = NormalDistribution(10, 5),
    upstream_tables_per_operation: NormalDistribution = NormalDistribution(2, 2),
    query_length: NormalDistribution = NormalDistribution(100, 50),
) -> Iterable[Query]:
    faker = Faker()
    query_texts = [
        faker.paragraph(query_length.sample(floor=30) // 30)
        for _ in range(num_unique_queries)
    ]

    all_tables = seed_metadata.tables + seed_metadata.views
    users = [f"user_{i}@xyz.com" for i in range(num_users)]
    for _ in range(num_selects):  # Pure SELECT statements
        tables = _sample_list(all_tables, tables_per_select)
        all_columns = [
            FieldAccess(column, table) for table in tables for column in table.columns
        ]
        yield Query(
            text=random.choice(query_texts),
            type="SELECT",
            actor=random.choice(users),
            timestamp=_random_time_between(
                seed_metadata.start_time, seed_metadata.end_time
            ),
            fields_accessed=_sample_list(all_columns, columns_per_select),
        )

    for _ in range(num_operations):
        modified_table = random.choice(seed_metadata.tables)
        n_col = len(modified_table.columns)
        num_columns_modified = NormalDistribution(n_col / 2, n_col / 2)
        upstream_tables = _sample_list(all_tables, upstream_tables_per_operation)

        all_columns = [
            FieldAccess(column, table)
            for table in upstream_tables
            for column in table.columns
        ]
        yield Query(
            text=random.choice(query_texts),
            type=random.choice(OPERATION_TYPES),
            actor=random.choice(users),
            timestamp=_random_time_between(
                seed_metadata.start_time, seed_metadata.end_time
            ),
            # Can have no field accesses, e.g. on a standard INSERT
            fields_accessed=_sample_list(all_columns, num_columns_modified, 0),
            object_modified=modified_table,
        )


def _container_type(i: int) -> str:
    if i == 0:
        return "database"
    elif i == 1:
        return "schema"
    else:
        return f"{i}container"


def _generate_table(
    i: int, parents: List[Container], columns_per_table: Distribution
) -> Table:
    num_columns = columns_per_table.sample(floor=1)

    columns = OrderedDict({ID_COLUMN: Column(ID_COLUMN, ColumnType.INTEGER, False)})
    for j in range(num_columns):
        name = f"column_{j}"
        columns[name] = Column(
            name=name,
            type=random.choice(list(ColumnType)),
            nullable=random.random() < 0.1,  # Fixed 10% chance for now
        )
    return Table(
        f"table_{i}",
        container=random.choice(parents),
        columns=columns,
        upstreams=[],
    )
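

# Because every generated table includes the shared ID_COLUMN, a hypothetical
# query like "SELECT ... FROM table_0 JOIN table_1 USING (id)" is always
# well-formed against this schema, which is what the comment on ID_COLUMN
# above is relying on.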


def _sample_list(lst: List[T], dist: NormalDistribution, floor: int = 1) -> List[T]:
    return random.sample(lst, min(dist.sample(floor=floor), len(lst)))


def _random_time_between(start: datetime, end: datetime) -> datetime:
    return start + timedelta(seconds=(end - start).total_seconds() * random.random())


if __name__ == "__main__":
    z = generate_data(10, 1000, 10)
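    # Illustrative extension of this smoke test (argument values are
    # arbitrary): also exercise the query generator on the seeded metadata.
    queries = list(
        generate_queries(
            z,
            num_selects=100,
            num_operations=20,
            num_unique_queries=10,
            num_users=5,
        )
    )
    print(f"Generated {len(z.all_tables)} tables and views, {len(queries)} queries")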