
"""
Generates data for performance testing of warehouse sources.
In the future, we could try to create a more realistic dataset
by anonymizing and reduplicating a production datahub instance's data.
We could also get more human data by using Faker.
This is a work in progress, built piecemeal as needed.
"""
import random
import uuid
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Iterable, List, TypeVar, Union, cast

from faker import Faker

from tests.performance.data_model import (
    Column,
    ColumnMapping,
    ColumnType,
    Container,
    FieldAccess,
    Query,
    StatementType,
    Table,
    View,
)

T = TypeVar("T")

OPERATION_TYPES: List[StatementType] = [
    "INSERT",
    "UPDATE",
    "DELETE",
    "CREATE",
    "ALTER",
    "DROP",
    "CUSTOM",
    "UNKNOWN",
]


@dataclass(frozen=True)
class NormalDistribution:
    mu: float
    sigma: float

    def sample(self) -> int:
        # int() truncates toward zero, so samples can be negative.
        return int(random.gauss(mu=self.mu, sigma=self.sigma))

    def sample_with_floor(self, floor: int = 1) -> int:
        return max(self.sample(), floor)


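# E.g. (illustrative) NormalDistribution(5, 2).sample_with_floor() draws from a
# normal distribution with mu=5 and sigma=2, truncates to an int, and never
# returns less than the default floor of 1.

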
@dataclass
class SeedMetadata:
    # Each list is a layer of containers, e.g. [[databases], [schemas]]
    containers: List[List[Container]]
    tables: List[Table]
    views: List[View]
    start_time: datetime
    end_time: datetime

    @property
    def all_tables(self) -> List[Table]:
        return self.tables + cast(List[Table], self.views)


def generate_data(
    num_containers: Union[List[int], int],
    num_tables: int,
    num_views: int,
    columns_per_table: NormalDistribution = NormalDistribution(5, 2),
    parents_per_view: NormalDistribution = NormalDistribution(2, 1),
    view_definition_length: NormalDistribution = NormalDistribution(150, 50),
    time_range: timedelta = timedelta(days=14),
) -> SeedMetadata:
    # Assemble containers
    if isinstance(num_containers, int):
        num_containers = [num_containers]

    containers: List[List[Container]] = []
    for i, num_in_layer in enumerate(num_containers):
        layer = [
            Container(
                f"{i}-container-{j}",
                parent=random.choice(containers[-1]) if containers else None,
            )
            for j in range(num_in_layer)
        ]
        containers.append(layer)

    # Assemble tables
    tables = [
        Table(
            f"table-{i}",
            container=random.choice(containers[-1]),
            columns=[
                f"column-{j}-{uuid.uuid4()}"
                for j in range(columns_per_table.sample_with_floor())
            ],
            column_mapping=None,
        )
        for i in range(num_tables)
    ]
    views = [
        View(
            f"view-{i}",
            container=random.choice(containers[-1]),
            columns=[
                f"column-{j}-{uuid.uuid4()}"
                for j in range(columns_per_table.sample_with_floor())
            ],
            column_mapping=None,
            definition=f"{uuid.uuid4()}-{'*' * view_definition_length.sample_with_floor(10)}",
            # Clamp the sample size: random.sample raises ValueError if asked
            # for more parents than there are tables.
            parents=random.sample(
                tables, min(parents_per_view.sample_with_floor(), len(tables))
            ),
        )
        for i in range(num_views)
    ]

    for table in tables + views:
        _generate_column_mapping(table)

    now = datetime.now(tz=timezone.utc)
    return SeedMetadata(
        containers=containers,
        tables=tables,
        views=views,
        start_time=now - time_range,
        end_time=now,
    )


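# For example (illustrative), generate_data(num_containers=[2, 4], num_tables=10,
# num_views=2) builds two container layers: 2 top-level "database" containers and
# 4 "schema" containers, each parented to a random database. Every table and view
# is placed in a container from the bottom (schema) layer.

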
def generate_queries(
    seed_metadata: SeedMetadata,
    num_selects: int,
    num_operations: int,
    num_unique_queries: int,
    num_users: int,
    tables_per_select: NormalDistribution = NormalDistribution(3, 5),
    columns_per_select: NormalDistribution = NormalDistribution(10, 5),
    upstream_tables_per_operation: NormalDistribution = NormalDistribution(2, 2),
    query_length: NormalDistribution = NormalDistribution(100, 50),
) -> Iterable[Query]:
    faker = Faker()
    # Pre-generate a fixed pool of query texts so that queries repeat, as they
    # would in a real workload. faker.paragraph() takes a sentence count, so
    # the sampled character length is divided by ~30 characters per sentence.
    query_texts = [
        faker.paragraph(query_length.sample_with_floor(30) // 30)
        for _ in range(num_unique_queries)
    ]

    all_tables = seed_metadata.all_tables
    users = [f"user-{i}@xyz.com" for i in range(num_users)]

    for i in range(num_selects):  # Pure SELECT statements
        tables = _sample_list(all_tables, tables_per_select)
        all_columns = [
            FieldAccess(column, table) for table in tables for column in table.columns
        ]
        yield Query(
            text=random.choice(query_texts),
            type="SELECT",
            actor=random.choice(users),
            timestamp=_random_time_between(
                seed_metadata.start_time, seed_metadata.end_time
            ),
            fields_accessed=_sample_list(all_columns, columns_per_select),
        )

    for i in range(num_operations):
        modified_table = random.choice(seed_metadata.tables)
        n_col = len(modified_table.columns)
        num_columns_modified = NormalDistribution(n_col / 2, n_col / 2)
        upstream_tables = _sample_list(all_tables, upstream_tables_per_operation)
        all_columns = [
            FieldAccess(column, table)
            for table in upstream_tables
            for column in table.columns
        ]
        yield Query(
            text=random.choice(query_texts),
            type=random.choice(OPERATION_TYPES),
            actor=random.choice(users),
            timestamp=_random_time_between(
                seed_metadata.start_time, seed_metadata.end_time
            ),
            # Can have no field accesses, e.g. on a standard INSERT
            fields_accessed=_sample_list(all_columns, num_columns_modified, 0),
            object_modified=modified_table,
        )


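# A sketch of making generation reproducible (assumption: deterministic output
# is desirable for benchmarking; the seed value 42 is arbitrary):
#
#   random.seed(42)
#   Faker.seed(42)
#
# The stdlib random module and Faker keep separate RNG state, so both must be
# seeded to get identical data across runs.

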
def _generate_column_mapping(table: Table) -> ColumnMapping:
    d = {}
    for column in table.columns:
        d[column] = Column(
            name=column,
            type=random.choice(list(ColumnType)),
            nullable=random.random() < 0.1,  # Fixed 10% chance for now
        )
    table.column_mapping = d
    return d


def _sample_list(lst: List[T], dist: NormalDistribution, floor: int = 1) -> List[T]:
    # Sample size is drawn from `dist` but clamped to the list length, since
    # random.sample raises if asked for more items than exist.
    return random.sample(lst, min(dist.sample_with_floor(floor), len(lst)))


def _random_time_between(start: datetime, end: datetime) -> datetime:
    return start + timedelta(seconds=(end - start).total_seconds() * random.random())