mirror of
https://github.com/datahub-project/datahub.git
synced 2025-07-27 03:19:51 +00:00
175 lines
4.5 KiB
Python
175 lines
4.5 KiB
Python
![]() |
#!/usr/bin/env python3
|
||
|
import os
|
||
|
from datetime import datetime
|
||
|
from typing import Optional, Generator, Tuple
|
||
|
|
||
|
# import hashlib
|
||
|
|
||
|
# Durations expressed in milliseconds.
HOUR_IN_MS = 3600000
DAY_IN_MS = 86400000

# Epoch-millisecond timestamp taken five days before the moment this module is
# loaded; populate_index_data() passes it as the start time of the generated
# events.
START_DAY_IN_MS = int(datetime.now().timestamp() * 1000) - 5 * DAY_IN_MS
|
||
|
|
||
|
# A numeric cell of an index row. None marks a field that is left out of the
# emitted document (see MockIndexGenerator._get_index_row_json).
CounterType = Optional[int]

# A string cell of an index row; None has the same "omit this field" meaning.
NameType = Optional[str]

# One index row: ten cells matched position-for-position with
# MockIndexGenerator.INDEX_FIELD_NAMES.
IndexRowType = Tuple[
    NameType,
    CounterType,
    CounterType,
    NameType,
    CounterType,
    CounterType,
    CounterType,
    CounterType,
    CounterType,
    CounterType,
]
|
||
|
|
||
|
|
||
|
def day(n: int) -> int:
    """Return the epoch-millisecond timestamp ``n`` days after START_DAY_IN_MS.

    Args:
        n: Number of whole days to add; may be zero or negative.

    Returns:
        ``START_DAY_IN_MS`` shifted by ``n * DAY_IN_MS`` milliseconds.
    """
    return START_DAY_IN_MS + n * DAY_IN_MS
|
||
|
|
||
|
|
||
|
class MockIndexGenerator:
    """Produce mock documents for the ``mock_dataset_stats_aspect_v1`` index.

    For every record index ``i`` in ``range(num_recs)`` the generator emits one
    table-level document followed by ``num_cols`` column-level documents. All
    values are simple deterministic functions of ``i`` (and the column index),
    so repeated runs with the same arguments yield the same documents.
    """

    INDEX_NAME = "mock_dataset_stats_aspect_v1"

    # Document field names, in the positional order of an index row.
    # NOTE(review): "partitionSpec.parition" and
    # "partitionSpec.timeWindow.granulatiry" look like misspellings of
    # "partition" / "granularity". They are left untouched because they are
    # emitted verbatim in the index mapping -- confirm before renaming.
    INDEX_FIELD_NAMES = [
        "urn",
        "rowCount",
        "columnCount",
        "columnStats.key",
        "columnStats.numNull",
        "eventTimestampMillis",
        "eventGranularity",
        "partitionSpec.parition",
        "partitionSpec.timeWindow.startTimeMillis",
        "partitionSpec.timeWindow.granulatiry",
    ]

    # Mapping type of each entry of INDEX_FIELD_NAMES, position for position.
    INDEX_FIELD_TYPES = [
        "keyword",
        "long",
        "long",
        "keyword",
        "long",
        "date",
        "long",
        "keyword",
        "date",
        "long",
    ]

    def __init__(self, start_days_in_ms: int, num_recs: int, num_cols: int) -> None:
        """Configure the generator.

        Args:
            start_days_in_ms: Epoch-millisecond timestamp of record 0.
            num_recs: Number of table-level records to emit.
            num_cols: Number of column-level records emitted per table record.
        """
        self._start_days_in_ms = start_days_in_ms
        self._num_recs = num_recs
        self._num_cols = num_cols
        # Base values the per-record statistics grow from.
        self._stat_num_rows_start = 10000
        self._stat_num_cols_start = 50
        self._stat_num_nulls = 100

    def _get_num_rows(self, i: int) -> int:
        """Return the row count of record ``i`` (grows by 100 per record)."""
        return self._stat_num_rows_start + (100 * i)

    def _get_num_cols(self, i: int) -> int:
        """Return the column count of record ``i`` (grows by 1 per record)."""
        return self._stat_num_cols_start + i

    def _get_num_nulls(self, i: int, c: int) -> int:
        """Return the null count of column ``c`` in record ``i``."""
        return self._stat_num_nulls + c + (10 * i)

    def _get_event_time_ms(self, i: int) -> int:
        """Return the event timestamp of record ``i``: one hour per record."""
        return self._start_days_in_ms + (i * HOUR_IN_MS)

    @staticmethod
    def _get_index_row_json(row: "IndexRowType") -> str:
        """Render a row as comma-separated ``"field" : "value"`` pairs.

        Cells that are ``None`` are skipped. Every value is written inside
        double quotes, numbers included, and the result carries no enclosing
        braces -- the caller supplies them.

        Args:
            row: Cells matched positionally with ``INDEX_FIELD_NAMES``.

        Returns:
            The JSON object body (without braces) for the non-None cells.
        """
        return ",".join(
            f'"{field}" : "{value}"'
            for field, value in zip(MockIndexGenerator.INDEX_FIELD_NAMES, row)
            if value is not None
        )

    def get_records(self) -> Generator[str, None, None]:
        """Yield the JSON body of every mock document, in emission order.

        Each yielded item is the string produced by ``_get_index_row_json``:
        first the table-level document of record ``i``, then one document per
        column of that record.
        """
        for i in range(self._num_recs):
            # Table and column documents of one record share the same timestamp.
            event_time_ms = self._get_event_time_ms(i)

            # Emit one table record.
            yield self._get_index_row_json(
                (
                    "table_1",
                    self._get_num_rows(i),
                    self._get_num_cols(i),
                    None,
                    None,
                    event_time_ms,
                    HOUR_IN_MS,
                    None,
                    None,
                    None,
                )
            )

            # Emit one record per column.
            for c in range(self._num_cols):
                yield self._get_index_row_json(
                    (
                        "table_1",
                        None,
                        None,
                        f"col_{c}",
                        self._get_num_nulls(i, c),
                        event_time_ms,
                        HOUR_IN_MS,
                        None,
                        None,
                        None,
                    )
                )

    @staticmethod
    def get_props_json() -> str:
        """Return the mapping ``properties`` body for the index.

        Returns:
            Comma-separated ``"field" : { "type" : "..." }`` entries, one per
            field, without enclosing braces.
        """
        return ",".join(
            f'"{field}" : {{ "type" : "{field_type}" }}'
            for field, field_type in zip(
                MockIndexGenerator.INDEX_FIELD_NAMES,
                MockIndexGenerator.INDEX_FIELD_TYPES,
            )
        )
|
||
|
|
||
|
|
||
|
def gen_index_schema() -> None:
    """Create the mock index with an explicit field mapping.

    Builds a ``curl -XPUT`` command against ``http://localhost:9200`` whose
    JSON body declares one mapping property per ``MockIndexGenerator`` field,
    prints the command, and runs it through the shell with ``os.system``.
    The exit status of the command is not inspected.
    """
    properties_json = MockIndexGenerator.get_props_json()
    # The request body is wrapped in single quotes for the shell; line breaks
    # and indentation inside it are insignificant JSON whitespace.
    index_schema_gen_cmd = (
        f"curl -v -XPUT http://localhost:9200/{MockIndexGenerator.INDEX_NAME} -H 'Content-Type: application/json' -d '"
        + """
    {
        "settings":{},
        "mappings":{
            "properties":{ """
        + properties_json
        + """
            }
        }
    }'"""
    )
    print(index_schema_gen_cmd)
    os.system(index_schema_gen_cmd)
|
||
|
|
||
|
|
||
|
def populate_index_data() -> None:
    """Index every generated mock document, one ``curl -XPUT`` per document.

    Generates records for 100 table snapshots with 20 columns each, starting
    at ``START_DAY_IN_MS``. Each command is printed and then run through the
    shell with ``os.system``; the exit status is not inspected.
    """
    records = MockIndexGenerator(START_DAY_IN_MS, 100, 20).get_records()
    # Documents are keyed by their position in the stream (0, 1, 2, ...).
    # Alternative (disabled): hashlib.md5(row.encode("utf-8")).hexdigest()
    for doc_id, row in enumerate(records):
        index_row_gen_command = (
            f"curl -v -XPUT http://localhost:9200/{MockIndexGenerator.INDEX_NAME}/_doc/{doc_id} "
            + "-H 'Content-Type: application/json' -d '{ "
            + row
            + " }'"
        )
        print(index_row_gen_command)
        os.system(index_row_gen_command)
|
||
|
|
||
|
|
||
|
def generate() -> None:
    """Populate the mock index with generated documents.

    The schema-creation step is left commented out, so this function only
    indexes documents and does not set the index mapping explicitly.
    """
    # gen_index_schema()
    populate_index_data()
|
||
|
|
||
|
|
||
|
# Run the generator only when the file is executed as a script.
if __name__ == "__main__":
    generate()
|