Mirror of https://github.com/datahub-project/datahub.git (synced 2025-08-16 13:16:52 +00:00)
feat(ingest): Starburst Trino usage (#3558)
parent 658fa81406
commit a36fefaa33
@@ -65,6 +65,7 @@ Sources:
| [sqlalchemy](./source_docs/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source |
| [superset](./source_docs/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source |
| [trino](./source_docs/trino.md) | `pip install 'acryl-datahub[trino]'` | Trino source |
| [starburst-trino-usage](./source_docs/trino.md) | `pip install 'acryl-datahub[starburst-trino-usage]'` | Starburst Trino usage statistics source |

Sinks
@@ -135,6 +135,14 @@ plugins: Dict[str, Set[str]] = {
        # PR is from same author as that of sqlalchemy-trino library below.
        "sqlalchemy-trino"
    },
    "starburst-trino-usage": sql_common
    | {
        # SQLAlchemy support is coming up in the trino python client,
        # subject to PR merging - https://github.com/trinodb/trino-python-client/pull/81.
        # The PR is from the same author as that of the sqlalchemy-trino library below.
        "sqlalchemy-trino"
    },
}

all_exclude_plugins: Set[str] = {
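The new extra reuses `sql_common` and unions in the `sqlalchemy-trino` dialect; each plugin maps to a `Set[str]` of pip requirements, so `|` here is ordinary set union. A minimal sketch of the pattern (the package names are illustrative, not the real contents of `sql_common`):

```
# Illustrative sketch of the extras pattern in setup.py; "|" on Set[str] is
# plain set union, so each plugin extends the shared SQL dependencies.
sql_common = {"sqlalchemy", "greenlet"}
plugins = {
    "trino": sql_common | {"sqlalchemy-trino"},
    "starburst-trino-usage": sql_common | {"sqlalchemy-trino"},
}
assert plugins["starburst-trino-usage"] == {"sqlalchemy", "greenlet", "sqlalchemy-trino"}
```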
@@ -209,7 +217,7 @@ if is_py37_or_newer:
    # The trino plugin only works on Python 3.7 or newer.
    # The trino plugin can be supported on Python 3.6 with minimal changes to the open-source sqlalchemy-trino source code.
    base_dev_requirements = base_dev_requirements.union(
        {dependency for plugin in ["lookml", "trino", "starburst-trino-usage"] for dependency in plugins[plugin]}
    )

dev_requirements = {
@@ -281,6 +289,8 @@ entry_points = {
        "superset = datahub.ingestion.source.superset:SupersetSource",
        "openapi = datahub.ingestion.source.openapi:OpenApiSource",
        "trino = datahub.ingestion.source.sql.trino:TrinoSource",
        "starburst-trino-usage = datahub.ingestion.source.usage.starburst_trino_usage:TrinoUsageSource",
    ],
    "datahub.ingestion.sink.plugins": [
        "file = datahub.ingestion.sink.file:FileSink",
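This entry point is what lets a recipe say `type: starburst-trino-usage` and get routed to `TrinoUsageSource`. A sketch of how a setuptools entry point like this can be resolved at runtime (DataHub's own plugin registry does something equivalent; this is not its actual code):

```
# Sketch: resolving the source class registered under the
# "datahub.ingestion.source.plugins" entry-point group.
import pkg_resources

for ep in pkg_resources.iter_entry_points("datahub.ingestion.source.plugins"):
    if ep.name == "starburst-trino-usage":
        source_cls = ep.load()  # the TrinoUsageSource class
```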
@@ -69,6 +69,61 @@ As a SQL-based service, the Trino integration is also supported by our SQL profiler.

Coming soon!

## Trino Usage Stats

For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md).

### Starburst Trino Usage Stats

If you are using Starburst Trino, you can collect usage stats in the following way.

#### Prerequisites

1. Set up the Event Logger, which saves audit logs into a Postgres database, and register that database as a catalog in Trino. More information on the setup can be found here:
   https://docs.starburst.io/354-e/security/event-logger.html#security-event-logger--page-root
   https://docs.starburst.io/354-e/security/event-logger.html#analyzing-the-event-log

2. Install the starburst-trino-usage plugin: run `pip install 'acryl-datahub[starburst-trino-usage]'`.

#### Usage stats ingestion job

Here is a sample recipe to ingest usage data:

```
source:
  type: starburst-trino-usage
  config:
    # Coordinates
    host_port: yourtrinohost:port
    # The name of the catalog from which to collect usage
    database: hive
    # Credentials
    username: trino_username
    password: trino_password
    email_domain: test.com
    audit_catalog: audit
    audit_schema: audit_schema

sink:
  type: "datahub-rest"
  config:
    server: "http://localhost:8080"
```
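Save the recipe to a file (any name works, e.g. `trino_usage_recipe.yml`) and run it with the DataHub CLI: `datahub ingest -c trino_usage_recipe.yml`.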
### Config details

Note that a `.` is used to denote nested fields in the YAML recipe.

By default, we extract usage stats for the last day, with the recommendation that this source is executed every day.

| Field             | Required | Default                                                        | Description                                                     |
| ----------------- | -------- | -------------------------------------------------------------- | --------------------------------------------------------------- |
| `database`        | yes      |                                                                | The name of the catalog from which to collect usage.            |
| `audit_catalog`   | yes      |                                                                | The catalog in which the audit table can be found.              |
| `audit_schema`    | yes      |                                                                | The schema in which the audit table can be found.               |
| `email_domain`    | yes      |                                                                | The email domain which will be appended to bare usernames.      |
| `env`             |          | `"PROD"`                                                       | Environment to use in namespace when constructing URNs.         |
| `bucket_duration` |          | `"DAY"`                                                        | Duration to bucket usage events by. Can be `"DAY"` or `"HOUR"`. |
| `start_time`      |          | Last full day in UTC (or hour, depending on `bucket_duration`) | Earliest date of usage logs to consider.                        |
| `end_time`        |          | Last full day in UTC (or hour, depending on `bucket_duration`) | Latest date of usage logs to consider.                          |
| `top_n_queries`   |          | `10`                                                           | Number of top queries to save to each table.                    |
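For example, with the default `bucket_duration` of `"DAY"`, a run on 2021-10-15 would, roughly speaking, consider queries created from 2021-10-14 00:00:00 UTC up to, but not including, 2021-10-15 00:00:00 UTC.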
## Questions

If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)!
@@ -418,9 +418,17 @@ class SQLAlchemySource(Source):
            self.report.report_dropped(dataset_name)
            continue

        try:
            columns = inspector.get_columns(table, schema)
            if len(columns) == 0:
                self.report.report_warning(
                    dataset_name, "missing column information"
                )
        except Exception as e:
            self.report.report_warning(
                dataset_name,
                f"unable to get column information due to an error -> {e}",
            )

        try:
            # SQLAlchemy stubs are incomplete and missing this method.
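The net effect of this hunk: an exception raised by `inspector.get_columns` (e.g. a column type the SQLAlchemy dialect cannot reflect) is now reported as a per-dataset warning rather than aborting the entire ingestion run.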
@@ -0,0 +1,253 @@
import collections
import dataclasses
import json
import logging
import sys
from datetime import datetime
from email.utils import parseaddr
from typing import Dict, Iterable, List

from dateutil import parser
from pydantic import Field
from pydantic.main import BaseModel
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine

if sys.version_info >= (3, 7):  # noqa: C901
    import datahub.emitter.mce_builder as builder
    from datahub.configuration.time_window_config import get_time_bucket
    from datahub.ingestion.api.source import Source, SourceReport
    from datahub.ingestion.api.workunit import MetadataWorkUnit
    from datahub.ingestion.source.sql.trino import TrinoConfig
    from datahub.ingestion.source.usage.usage_common import (
        BaseUsageConfig,
        GenericAggregatedDataset,
    )

    logger = logging.getLogger(__name__)

    trino_datetime_format = "%Y-%m-%d %H:%M:%S.%f %Z"

    # Querying the Starburst completed_queries table:
    # https://docs.starburst.io/latest/security/event-logger.html#completed-queries
    trino_usage_sql_comment = """
    SELECT DISTINCT usr,
           query,
           "catalog",
           "schema",
           query_type,
           accessed_metadata,
           create_time,
           end_time
    FROM {audit_catalog}.{audit_schema}.completed_queries
    WHERE 1 = 1
    AND query_type = 'SELECT'
    AND create_time >= timestamp '{start_time}'
    AND end_time < timestamp '{end_time}'
    AND query_state = 'FINISHED'
    ORDER BY end_time desc
    """.strip()

    TrinoTableRef = str
    AggregatedDataset = GenericAggregatedDataset[TrinoTableRef]

    class TrinoConnectorInfo(BaseModel):
        partitionIds: List[str]
        truncated: bool

    class TrinoAccessedMetadata(BaseModel):
        catalog_name: str = Field(None, alias="catalogName")
        schema_name: str = Field(None, alias="schema")  # type: ignore
        table: str = None  # type: ignore
        columns: List[str]
        connector_info: TrinoConnectorInfo = Field(None, alias="connectorInfo")

    class TrinoJoinedAccessEvent(BaseModel):
        usr: str = None  # type: ignore
        query: str = None  # type: ignore
        catalog: str = None  # type: ignore
        schema_name: str = Field(None, alias="schema")
        query_type: str = None  # type: ignore
        table: str = None  # type: ignore
        accessed_metadata: List[TrinoAccessedMetadata]
        starttime: datetime = Field(None, alias="create_time")
        endtime: datetime = Field(None, alias="end_time")

    class TrinoUsageConfig(TrinoConfig, BaseUsageConfig):
        env: str = builder.DEFAULT_ENV
        email_domain: str
        audit_catalog: str
        audit_schema: str
        options: dict = {}

        def get_sql_alchemy_url(self):
            return super().get_sql_alchemy_url()

    @dataclasses.dataclass
    class TrinoUsageSource(Source):
        config: TrinoUsageConfig
        report: SourceReport = dataclasses.field(default_factory=SourceReport)

        @classmethod
        def create(cls, config_dict, ctx):
            config = TrinoUsageConfig.parse_obj(config_dict)
            return cls(ctx, config)

        def get_workunits(self) -> Iterable[MetadataWorkUnit]:
            access_events = self._get_trino_history()
            # If the query result is empty, we don't want to proceed.
            if not access_events:
                return []

            joined_access_event = self._get_joined_access_event(access_events)
            aggregated_info = self._aggregate_access_events(joined_access_event)

            for time_bucket in aggregated_info.values():
                for aggregate in time_bucket.values():
                    wu = self._make_usage_stat(aggregate)
                    self.report.report_workunit(wu)
                    yield wu

        def _make_usage_query(self) -> str:
            return trino_usage_sql_comment.format(
                audit_catalog=self.config.audit_catalog,
                audit_schema=self.config.audit_schema,
                start_time=self.config.start_time.strftime(trino_datetime_format),
                end_time=self.config.end_time.strftime(trino_datetime_format),
            )

        def _make_sql_engine(self) -> Engine:
            url = self.config.get_sql_alchemy_url()
            logger.debug(f"sql_alchemy_url = {url}")
            engine = create_engine(url, **self.config.options)
            return engine

        def _get_trino_history(self):
            query = self._make_usage_query()
            engine = self._make_sql_engine()
            results = engine.execute(query)
            events = []
            for row in results:
                # minor type conversion
                if hasattr(row, "_asdict"):
                    event_dict = row._asdict()
                else:
                    event_dict = dict(row)

                # stripping extra spaces caused by the _asdict() conversion above
                for k, v in event_dict.items():
                    if isinstance(v, str):
                        event_dict[k] = v.strip()

                if event_dict.get("starttime", None):
                    event_dict["starttime"] = event_dict.get("starttime").__str__()
                if event_dict.get("endtime", None):
                    event_dict["endtime"] = event_dict.get("endtime").__str__()

                logger.debug(f"event_dict: {event_dict}")
                events.append(event_dict)

            if events:
                return events

            # The SQL result can be empty; if it is, the SQL connection closes
            # and we don't want to proceed with ingestion.
            logging.info("SQL Result is empty")
            return None

        def _convert_str_to_datetime(self, v):
            if isinstance(v, str):
                isodate = parser.parse(v)  # compatible with Python 3.6+
                return isodate

        def _get_joined_access_event(self, events):
            joined_access_events = []
            for event_dict in events:
                event_dict["create_time"] = self._convert_str_to_datetime(
                    event_dict.get("create_time")
                )
                event_dict["end_time"] = self._convert_str_to_datetime(
                    event_dict.get("end_time")
                )

                if not event_dict["accessed_metadata"]:
                    logging.info("Field accessed_metadata is empty. Skipping ....")
                    continue

                event_dict["accessed_metadata"] = json.loads(
                    event_dict["accessed_metadata"]
                )

                if not event_dict.get("usr"):
                    logging.info("The username parameter is missing. Skipping ....")
                    continue

                joined_access_events.append(TrinoJoinedAccessEvent(**event_dict))
            return joined_access_events

        def _aggregate_access_events(
            self, events: List[TrinoJoinedAccessEvent]
        ) -> Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]]:
            datasets: Dict[
                datetime, Dict[TrinoTableRef, AggregatedDataset]
            ] = collections.defaultdict(dict)

            for event in events:
                floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
                for metadata in event.accessed_metadata:
                    # Skipping queries starting with $system@
                    if metadata.catalog_name.startswith("$system@"):
                        logging.debug(
                            f"Skipping system query for {metadata.catalog_name}..."
                        )
                        continue

                    # Filtering queries down to the selected catalog
                    if metadata.catalog_name != self.config.database:
                        continue

                    resource = f"{metadata.catalog_name}.{metadata.schema_name}.{metadata.table}"

                    agg_bucket = datasets[floored_ts].setdefault(
                        resource,
                        AggregatedDataset(bucket_start_time=floored_ts, resource=resource),
                    )

                    # Append the configured email domain to bare usernames; a current
                    # limitation of the user stats UI is that it needs an email
                    # address to show users.
                    if "@" in parseaddr(event.usr)[1]:
                        username = event.usr
                    else:
                        username = f"{event.usr if event.usr else 'unknown'}@{self.config.email_domain}"

                    agg_bucket.add_read_entry(
                        username,
                        event.query,
                        metadata.columns,
                    )
            return datasets

        def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit:
            return agg.make_usage_workunit(
                self.config.bucket_duration,
                lambda resource: builder.make_dataset_urn(
                    "trino", resource.lower(), self.config.env
                ),
                self.config.top_n_queries,
            )

        def get_report(self) -> SourceReport:
            return self.report

        def close(self) -> None:
            pass

else:
    raise ModuleNotFoundError("The trino usage plugin requires Python 3.7 or newer.")
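Two of the moving parts above are easy to check in isolation. Here is a minimal sketch (not part of the commit) of the username normalization performed in `_aggregate_access_events`, together with the day-flooring that `get_time_bucket` is assumed to perform for `bucket_duration="DAY"`:

```
# Sketch only: normalize_user mirrors the parseaddr-based logic above, and
# floor_to_day is an assumed stand-in for get_time_bucket(ts, "DAY").
from datetime import datetime, timezone
from email.utils import parseaddr


def normalize_user(usr: str, email_domain: str) -> str:
    # Keep anything parseaddr recognizes as an email; otherwise append
    # @<email_domain>, falling back to "unknown" for empty usernames.
    if "@" in parseaddr(usr)[1]:
        return usr
    return f"{usr if usr else 'unknown'}@{email_domain}"


def floor_to_day(ts: datetime) -> datetime:
    return ts.replace(hour=0, minute=0, second=0, microsecond=0)


assert normalize_user("test-name", "acryl.io") == "test-name@acryl.io"
assert normalize_user("test-name@acryl.io", "acryl.io") == "test-name@acryl.io"
assert floor_to_day(
    datetime(2021, 10, 14, 9, 40, 53, tzinfo=timezone.utc)
) == datetime(2021, 10, 14, tzinfo=timezone.utc)
```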
@@ -0,0 +1,97 @@
import json
import pathlib
import sys
from unittest.mock import patch

import pytest
from freezegun import freeze_time

from datahub.ingestion.run.pipeline import Pipeline

if sys.version_info >= (3, 7):  # noqa: C901
    from datahub.ingestion.source.usage.starburst_trino_usage import TrinoUsageConfig
    from tests.test_helpers import mce_helpers

FROZEN_TIME = "2021-08-24 09:00:00"


@pytest.mark.skipif(sys.version_info < (3, 7), reason="trino requires Python 3.7+")
def test_trino_usage_config():
    config = TrinoUsageConfig.parse_obj(
        dict(
            host_port="xxxxx",
            database="testcatalog",
            username="xxxxx",
            password="xxxxx",
            email_domain="xxxxx",
            audit_catalog="xxxxx",
            audit_schema="xxxxx",
            include_views=True,
            include_tables=True,
        )
    )

    assert config.host_port == "xxxxx"
    assert config.database == "testcatalog"
    assert config.username == "xxxxx"
    assert config.email_domain == "xxxxx"
    assert config.audit_catalog == "xxxxx"
    assert config.audit_schema == "xxxxx"
    assert config.include_views
    assert config.include_tables


def yield_function(li):
    for i in li:
        yield i


@freeze_time(FROZEN_TIME)
@pytest.mark.skipif(sys.version_info < (3, 7), reason="trino requires Python 3.7+")
def test_trino_usage_source(pytestconfig, tmp_path):
    test_resources_dir = pathlib.Path(
        pytestconfig.rootpath / "tests/integration/starburst-trino-usage"
    )

    with patch(
        "datahub.ingestion.source.usage.starburst_trino_usage.TrinoUsageSource._get_trino_history"
    ) as mock_event_history:
        access_events = load_access_events(test_resources_dir)
        mock_event_history.return_value = access_events

        # Run ingestion
        pipeline = Pipeline.create(
            {
                "run_id": "test-trino-usage",
                "source": {
                    "type": "starburst-trino-usage",
                    "config": {
                        "host_port": "xxxxx",
                        "database": "testcatalog",
                        "username": "xxxxx",
                        "password": "xxxxx",
                        "audit_catalog": "test",
                        "audit_schema": "test",
                        "email_domain": "acryl.io",
                        "include_views": True,
                        "include_tables": True,
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {"filename": f"{tmp_path}/trino_usages.json"},
                },
            }
        )
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig=pytestconfig,
        output_path=tmp_path / "trino_usages.json",
        golden_path=test_resources_dir / "trino_usages_golden.json",
    )


def load_access_events(test_resources_dir):
    access_events_history_file = test_resources_dir / "usage_events_history.json"
    with access_events_history_file.open() as access_events_json:
        access_events = json.loads(access_events_json.read())
        return access_events
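These tests don't need a live Trino: `_get_trino_history` is patched to return the fixture events below. To run just them locally, something like `pytest -k trino_usage` from the ingestion module's root should work, assuming the dev requirements are installed (which, per the setup.py change above, now pull in the starburst-trino-usage plugin).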
@@ -0,0 +1,15 @@
[
    {
        "auditHeader": null,
        "entityType": "dataset",
        "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,testcatalog.testschema.testtable,PROD)",
        "entityKeyAspect": null,
        "changeType": "UPSERT",
        "aspectName": "datasetUsageStatistics",
        "aspect": {
            "value": "{\"timestampMillis\": 1634169600000, \"eventGranularity\": {\"unit\": \"DAY\", \"multiple\": 1}, \"uniqueUserCount\": 1, \"totalSqlQueries\": 2, \"topSqlQueries\": [\"select * from testcatalog.testschema.testtable limit 100\"], \"userCounts\": [{\"user\": \"urn:li:corpuser:test-name\", \"count\": 2, \"userEmail\": \"test-name@acryl.io\"}], \"fieldCounts\": [{\"fieldPath\": \"column1\", \"count\": 2}, {\"fieldPath\": \"column2\", \"count\": 2}]}",
            "contentType": "application/json"
        },
        "systemMetadata": null
    }
]
@@ -0,0 +1,32 @@
[
    {
        "usr": "test-name",
        "query": "select * from testcatalog.testschema.testtable limit 100",
        "catalog": null,
        "schema": null,
        "query_type": "SELECT",
        "accessed_metadata": "[{\"catalogName\":\"testcatalog\",\"schema\":\"testschema\",\"table\":\"testtable\",\"columns\":[\"column1\",\"column2\"],\"physicalInputBytes\":1673886,\"physicalInputRows\":4754}]",
        "create_time": "2021-10-14 09:40:53.108000 UTC",
        "end_time": "2021-10-14 09:40:55.214000 UTC"
    },
    {
        "usr": "test-name@acryl.io",
        "query": "select * from testcatalog.testschema.testtable limit 100",
        "catalog": null,
        "schema": null,
        "query_type": "SELECT",
        "accessed_metadata": "[{\"catalogName\":\"testcatalog\",\"schema\":\"testschema\",\"table\":\"testtable\",\"columns\":[\"column1\",\"column2\"],\"physicalInputBytes\":1673886,\"physicalInputRows\":4754}]",
        "create_time": "2021-10-14 09:40:53.108000 UTC",
        "end_time": "2021-10-14 09:40:55.214000 UTC"
    },
    {
        "usr": "test-name",
        "query": "select * from testcatalog.testschema.testtable limit 100",
        "catalog": null,
        "schema": null,
        "query_type": "SELECT",
        "accessed_metadata": null,
        "create_time": "2021-10-14 09:40:53.108000 UTC",
        "end_time": "2021-10-14 09:40:55.214000 UTC"
    }
]
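Tracing this fixture through the source explains the golden file above: the third event is dropped because its `accessed_metadata` is null; the remaining two queries land in the 2021-10-14 day bucket (`timestampMillis` 1634169600000 is 2021-10-14 00:00:00 UTC); and since `email_domain` is `acryl.io`, the bare `test-name` user normalizes to `test-name@acryl.io`, matching the second event's user, which yields a `uniqueUserCount` of 1 with a query `count` of 2.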