feat(ingest): add classification for sql sources (#10013)

Co-authored-by: Harshal Sheth <hsheth2@gmail.com>
This commit is contained in:
Mayuri Nehate 2024-03-12 21:53:20 +05:30 committed by GitHub
parent 28f16aabb3
commit 2de0e62ac4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 989 additions and 1043 deletions

View File

@ -102,6 +102,10 @@ sqlglot_lib = {
"acryl-sqlglot==22.3.1.dev3",
}
classification_lib = {
"acryl-datahub-classify==0.0.9",
}
sql_common = (
{
# Required for all SQL sources.
@ -121,6 +125,7 @@ sql_common = (
}
| usage_common
| sqlglot_lib
| classification_lib
)
sqllineage_lib = {
@ -190,8 +195,7 @@ snowflake_common = {
"pandas",
"cryptography",
"msal",
"acryl-datahub-classify==0.0.9",
}
} | classification_lib
trino = {
"trino[sqlalchemy]>=0.308",

View File

@ -233,6 +233,10 @@ class ClassificationHandler:
f"Skipping column {dataset_name}.{schema_field.fieldPath} from classification"
)
continue
# TODO: Let's auto-skip passing sample_data for complex(array/struct) columns
# for initial rollout
column_infos.append(
ColumnInfo(
metadata=Metadata(
@ -243,9 +247,11 @@ class ClassificationHandler:
"Dataset_Name": dataset_name,
}
),
values=sample_data[schema_field.fieldPath]
if schema_field.fieldPath in sample_data.keys()
else [],
values=(
sample_data[schema_field.fieldPath]
if schema_field.fieldPath in sample_data.keys()
else []
),
)
)

View File

@ -0,0 +1,136 @@
import logging
from abc import abstractmethod
from collections import defaultdict
from typing import Any, Dict, List, Union
import sqlalchemy as sa
from sqlalchemy.engine import Connection, Engine
from sqlalchemy.engine.reflection import Inspector
from sqlalchemy.engine.row import LegacyRow
from datahub.ingestion.api.closeable import Closeable
logger: logging.Logger = logging.getLogger(__name__)
class DataReader(Closeable):
    """Interface for objects that fetch sample values from a table.

    Concrete readers implement both sampling methods below.
    """

    @abstractmethod
    def get_sample_data_for_column(
        self, table_id: List[str], column_name: str, sample_size: int = 100
    ) -> list:
        """Return sampled values of ``column_name`` from the table ``table_id``.

        Args:
            table_id: Name parts identifying the table, ending with the table name.
            column_name: Column to sample.
            sample_size: Requested number of values (defaults to 100).
        """

    @abstractmethod
    def get_sample_data_for_table(
        self, table_id: List[str], sample_size: int = 100
    ) -> Dict[str, list]:
        """Return sampled values for the table ``table_id``, keyed by column name.

        Args:
            table_id: Name parts identifying the table, ending with the table name.
            sample_size: Requested number of rows (defaults to 100).
        """
class SqlAlchemyTableDataReader(DataReader):
    """DataReader that samples rows with plain SQLAlchemy ``SELECT`` queries.

    Sampling is limit-based (first N rows), not random.
    """

    @staticmethod
    def create(inspector: Inspector) -> "SqlAlchemyTableDataReader":
        """Build a reader from whatever the given inspector is bound to."""
        return SqlAlchemyTableDataReader(conn=inspector.bind)

    def __init__(
        self,
        conn: Union[Engine, Connection],
    ) -> None:
        # TODO: How can this use a connection pool instead ?
        # NOTE: despite the attribute name, this holds the object returned by
        # `Engine.connect()` (a Connection); it is released in `close()`.
        self.engine = conn.engine.connect()

    def _table(self, table_id: List[str]) -> sa.Table:
        """Map ``table_id`` to a lightweight ``sa.Table``.

        The last element is used as the table name and the second-to-last
        (when present) as the schema; any earlier element is not used.
        """
        return sa.Table(
            table_id[-1],
            sa.MetaData(),
            schema=table_id[-2] if len(table_id) > 1 else None,
        )

    def _oracle_rownum_query(
        self, raw_query: Any, sample_size: int, has_where: bool
    ) -> str:
        """Render ``raw_query`` to SQL text and cap its row count via ROWNUM.

        Args:
            raw_query: Un-limited SQLAlchemy select to render.
            sample_size: Maximum number of rows to return.
            has_where: Whether ``raw_query`` already ends in a WHERE clause.
        """
        sql = str(
            raw_query.compile(self.engine, compile_kwargs={"literal_binds": True})
        )
        # The ROWNUM predicate extends an existing WHERE clause with AND;
        # without one it has to open the WHERE clause itself, otherwise the
        # statement ("SELECT * FROM t AND ROWNUM <= n") is not valid SQL.
        keyword = "AND" if has_where else "WHERE"
        return sql + "\n%s ROWNUM <= %d" % (keyword, sample_size)

    def get_sample_data_for_column(
        self, table_id: List[str], column_name: str, sample_size: int = 100
    ) -> list:
        """
        Fetches non-null column values, up to <sample_size> count
        Args:
            table_id: Table name identifier. One of
                        - [<db_name>, <schema_name>, <table_name>] or
                        - [<schema_name>, <table_name>] or
                        - [<table_name>]
            column_name: Column name
            sample_size: Maximum number of values to fetch
        Returns:
            list of column values
        """

        table = self._table(table_id)

        query: Any
        ignore_null_condition = sa.column(column_name).is_(None)
        raw_query = (
            sa.select([sa.column(column_name)])
            .select_from(table)
            .where(sa.not_(ignore_null_condition))
        )
        # limit doesn't compile properly for oracle so we append rownum to the query string
        if self.engine.dialect.name.lower() == "oracle":
            query = self._oracle_rownum_query(raw_query, sample_size, has_where=True)
        else:
            query = raw_query.limit(sample_size)
        query_results = self.engine.execute(query)

        return [x[column_name] for x in query_results.fetchall()]

    def get_sample_data_for_table(
        self, table_id: List[str], sample_size: int = 100
    ) -> Dict[str, list]:
        """
        Fetches table values, up to <sample_size>*1.2 count
        Args:
            table_id: Table name identifier. One of
                        - [<db_name>, <schema_name>, <table_name>] or
                        - [<schema_name>, <table_name>] or
                        - [<table_name>]
            sample_size: Configured sample size; 20% more rows are requested
        Returns:
            dictionary of (column name -> list of column values)
        """
        column_values: Dict[str, list] = defaultdict(list)
        table = self._table(table_id)

        # Ideally we do not want null values in sample data for a column.
        # However that would require separate query per column and
        # that would be expensive. To compensate for possibility
        # of some null values in collected sample, we fetch extra (20% more)
        # rows than configured sample_size.
        sample_size = int(sample_size * 1.2)

        query: Any
        raw_query = sa.select([sa.text("*")]).select_from(table)
        # limit doesn't compile properly for oracle so we append rownum to the query string
        if self.engine.dialect.name.lower() == "oracle":
            # This query has no WHERE clause yet, so ROWNUM must start one.
            query = self._oracle_rownum_query(raw_query, sample_size, has_where=False)
        else:
            query = raw_query.limit(sample_size)
        query_results = self.engine.execute(query)

        # Not ideal - creates a parallel structure in column_values. Can we use pandas here ?
        for row in query_results.fetchall():
            # Only LegacyRow results are unpacked; rows of any other type are skipped.
            if isinstance(row, LegacyRow):
                for col, col_value in row.items():
                    column_values[col].append(col_value)

        return column_values

    def close(self) -> None:
        """Close the connection opened in ``__init__``."""
        self.engine.close()

View File

@ -1,3 +1,4 @@
import contextlib
import datetime
import logging
import traceback
@ -43,10 +44,18 @@ from datahub.ingestion.api.source import (
TestConnectionReport,
)
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.glossary.classification_mixin import (
ClassificationHandler,
ClassificationReportMixin,
)
from datahub.ingestion.source.common.subtypes import (
DatasetContainerSubTypes,
DatasetSubTypes,
)
from datahub.ingestion.source.sql.data_reader import (
DataReader,
SqlAlchemyTableDataReader,
)
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
from datahub.ingestion.source.sql.sql_utils import (
add_table_to_schema_container,
@ -120,7 +129,7 @@ MISSING_COLUMN_INFO = "missing column information"
@dataclass
class SQLSourceReport(StaleEntityRemovalSourceReport):
class SQLSourceReport(StaleEntityRemovalSourceReport, ClassificationReportMixin):
tables_scanned: int = 0
views_scanned: int = 0
entities_profiled: int = 0
@ -314,6 +323,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
self.report: SQLSourceReport = SQLSourceReport()
self.profile_metadata_info: ProfileMetadata = ProfileMetadata()
self.classification_handler = ClassificationHandler(self.config, self.report)
config_report = {
config_option: config.dict().get(config_option)
for config_option in config_options_to_report
@ -643,6 +653,20 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
fk_dict["name"], foreign_fields, source_fields, foreign_dataset
)
def make_data_reader(self, inspector: Inspector) -> Optional[DataReader]:
    """
    Create the reader used to sample table data for classification.

    Returns None when there is no classification handler or classification
    is not enabled. Subclasses can override this with a source-specific
    data reader if the source provides a clause to pick a random sample
    instead of the current limit-based sample.
    """
    handler = self.classification_handler
    # Guard clause: nothing to read when classification is off.
    if not handler or not handler.is_classification_enabled():
        return None

    return SqlAlchemyTableDataReader.create(inspector)
def loop_tables( # noqa: C901
self,
inspector: Inspector,
@ -650,31 +674,40 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
sql_config: SQLCommonConfig,
) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
tables_seen: Set[str] = set()
try:
for table in inspector.get_table_names(schema):
dataset_name = self.get_identifier(
schema=schema, entity=table, inspector=inspector
)
if dataset_name not in tables_seen:
tables_seen.add(dataset_name)
else:
logger.debug(f"{dataset_name} has already been seen, skipping...")
continue
self.report.report_entity_scanned(dataset_name, ent_type="table")
if not sql_config.table_pattern.allowed(dataset_name):
self.report.report_dropped(dataset_name)
continue
try:
yield from self._process_table(
dataset_name, inspector, schema, table, sql_config
data_reader = self.make_data_reader(inspector)
with (data_reader or contextlib.nullcontext()):
try:
for table in inspector.get_table_names(schema):
dataset_name = self.get_identifier(
schema=schema, entity=table, inspector=inspector
)
except Exception as e:
self.warn(logger, f"{schema}.{table}", f"Ingestion error: {e}")
except Exception as e:
self.error(logger, f"{schema}", f"Tables error: {e}")
if dataset_name not in tables_seen:
tables_seen.add(dataset_name)
else:
logger.debug(
f"{dataset_name} has already been seen, skipping..."
)
continue
self.report.report_entity_scanned(dataset_name, ent_type="table")
if not sql_config.table_pattern.allowed(dataset_name):
self.report.report_dropped(dataset_name)
continue
try:
yield from self._process_table(
dataset_name,
inspector,
schema,
table,
sql_config,
data_reader,
)
except Exception as e:
self.warn(logger, f"{schema}.{table}", f"Ingestion error: {e}")
except Exception as e:
self.error(logger, f"{schema}", f"Tables error: {e}")
def add_information_for_schema(self, inspector: Inspector, schema: str) -> None:
pass
@ -691,6 +724,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
schema: str,
table: str,
sql_config: SQLCommonConfig,
data_reader: Optional[DataReader],
) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
columns = self._get_columns(dataset_name, inspector, schema, table)
dataset_urn = make_dataset_urn_with_platform_instance(
@ -740,6 +774,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
foreign_keys,
schema_fields,
)
self._classify(dataset_name, schema, table, data_reader, schema_metadata)
dataset_snapshot.aspects.append(schema_metadata)
if self.config.include_view_lineage:
self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
@ -770,6 +806,39 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
domain_registry=self.domain_registry,
)
def _classify(
    self,
    dataset_name: str,
    schema: str,
    table: str,
    data_reader: Optional[DataReader],
    schema_metadata: SchemaMetadata,
) -> None:
    """
    Run classification over the table's schema fields using sampled data.

    Does nothing unless classification is enabled for `dataset_name` and a
    data reader is available. Any failure is logged at debug level and
    recorded as a warning on the report instead of being raised.
    """
    try:
        handler = self.classification_handler
        enabled = handler.is_classification_enabled_for_table(dataset_name)
        if not enabled or not data_reader:
            return

        # Sample [schema, table] once and hand the values to the classifier.
        sample_data = data_reader.get_sample_data_for_table(
            table_id=[schema, table],
            sample_size=self.config.classification.sample_size,
        )
        handler.classify_schema_fields(dataset_name, schema_metadata, sample_data)
    except Exception as e:
        logger.debug(
            f"Failed to classify table columns for {dataset_name} due to error -> {e}",
            exc_info=e,
        )
        self.report.report_warning(
            "Failed to classify table columns",
            dataset_name,
        )
def get_database_properties(
self, inspector: Inspector, database: str
) -> Optional[Dict[str, str]]:

View File

@ -12,6 +12,9 @@ from datahub.configuration.source_common import (
LowerCaseDatasetUrnConfigMixin,
)
from datahub.configuration.validate_field_removal import pydantic_removed_field
from datahub.ingestion.glossary.classification_mixin import (
ClassificationSourceConfigMixin,
)
from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
from datahub.ingestion.source.state.stale_entity_removal_handler import (
StatefulStaleMetadataRemovalConfig,
@ -29,6 +32,7 @@ class SQLCommonConfig(
DatasetSourceConfigMixin,
LowerCaseDatasetUrnConfigMixin,
LineageConfig,
ClassificationSourceConfigMixin,
):
options: dict = pydantic.Field(
default_factory=dict,

View File

@ -35,6 +35,7 @@ from datahub.ingestion.api.decorators import (
)
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.extractor import schema_util
from datahub.ingestion.source.sql.data_reader import DataReader
from datahub.ingestion.source.sql.sql_common import (
SQLAlchemySource,
SqlWorkUnit,
@ -334,9 +335,10 @@ class TrinoSource(SQLAlchemySource):
schema: str,
table: str,
sql_config: SQLCommonConfig,
data_reader: Optional[DataReader],
) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
yield from super()._process_table(
dataset_name, inspector, schema, table, sql_config
dataset_name, inspector, schema, table, sql_config, data_reader
)
if self.config.ingest_lineage_to_connectors:
dataset_urn = make_dataset_urn_with_platform_instance(

View File

@ -24,6 +24,7 @@ from datahub.ingestion.api.decorators import (
support_status,
)
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.sql.data_reader import DataReader
from datahub.ingestion.source.sql.sql_common import (
SQLAlchemySource,
SQLSourceReport,
@ -221,6 +222,7 @@ class VerticaSource(SQLAlchemySource):
schema: str,
table: str,
sql_config: SQLCommonConfig,
data_reader: Optional[DataReader],
) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
dataset_urn = make_dataset_urn_with_platform_instance(
self.platform,
@ -235,7 +237,7 @@ class VerticaSource(SQLAlchemySource):
owner_urn=f"urn:li:corpuser:{table_owner}",
)
yield from super()._process_table(
dataset_name, inspector, schema, table, sql_config
dataset_name, inspector, schema, table, sql_config, data_reader
)
def loop_views(

View File

@ -197,6 +197,17 @@
},
"nativeDataType": "ENUM('M', 'F')",
"recursive": false,
"glossaryTerms": {
"terms": [
{
"urn": "urn:li:glossaryTerm:Gender"
}
],
"auditStamp": {
"time": 1586847600000,
"actor": "urn:li:corpuser:datahub"
}
},
"isPartOfKey": false
},
{
@ -1897,6 +1908,17 @@
},
"nativeDataType": "VARCHAR(length=50)",
"recursive": false,
"glossaryTerms": {
"terms": [
{
"urn": "urn:li:glossaryTerm:Email_Address"
}
],
"auditStamp": {
"time": 1586847600000,
"actor": "urn:li:corpuser:datahub"
}
},
"isPartOfKey": false
},
{
@ -2192,10 +2214,17 @@
},
{
"fieldPath": "email_address",
"uniqueCount": 0,
"nullCount": 5,
"nullProportion": 1,
"sampleValues": []
"uniqueCount": 5,
"uniqueProportion": 1,
"nullCount": 0,
"nullProportion": 0.0,
"sampleValues": [
"Bedecs@xyz.com",
"Gratacos@xyz.com",
"Axen@xyz.com",
"Lee@xyz.com",
"Donnell@xyz.com"
]
},
{
"fieldPath": "priority",
@ -2728,258 +2757,5 @@
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:0f72a1bc79da282eb614cc089c0ba302",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": []
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,dataCharmer.employees,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:0f72a1bc79da282eb614cc089c0ba302",
"urn": "urn:li:container:0f72a1bc79da282eb614cc089c0ba302"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,dataCharmer.salaries,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:0f72a1bc79da282eb614cc089c0ba302",
"urn": "urn:li:container:0f72a1bc79da282eb614cc089c0ba302"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:17751259af32dd0385cad799df608c40",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": []
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_aspect,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:17751259af32dd0385cad799df608c40",
"urn": "urn:li:container:17751259af32dd0385cad799df608c40"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:17751259af32dd0385cad799df608c40",
"urn": "urn:li:container:17751259af32dd0385cad799df608c40"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:17751259af32dd0385cad799df608c40",
"urn": "urn:li:container:17751259af32dd0385cad799df608c40"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": []
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:28176129fe1c0e526e1803250ec124ef",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": []
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,test_cases.myset,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:28176129fe1c0e526e1803250ec124ef",
"urn": "urn:li:container:28176129fe1c0e526e1803250ec124ef"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,test_cases.test_empty,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:28176129fe1c0e526e1803250ec124ef",
"urn": "urn:li:container:28176129fe1c0e526e1803250ec124ef"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
}
]

View File

@ -546,10 +546,17 @@
},
{
"fieldPath": "email_address",
"uniqueCount": 0,
"nullCount": 5,
"nullProportion": 1,
"sampleValues": []
"uniqueCount": 5,
"uniqueProportion": 1,
"nullCount": 0,
"nullProportion": 0.0,
"sampleValues": [
"Bedecs@xyz.com",
"Gratacos@xyz.com",
"Axen@xyz.com",
"Lee@xyz.com",
"Donnell@xyz.com"
]
},
{
"fieldPath": "priority",
@ -632,63 +639,5 @@
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": []
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-test",
"lastRunId": "no-run-id-provided"
}
}
]

View File

@ -455,7 +455,8 @@
},
{
"fieldPath": "email_address",
"uniqueCount": 0,
"uniqueCount": 5,
"uniqueProportion": 1,
"nullCount": 0
},
{
@ -513,63 +514,5 @@
"runId": "mysql-2020_04_14-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": []
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00",
"lastRunId": "no-run-id-provided"
}
}
]

View File

@ -1,40 +0,0 @@
run_id: mysql-test
source:
type: mysql
config:
username: root
password: example
database: metagalaxy
host_port: localhost:53307
schema_pattern:
allow:
- "^metagalaxy"
- "^northwind"
- "^datacharmer"
- "^test_cases"
profile_pattern:
allow:
- "^northwind"
- "^datacharmer"
- "^test_cases"
profiling:
enabled: True
include_field_null_count: true
include_field_min_value: true
include_field_max_value: true
include_field_mean_value: true
include_field_median_value: true
include_field_stddev_value: true
include_field_quantiles: true
include_field_distinct_value_frequencies: true
include_field_histogram: true
include_field_sample_values: true
domain:
"urn:li:domain:sales":
allow:
- "^metagalaxy"
sink:
type: file
config:
filename: "./mysql_mces_dbalias.json"

View File

@ -34,6 +34,19 @@ source:
"urn:li:domain:sales":
allow:
- "^metagalaxy"
classification:
enabled: True
classifiers:
- type: datahub
config:
minimum_values_threshold: 1
info_types_config:
Full_Name:
prediction_factors_and_weights:
name: 0.5
description: 0
datatype: 0
values: 0.5
sink:
type: file
config:

View File

@ -249,11 +249,11 @@ USE `northwind`;
# Dumping data for table 'customers'
#
INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (1, 'Company A', 'Bedecs', 'Anna', NULL, 4);
INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (2, 'Company B', 'Gratacos Solsona', 'Antonio', NULL, 4.9);
INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (3, 'Company C', 'Axen', 'Thomas', NULL, 4);
INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (4, 'Company D', 'Lee', 'Christina', NULL, 3.8);
INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (5, 'Company E', 'Donnell', 'Martin', NULL, NULL);
INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (1, 'Company A', 'Bedecs', 'Anna', 'Bedecs@xyz.com', 4);
INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (2, 'Company B', 'Gratacos Solsona', 'Antonio', 'Gratacos@xyz.com', 4.9);
INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (3, 'Company C', 'Axen', 'Thomas', 'Axen@xyz.com', 4);
INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (4, 'Company D', 'Lee', 'Christina', 'Lee@xyz.com', 3.8);
INSERT INTO `customers` (`id`, `company`, `last_name`, `first_name`, `email_address`, `priority`) VALUES (5, 'Company E', 'Donnell', 'Martin', 'Donnell@xyz.com', NULL);
# 5 records
-- -----------------------------------------------------

View File

@ -258,6 +258,17 @@
},
"nativeDataType": "VARCHAR(length=500)",
"recursive": false,
"glossaryTerms": {
"terms": [
{
"urn": "urn:li:glossaryTerm:URN"
}
],
"auditStamp": {
"time": 1646575200000,
"actor": "urn:li:corpuser:datahub"
}
},
"isPartOfKey": true
},
{
@ -330,6 +341,17 @@
},
"nativeDataType": "VARCHAR(length=255)",
"recursive": false,
"glossaryTerms": {
"terms": [
{
"urn": "urn:li:glossaryTerm:URN"
}
],
"auditStamp": {
"time": 1646575200000,
"actor": "urn:li:corpuser:datahub"
}
},
"isPartOfKey": false
},
{

View File

@ -14,6 +14,24 @@ source:
turn_off_expensive_profiling_metrics: true
catch_exceptions: true
include_views: true
classification:
enabled: True
classifiers:
- type: datahub
config:
minimum_values_threshold: 1
info_types_config:
URN:
prediction_factors_and_weights:
name: 0
description: 0
datatype: 0
values: 1
values:
prediction_type: regex
regex:
- "^urn:li:.*:.*"
library: []
sink:
type: file
config:

View File

@ -56,4 +56,13 @@ CREATE TABLE db1.union_test(
foo UNIONTYPE<int, double, array<string>, struct<a:int,b:string>>
) STORED AS ORC ;
CREATE TABLE db1.map_test(KeyValue String, RecordId map<int,string>);
CREATE TABLE db1.map_test(KeyValue String, RecordId map<int,string>);
CREATE TABLE db1.classification_test(id STRING, name STRING, email STRING, gender STRING, age INT);
INSERT INTO
db1.classification_test
VALUES
("1", "Foo Bar", "foo@bar.com", "M", 21),
("2", "John Doe", "john.doe@example.com", "M", 30),
("3", "Jane Doe", "jane.doe@abc.com", "F", 27);

View File

@ -5,6 +5,11 @@ import requests
from freezegun import freeze_time
from datahub.configuration.common import AllowDenyPattern
from datahub.ingestion.glossary.classifier import (
ClassificationConfig,
DynamicTypedClassifierConfig,
)
from datahub.ingestion.glossary.datahub_classifier import DataHubClassifierConfig
from datahub.ingestion.run.pipeline import Pipeline
from datahub.ingestion.sink.file import FileSinkConfig
from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
@ -87,6 +92,18 @@ def test_trino_ingest(
include_field_histogram=True,
include_field_sample_values=True,
),
classification=ClassificationConfig(
enabled=True,
classifiers=[
DynamicTypedClassifierConfig(
type="datahub",
config=DataHubClassifierConfig(
minimum_values_threshold=1,
),
)
],
max_workers=1,
),
catalog_to_connector_details={
"postgresqldb": ConnectorDetail(
connector_database="postgres",
@ -131,6 +148,18 @@ def test_trino_hive_ingest(
database="hivedb",
username="foo",
schema_pattern=AllowDenyPattern(allow=["^db1"]),
classification=ClassificationConfig(
enabled=True,
classifiers=[
DynamicTypedClassifierConfig(
type="datahub",
config=DataHubClassifierConfig(
minimum_values_threshold=1,
),
)
],
max_workers=1,
),
).dict(),
},
"sink": {

View File

@ -244,7 +244,7 @@
"numrows": "1",
"rawdatasize": "32",
"totalsize": "33",
"transient_lastddltime": "1708925463"
"transient_lastddltime": "1710150034"
},
"name": "array_struct_test",
"description": "This table has array of structs",
@ -471,6 +471,265 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
"container": "urn:li:container:46baa6eebd802861e5ee3d043456e171"
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"column_stats_accurate": "{\"BASIC_STATS\":\"true\"}",
"numfiles": "1",
"numrows": "3",
"rawdatasize": "94",
"totalsize": "97",
"transient_lastddltime": "1710150038"
},
"name": "classification_test",
"tags": []
}
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "hivedb.db1.classification_test",
"platform": "urn:li:dataPlatform:trino",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "id",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR()",
"recursive": false,
"isPartOfKey": false
},
{
"fieldPath": "name",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR()",
"recursive": false,
"isPartOfKey": false
},
{
"fieldPath": "email",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR()",
"recursive": false,
"isPartOfKey": false
},
{
"fieldPath": "gender",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR()",
"recursive": false,
"isPartOfKey": false
},
{
"fieldPath": "age",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"isPartOfKey": false
}
]
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
"json": {
"platform": "urn:li:dataPlatform:trino",
"instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)"
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"Table"
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "siblings",
"aspect": {
"json": {
"siblings": [
"urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.classification_test,PROD)"
],
"primary": true
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)"
},
{
"id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b",
"urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b"
},
{
"id": "urn:li:container:46baa6eebd802861e5ee3d043456e171",
"urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "siblings",
"aspect": {
"json": {
"siblings": [
"urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)"
],
"primary": false
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "upstreamLineage",
"aspect": {
"json": {
"upstreams": [
{
"auditStamp": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.classification_test,PROD)",
"type": "VIEW"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.map_test,PROD)",
@ -505,7 +764,7 @@
"numrows": "0",
"rawdatasize": "0",
"totalsize": "0",
"transient_lastddltime": "1708925466"
"transient_lastddltime": "1710150036"
},
"name": "map_test",
"tags": []
@ -732,7 +991,7 @@
"numrows": "0",
"rawdatasize": "0",
"totalsize": "0",
"transient_lastddltime": "1708925466"
"transient_lastddltime": "1710150036"
},
"name": "nested_struct_test",
"tags": []
@ -1003,7 +1262,7 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"transient_lastddltime": "1708925457"
"transient_lastddltime": "1710150028"
},
"name": "pokes",
"tags": []
@ -1238,7 +1497,7 @@
"numrows": "0",
"rawdatasize": "0",
"totalsize": "0",
"transient_lastddltime": "1708925459"
"transient_lastddltime": "1710150031"
},
"name": "struct_test",
"tags": []
@ -1489,7 +1748,7 @@
"customProperties": {
"numfiles": "0",
"totalsize": "0",
"transient_lastddltime": "1708925466"
"transient_lastddltime": "1710150036"
},
"name": "struct_test_view_materialized",
"tags": []
@ -1743,7 +2002,7 @@
"numrows": "0",
"rawdatasize": "0",
"totalsize": "0",
"transient_lastddltime": "1708925459"
"transient_lastddltime": "1710150031"
},
"name": "_test_table_underscore",
"tags": []
@ -1966,7 +2225,7 @@
"numrows": "0",
"rawdatasize": "0",
"totalsize": "0",
"transient_lastddltime": "1708925466"
"transient_lastddltime": "1710150036"
},
"name": "union_test",
"tags": []
@ -2268,7 +2527,7 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"transient_lastddltime": "1708925466",
"transient_lastddltime": "1710150036",
"view_definition": "SELECT \"property_id\", \"service\"\nFROM \"db1\".\"array_struct_test\"",
"is_view": "True"
},
@ -2584,6 +2843,22 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1.map_test,PROD)",
@ -2679,312 +2954,5 @@
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:46baa6eebd802861e5ee3d043456e171",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)"
},
{
"id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b",
"urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.array_struct_test,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)"
},
{
"id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b",
"urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b"
},
{
"id": "urn:li:container:46baa6eebd802861e5ee3d043456e171",
"urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.map_test,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)"
},
{
"id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b",
"urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b"
},
{
"id": "urn:li:container:46baa6eebd802861e5ee3d043456e171",
"urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.nested_struct_test,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)"
},
{
"id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b",
"urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b"
},
{
"id": "urn:li:container:46baa6eebd802861e5ee3d043456e171",
"urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.pokes,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)"
},
{
"id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b",
"urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b"
},
{
"id": "urn:li:container:46baa6eebd802861e5ee3d043456e171",
"urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.struct_test,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)"
},
{
"id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b",
"urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b"
},
{
"id": "urn:li:container:46baa6eebd802861e5ee3d043456e171",
"urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.struct_test_view_materialized,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)"
},
{
"id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b",
"urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b"
},
{
"id": "urn:li:container:46baa6eebd802861e5ee3d043456e171",
"urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1._test_table_underscore,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)"
},
{
"id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b",
"urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b"
},
{
"id": "urn:li:container:46baa6eebd802861e5ee3d043456e171",
"urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.union_test,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)"
},
{
"id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b",
"urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b"
},
{
"id": "urn:li:container:46baa6eebd802861e5ee3d043456e171",
"urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.array_struct_test_view,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:trino,production_warehouse)"
},
{
"id": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b",
"urn": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b"
},
{
"id": "urn:li:container:46baa6eebd802861e5ee3d043456e171",
"urn": "urn:li:container:46baa6eebd802861e5ee3d043456e171"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-instance-test",
"lastRunId": "no-run-id-provided"
}
}
]

View File

@ -231,7 +231,7 @@
"numrows": "1",
"rawdatasize": "32",
"totalsize": "33",
"transient_lastddltime": "1708925463"
"transient_lastddltime": "1710149909"
},
"name": "array_struct_test",
"description": "This table has array of structs",
@ -437,6 +437,288 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
"container": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84"
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.classification_test,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"column_stats_accurate": "{\"BASIC_STATS\":\"true\"}",
"numfiles": "1",
"numrows": "3",
"rawdatasize": "94",
"totalsize": "97",
"transient_lastddltime": "1710149912"
},
"name": "classification_test",
"tags": []
}
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "hivedb.db1.classification_test",
"platform": "urn:li:dataPlatform:trino",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "id",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR()",
"recursive": false,
"isPartOfKey": false
},
{
"fieldPath": "name",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR()",
"recursive": false,
"glossaryTerms": {
"terms": [
{
"urn": "urn:li:glossaryTerm:Full_Name"
}
],
"auditStamp": {
"time": 1632398400000,
"actor": "urn:li:corpuser:datahub"
}
},
"isPartOfKey": false
},
{
"fieldPath": "email",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR()",
"recursive": false,
"glossaryTerms": {
"terms": [
{
"urn": "urn:li:glossaryTerm:Email_Address"
}
],
"auditStamp": {
"time": 1632398400000,
"actor": "urn:li:corpuser:datahub"
}
},
"isPartOfKey": false
},
{
"fieldPath": "gender",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR()",
"recursive": false,
"glossaryTerms": {
"terms": [
{
"urn": "urn:li:glossaryTerm:Gender"
}
],
"auditStamp": {
"time": 1632398400000,
"actor": "urn:li:corpuser:datahub"
}
},
"isPartOfKey": false
},
{
"fieldPath": "age",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"glossaryTerms": {
"terms": [
{
"urn": "urn:li:glossaryTerm:Age"
}
],
"auditStamp": {
"time": 1632398400000,
"actor": "urn:li:corpuser:datahub"
}
},
"isPartOfKey": false
}
]
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"Table"
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "siblings",
"aspect": {
"json": {
"siblings": [
"urn:li:dataset:(urn:li:dataPlatform:hive,db1.classification_test,PROD)"
],
"primary": true
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7",
"urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7"
},
{
"id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84",
"urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "siblings",
"aspect": {
"json": {
"siblings": [
"urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.classification_test,PROD)"
],
"primary": false
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "upstreamLineage",
"aspect": {
"json": {
"upstreams": [
{
"auditStamp": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.classification_test,PROD)",
"type": "VIEW"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.map_test,PROD)",
@ -471,7 +753,7 @@
"numrows": "0",
"rawdatasize": "0",
"totalsize": "0",
"transient_lastddltime": "1708925466"
"transient_lastddltime": "1710149911"
},
"name": "map_test",
"tags": []
@ -677,7 +959,7 @@
"numrows": "0",
"rawdatasize": "0",
"totalsize": "0",
"transient_lastddltime": "1708925466"
"transient_lastddltime": "1710149911"
},
"name": "nested_struct_test",
"tags": []
@ -927,7 +1209,7 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"transient_lastddltime": "1708925457"
"transient_lastddltime": "1710149904"
},
"name": "pokes",
"tags": []
@ -1141,7 +1423,7 @@
"numrows": "0",
"rawdatasize": "0",
"totalsize": "0",
"transient_lastddltime": "1708925459"
"transient_lastddltime": "1710149906"
},
"name": "struct_test",
"tags": []
@ -1371,7 +1653,7 @@
"customProperties": {
"numfiles": "0",
"totalsize": "0",
"transient_lastddltime": "1708925466"
"transient_lastddltime": "1710149911"
},
"name": "struct_test_view_materialized",
"tags": []
@ -1604,7 +1886,7 @@
"numrows": "0",
"rawdatasize": "0",
"totalsize": "0",
"transient_lastddltime": "1708925459"
"transient_lastddltime": "1710149906"
},
"name": "_test_table_underscore",
"tags": []
@ -1806,7 +2088,7 @@
"numrows": "0",
"rawdatasize": "0",
"totalsize": "0",
"transient_lastddltime": "1708925466"
"transient_lastddltime": "1710149911"
},
"name": "union_test",
"tags": []
@ -2087,7 +2369,7 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"transient_lastddltime": "1708925466",
"transient_lastddltime": "1710149911",
"view_definition": "SELECT \"property_id\", \"service\"\nFROM \"db1\".\"array_struct_test\"",
"is_view": "True"
},
@ -2382,6 +2664,22 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.classification_test,PROD)",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.map_test,PROD)",
@ -2477,267 +2775,5 @@
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": []
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7",
"urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7",
"urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7"
},
{
"id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84",
"urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.map_test,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7",
"urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7"
},
{
"id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84",
"urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.nested_struct_test,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7",
"urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7"
},
{
"id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84",
"urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.pokes,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7",
"urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7"
},
{
"id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84",
"urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.struct_test,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7",
"urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7"
},
{
"id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84",
"urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.struct_test_view_materialized,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7",
"urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7"
},
{
"id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84",
"urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1._test_table_underscore,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7",
"urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7"
},
{
"id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84",
"urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.union_test,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7",
"urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7"
},
{
"id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84",
"urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7",
"urn": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7"
},
{
"id": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84",
"urn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84"
}
]
}
},
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"lastRunId": "no-run-id-provided"
}
}
]