mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-17 04:53:46 +00:00
feat(ingest): add trino source to datahub (#3307)
This commit is contained in:
parent
3d8e22ea34
commit
d22d1c2795
@ -62,6 +62,7 @@ Sources:
|
|||||||
| [sql-profiles](./source_docs/sql_profiles.md) | `pip install 'acryl-datahub[sql-profiles]'` | Data profiles for SQL-based systems |
|
| [sql-profiles](./source_docs/sql_profiles.md) | `pip install 'acryl-datahub[sql-profiles]'` | Data profiles for SQL-based systems |
|
||||||
| [sqlalchemy](./source_docs/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source |
|
| [sqlalchemy](./source_docs/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source |
|
||||||
| [superset](./source_docs/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source |
|
| [superset](./source_docs/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source |
|
||||||
|
| [trino](./source_docs/trino.md) | `pip install 'acryl-datahub[trino]'` | Trino source |
|
||||||
|
|
||||||
Sinks
|
Sinks
|
||||||
|
|
||||||
|
|||||||
@ -117,6 +117,13 @@ plugins: Dict[str, Set[str]] = {
|
|||||||
"sqlalchemy": sql_common,
|
"sqlalchemy": sql_common,
|
||||||
"sql-profiles": sql_common | {"great-expectations"},
|
"sql-profiles": sql_common | {"great-expectations"},
|
||||||
"superset": {"requests"},
|
"superset": {"requests"},
|
||||||
|
"trino": sql_common
|
||||||
|
| {
|
||||||
|
# SQLAlchemy support is coming up in trino python client
|
||||||
|
# subject to PR merging - https://github.com/trinodb/trino-python-client/pull/81.
|
||||||
|
# PR is from same author as that of sqlalchemy-trino library below.
|
||||||
|
"sqlalchemy-trino"
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
all_exclude_plugins: Set[str] = {
|
all_exclude_plugins: Set[str] = {
|
||||||
@ -187,8 +194,10 @@ base_dev_requirements = {
|
|||||||
|
|
||||||
if is_py37_or_newer:
|
if is_py37_or_newer:
|
||||||
# The lookml plugin only works on Python 3.7 or newer.
|
# The lookml plugin only works on Python 3.7 or newer.
|
||||||
|
# The trino plugin only works on Python 3.7 or newer.
|
||||||
|
# The trino plugin could be supported on Python 3.6 with minimal changes to the open-source sqlalchemy-trino source code.
|
||||||
base_dev_requirements = base_dev_requirements.union(
|
base_dev_requirements = base_dev_requirements.union(
|
||||||
{dependency for plugin in ["lookml"] for dependency in plugins[plugin]}
|
{dependency for plugin in ["lookml", "trino"] for dependency in plugins[plugin]}
|
||||||
)
|
)
|
||||||
|
|
||||||
dev_requirements = {
|
dev_requirements = {
|
||||||
@ -255,6 +264,7 @@ entry_points = {
|
|||||||
"snowflake = datahub.ingestion.source.sql.snowflake:SnowflakeSource",
|
"snowflake = datahub.ingestion.source.sql.snowflake:SnowflakeSource",
|
||||||
"snowflake-usage = datahub.ingestion.source.usage.snowflake_usage:SnowflakeUsageSource",
|
"snowflake-usage = datahub.ingestion.source.usage.snowflake_usage:SnowflakeUsageSource",
|
||||||
"superset = datahub.ingestion.source.superset:SupersetSource",
|
"superset = datahub.ingestion.source.superset:SupersetSource",
|
||||||
|
"trino = datahub.ingestion.source.sql.trino:TrinoSource",
|
||||||
],
|
],
|
||||||
"datahub.ingestion.sink.plugins": [
|
"datahub.ingestion.sink.plugins": [
|
||||||
"file = datahub.ingestion.sink.file:FileSink",
|
"file = datahub.ingestion.sink.file:FileSink",
|
||||||
|
|||||||
@ -42,6 +42,7 @@ Supported SQL sources:
|
|||||||
- [Redshift](./redshift.md)
|
- [Redshift](./redshift.md)
|
||||||
- [Snowflake](./snowflake.md)
|
- [Snowflake](./snowflake.md)
|
||||||
- [Generic SQLAlchemy source](./sqlalchemy.md)
|
- [Generic SQLAlchemy source](./sqlalchemy.md)
|
||||||
|
- [Trino](./trino.md)
|
||||||
|
|
||||||
## Quickstart recipe
|
## Quickstart recipe
|
||||||
|
|
||||||
|
|||||||
74
metadata-ingestion/source_docs/trino.md
Normal file
74
metadata-ingestion/source_docs/trino.md
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
# Trino
|
||||||
|
|
||||||
|
For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md).
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
To install this plugin, run `pip install 'acryl-datahub[trino]'`.
|
||||||
|
|
||||||
|
Note! This plugin uses a package that requires Python 3.7+!
|
||||||
|
|
||||||
|
## Capabilities
|
||||||
|
|
||||||
|
This plugin extracts the following:
|
||||||
|
|
||||||
|
- Metadata for databases, schemas, and tables
|
||||||
|
- Column types and schema associated with each table
|
||||||
|
- Table, row, and column statistics via optional [SQL profiling](./sql_profiles.md)
|
||||||
|
|
||||||
|
## Quickstart recipe
|
||||||
|
|
||||||
|
Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.
|
||||||
|
|
||||||
|
For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes).
|
||||||
|
|
||||||
|
```yml
|
||||||
|
source:
|
||||||
|
type: trino
|
||||||
|
config:
|
||||||
|
# Coordinates
|
||||||
|
host_port: localhost:5300
|
||||||
|
database: dbname
|
||||||
|
|
||||||
|
# Credentials
|
||||||
|
username: foo
|
||||||
|
password: datahub
|
||||||
|
|
||||||
|
sink:
|
||||||
|
# sink configs
|
||||||
|
```
|
||||||
|
|
||||||
|
## Config details
|
||||||
|
|
||||||
|
Note that a `.` is used to denote nested fields in the YAML recipe.
|
||||||
|
|
||||||
|
As a SQL-based service, the Trino integration is also supported by our SQL profiler. See [here](./sql_profiles.md) for more details on configuration.
|
||||||
|
|
||||||
|
| Field | Required | Default | Description |
|
||||||
|
| --------------------------- | -------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `username` | ✅ | | Trino username. |
|
||||||
|
| `password` | | | Trino password. |
|
||||||
|
| `host_port` | ✅ | | Trino host URL. |
|
||||||
|
| `database` | ✅ | | Trino database (catalog). |
|
||||||
|
| `database_alias` | | | Alias to apply to database when ingesting. |
|
||||||
|
| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. |
|
||||||
|
| `options.<option>` | | | Any options specified here will be passed to SQLAlchemy's `create_engine` as kwargs.<br />See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. |
|
||||||
|
| `table_pattern.allow` | | | List of regex patterns for tables to include in ingestion. |
|
||||||
|
| `table_pattern.deny` | | | List of regex patterns for tables to exclude from ingestion. |
|
||||||
|
| `table_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. |
|
||||||
|
| `schema_pattern.allow` | | | List of regex patterns for schemas to include in ingestion. |
|
||||||
|
| `schema_pattern.deny` | | | List of regex patterns for schemas to exclude from ingestion. |
|
||||||
|
| `schema_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. |
|
||||||
|
| `view_pattern.allow` | | | List of regex patterns for views to include in ingestion. |
|
||||||
|
| `view_pattern.deny` | | | List of regex patterns for views to exclude from ingestion. |
|
||||||
|
| `view_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. |
|
||||||
|
| `include_tables` | | `True` | Whether tables should be ingested. |
|
||||||
|
| `include_views` | | `True` | Whether views should be ingested. |
|
||||||
|
|
||||||
|
## Compatibility
|
||||||
|
|
||||||
|
Coming soon!
|
||||||
|
|
||||||
|
## Questions
|
||||||
|
|
||||||
|
If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)!
|
||||||
114
metadata-ingestion/src/datahub/ingestion/source/sql/trino.py
Normal file
114
metadata-ingestion/src/datahub/ingestion/source/sql/trino.py
Normal file
@ -0,0 +1,114 @@
|
|||||||
|
import sys
|
||||||
|
from textwrap import dedent
|
||||||
|
|
||||||
|
from trino.exceptions import TrinoQueryError # noqa
|
||||||
|
|
||||||
|
from datahub.ingestion.source.sql.sql_common import (
|
||||||
|
BasicSQLAlchemyConfig,
|
||||||
|
SQLAlchemySource,
|
||||||
|
)
|
||||||
|
|
||||||
|
if sys.version_info >= (3, 7):
|
||||||
|
# This import verifies that the dependencies are available.
|
||||||
|
import sqlalchemy_trino # noqa: F401
|
||||||
|
from sqlalchemy import exc, sql
|
||||||
|
from sqlalchemy.engine import reflection
|
||||||
|
from sqlalchemy_trino import datatype, error
|
||||||
|
from sqlalchemy_trino.dialect import TrinoDialect
|
||||||
|
|
||||||
|
# Report base tables only: views are left out here because get_view_names
# returns them as well.
@reflection.cache  # type: ignore
def get_table_names(self, connection, schema: str = None, **kw):  # type: ignore
    """List the names of all non-view tables in a schema.

    Args:
        connection: connection used to run the information_schema query.
        schema: schema to inspect; when falsy, the dialect's default schema
            name is used instead.

    Returns:
        A list with the ``table_name`` of every row whose ``table_type`` is
        not ``'VIEW'``.

    Raises:
        sqlalchemy.exc.NoSuchTableError: if no schema was given and no
            default schema name is available.
    """
    target_schema = schema or self._get_default_schema_name(connection)
    if target_schema is None:
        raise exc.NoSuchTableError("schema is required")

    statement = sql.text(
        dedent(
            """
            SELECT "table_name"
            FROM "information_schema"."tables"
            WHERE "table_schema" = :schema and "table_type" != 'VIEW'
            """
        ).strip()
    )
    rows = connection.execute(statement, schema=target_schema)
    return [row.table_name for row in rows]
|
||||||
|
|
||||||
|
# Return every table property, not only the "comment" property.
@reflection.cache  # type: ignore
def get_table_comment(self, connection, table_name: str, schema: str = None, **kw):  # type: ignore
    """Fetch a table's comment together with all of its properties.

    The properties are read from the ``<table_name>$properties`` table and
    returned as a dictionary keyed by column name.

    Args:
        connection: connection used to run the query.
        table_name: name of the table whose properties are wanted.
        schema: optional schema passed through to ``_get_full_table``.

    Returns:
        ``{"text": <comment or None>, "properties": {...}}`` when a
        properties row is found, or ``{"text": None}`` when the properties
        table is reported as not found or yields no row.

    Raises:
        TrinoQueryError: for any query error other than ``TABLE_NOT_FOUND``.
    """
    try:
        properties_table = self._get_full_table(f"{table_name}$properties", schema)
        query = f"SELECT * FROM {properties_table}"
        row = connection.execute(sql.text(query)).fetchone()

        # fetchone() returns None when the result set is empty; treat that
        # the same as "no comment" instead of failing on None.items().
        if row is None:
            return dict(text=None)

        # Generate properties dictionary.
        properties = dict(row.items())

        return {"text": properties.get("comment", None), "properties": properties}
    except TrinoQueryError as e:
        # Compare for equality. The earlier `in (error.TABLE_NOT_FOUND)` had no
        # trailing comma, so it was a containment test on the constant itself
        # rather than membership in a tuple of error names.
        if e.error_name == error.TABLE_NOT_FOUND:
            return dict(text=None)
        raise
|
||||||
|
|
||||||
|
# Column reflection that also carries each column's comment.
@reflection.cache  # type: ignore
def _get_columns(self, connection, table_name, schema: str = None, **kw):  # type: ignore
    """Describe the columns of a table using information_schema.columns.

    Args:
        connection: connection used to run the query.
        table_name: table whose columns are listed.
        schema: schema of the table; when falsy, the dialect's default schema
            name is used instead.

    Returns:
        A list of dicts, in ``ordinal_position`` order, each with the keys
        ``name``, ``type``, ``nullable``, ``default`` and ``comment``.
    """
    schema = schema or self._get_default_schema_name(connection)
    statement = sql.text(
        dedent(
            """
            SELECT
            "column_name",
            "data_type",
            "column_default",
            UPPER("is_nullable") AS "is_nullable",
            "comment"
            FROM "information_schema"."columns"
            WHERE "table_schema" = :schema
            AND "table_name" = :table
            ORDER BY "ordinal_position" ASC
            """
        ).strip()
    )
    result = connection.execute(statement, schema=schema, table=table_name)
    return [
        dict(
            name=rec.column_name,
            type=datatype.parse_sqltype(rec.data_type),
            # The query upper-cases is_nullable, so only "YES" means nullable.
            nullable=rec.is_nullable == "YES",
            default=rec.column_default,
            comment=rec.comment,
        )
        for rec in result
    ]
|
||||||
|
|
||||||
|
TrinoDialect.get_table_names = get_table_names
|
||||||
|
TrinoDialect.get_table_comment = get_table_comment
|
||||||
|
TrinoDialect._get_columns = _get_columns
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise ModuleNotFoundError("The trino plugin requires Python 3.7 or newer.")
|
||||||
|
|
||||||
|
|
||||||
|
# Configuration for the Trino source: BasicSQLAlchemyConfig with the scheme
# preset to "trino" and dataset names qualified by database (or its alias).
class TrinoConfig(BasicSQLAlchemyConfig):
    # defaults
    scheme = "trino"

    def get_identifier(self: BasicSQLAlchemyConfig, schema: str, table: str) -> str:
        """Build the dotted dataset name for a table.

        Returns ``<prefix>.<schema>.<table>`` where the prefix is
        ``database_alias`` if set, otherwise ``database`` if set; with neither
        set the result is just ``<schema>.<table>``.
        """
        qualified = f"{schema}.{table}"
        # database_alias takes precedence over database.
        prefix = self.database_alias or self.database
        return f"{prefix}.{qualified}" if prefix else qualified
|
||||||
|
|
||||||
|
|
||||||
|
class TrinoSource(SQLAlchemySource):
    """Ingestion source for Trino built on top of SQLAlchemySource."""

    def __init__(self, config, ctx):
        # "trino" is handed to SQLAlchemySource as its third constructor
        # argument (presumably the platform name; confirm against sql_common).
        source_name = "trino"
        super().__init__(config, ctx, source_name)

    @classmethod
    def create(cls, config_dict, ctx):
        """Parse ``config_dict`` into a TrinoConfig and build the source from it."""
        parsed_config = TrinoConfig.parse_obj(config_dict)
        return cls(parsed_config, ctx)
|
||||||
@ -0,0 +1,78 @@
|
|||||||
|
# Adapted from https://github.com/big-data-europe/docker-hive.
|
||||||
|
|
||||||
|
version: "3"
|
||||||
|
|
||||||
|
services:
|
||||||
|
|
||||||
|
testtrino:
|
||||||
|
image: trinodb/trino
|
||||||
|
container_name: "testtrino"
|
||||||
|
ports:
|
||||||
|
- 5300:8080
|
||||||
|
volumes:
|
||||||
|
- ./setup/etc:/etc/trino
|
||||||
|
depends_on:
|
||||||
|
- "trinodb_postgres"
|
||||||
|
- "hive-metastore"
|
||||||
|
|
||||||
|
trinodb_postgres:
|
||||||
|
image: postgres:alpine
|
||||||
|
container_name: "trinodb_postgres"
|
||||||
|
environment:
|
||||||
|
POSTGRES_PASSWORD: datahub
|
||||||
|
volumes:
|
||||||
|
- ./setup/setup.sql:/docker-entrypoint-initdb.d/postgres_setup.sql
|
||||||
|
ports:
|
||||||
|
- "5432:5432"
|
||||||
|
namenode:
|
||||||
|
image: bde2020/hadoop-namenode:2.0.0-hadoop2.7.4-java8
|
||||||
|
volumes:
|
||||||
|
- namenode:/hadoop/dfs/name
|
||||||
|
environment:
|
||||||
|
- CLUSTER_NAME=test
|
||||||
|
env_file:
|
||||||
|
- ./setup/hadoop-hive.env
|
||||||
|
ports:
|
||||||
|
- "50070:50070"
|
||||||
|
datanode:
|
||||||
|
image: bde2020/hadoop-datanode:2.0.0-hadoop2.7.4-java8
|
||||||
|
volumes:
|
||||||
|
- datanode:/hadoop/dfs/data
|
||||||
|
env_file:
|
||||||
|
- ./setup/hadoop-hive.env
|
||||||
|
environment:
|
||||||
|
SERVICE_PRECONDITION: "namenode:50070"
|
||||||
|
ports:
|
||||||
|
- "50075:50075"
|
||||||
|
hive-server:
|
||||||
|
image: bde2020/hive:2.3.2-postgresql-metastore
|
||||||
|
container_name: "testhiveserver2"
|
||||||
|
env_file:
|
||||||
|
- ./setup/hadoop-hive.env
|
||||||
|
environment:
|
||||||
|
HIVE_CORE_CONF_javax_jdo_option_ConnectionURL: "jdbc:postgresql://hive-metastore/metastore"
|
||||||
|
SERVICE_PRECONDITION: "hive-metastore:9083"
|
||||||
|
ports:
|
||||||
|
- "10000:10000"
|
||||||
|
volumes:
|
||||||
|
- ./setup/hive_setup.sql:/hive_setup.sql
|
||||||
|
hive-metastore:
|
||||||
|
image: bde2020/hive:2.3.2-postgresql-metastore
|
||||||
|
container_name: "hive-metastore"
|
||||||
|
env_file:
|
||||||
|
- ./setup/hadoop-hive.env
|
||||||
|
command: /opt/hive/bin/hive --service metastore
|
||||||
|
environment:
|
||||||
|
SERVICE_PRECONDITION: "namenode:50070 datanode:50075 hive-metastore-postgresql:5432"
|
||||||
|
ports:
|
||||||
|
- "9083:9083"
|
||||||
|
hive-metastore-postgresql:
|
||||||
|
image: bde2020/hive-metastore-postgresql:2.3.0
|
||||||
|
# presto-coordinator:
|
||||||
|
# image: shawnzhu/prestodb:0.181
|
||||||
|
# ports:
|
||||||
|
# - "8080:8080"
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
namenode:
|
||||||
|
datanode:
|
||||||
@ -0,0 +1,4 @@
|
|||||||
|
connector.name=hive
|
||||||
|
hive.metastore.uri=thrift://hive-metastore:9083
|
||||||
|
hive.translate-hive-views=true
|
||||||
|
#hive.legacy-hive-view-translation=true
|
||||||
@ -0,0 +1,4 @@
|
|||||||
|
connector.name=postgresql
|
||||||
|
connection-url=jdbc:postgresql://trinodb_postgres:5432/postgres
|
||||||
|
connection-user=postgres
|
||||||
|
connection-password=datahub
|
||||||
@ -0,0 +1,7 @@
|
|||||||
|
coordinator=true
|
||||||
|
node-scheduler.include-coordinator=true
|
||||||
|
http-server.http.port=8080
|
||||||
|
query.max-memory=5GB
|
||||||
|
query.max-memory-per-node=1GB
|
||||||
|
query.max-total-memory-per-node=2GB
|
||||||
|
discovery.uri=http://localhost:8080
|
||||||
@ -0,0 +1,14 @@
|
|||||||
|
-server
|
||||||
|
-Xmx16G
|
||||||
|
-XX:-UseBiasedLocking
|
||||||
|
-XX:+UseG1GC
|
||||||
|
-XX:G1HeapRegionSize=32M
|
||||||
|
-XX:+ExplicitGCInvokesConcurrent
|
||||||
|
-XX:+ExitOnOutOfMemoryError
|
||||||
|
-XX:+HeapDumpOnOutOfMemoryError
|
||||||
|
-XX:-OmitStackTraceInFastThrow
|
||||||
|
-XX:ReservedCodeCacheSize=512M
|
||||||
|
-XX:PerMethodRecompilationCutoff=10000
|
||||||
|
-XX:PerBytecodeRecompilationCutoff=10000
|
||||||
|
-Djdk.attach.allowAttachSelf=true
|
||||||
|
-Djdk.nio.maxCachedBufferSize=2000000
|
||||||
@ -0,0 +1 @@
|
|||||||
|
io.trino=INFO
|
||||||
@ -0,0 +1,2 @@
|
|||||||
|
node.environment=production
|
||||||
|
node.id=ffffffff-ffff-ffff-ffff-ffffffffffff
|
||||||
@ -0,0 +1,30 @@
|
|||||||
|
HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
|
||||||
|
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
|
||||||
|
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
|
||||||
|
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
|
||||||
|
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
|
||||||
|
HIVE_SITE_CONF_hive_metastore_uris=thrift://hive-metastore:9083
|
||||||
|
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
|
||||||
|
|
||||||
|
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
|
||||||
|
CORE_CONF_hadoop_http_staticuser_user=root
|
||||||
|
CORE_CONF_hadoop_proxyuser_hue_hosts=*
|
||||||
|
CORE_CONF_hadoop_proxyuser_hue_groups=*
|
||||||
|
|
||||||
|
HDFS_CONF_dfs_webhdfs_enabled=true
|
||||||
|
HDFS_CONF_dfs_permissions_enabled=false
|
||||||
|
|
||||||
|
YARN_CONF_yarn_log___aggregation___enable=true
|
||||||
|
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
|
||||||
|
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
|
||||||
|
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
|
||||||
|
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
|
||||||
|
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
|
||||||
|
YARN_CONF_yarn_timeline___service_enabled=true
|
||||||
|
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
|
||||||
|
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
|
||||||
|
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
|
||||||
|
YARN_CONF_yarn_timeline___service_hostname=historyserver
|
||||||
|
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
|
||||||
|
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
|
||||||
|
YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031
|
||||||
@ -0,0 +1,46 @@
|
|||||||
|
CREATE DATABASE IF NOT EXISTS db1;
|
||||||
|
CREATE DATABASE IF NOT EXISTS db2;
|
||||||
|
-- Setup a "pokes" example table.
|
||||||
|
CREATE TABLE IF NOT EXISTS db1.pokes (foo INT, bar STRING) PARTITIONED BY (baz STRING);
|
||||||
|
LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' OVERWRITE INTO TABLE db1.pokes PARTITION (baz='dummy');
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS db2.pokes (foo INT, bar STRING);
|
||||||
|
LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' OVERWRITE INTO TABLE db2.pokes;
|
||||||
|
|
||||||
|
-- Setup a table with a special character.
|
||||||
|
CREATE TABLE IF NOT EXISTS db1.`_test_table_underscore` (foo INT, bar STRING);
|
||||||
|
|
||||||
|
-- Create tables with struct and array types.
|
||||||
|
-- From https://stackoverflow.com/questions/57491644/correct-usage-of-a-struct-in-hive.
|
||||||
|
CREATE TABLE IF NOT EXISTS db1.struct_test
|
||||||
|
(
|
||||||
|
property_id INT,
|
||||||
|
service STRUCT<
|
||||||
|
type: STRING
|
||||||
|
,provider: ARRAY<INT>
|
||||||
|
>
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS db1.array_struct_test
|
||||||
|
(
|
||||||
|
property_id INT COMMENT 'id of property',
|
||||||
|
service array<STRUCT<
|
||||||
|
type: STRING
|
||||||
|
,provider: ARRAY<INT>
|
||||||
|
>> COMMENT 'service types and providers'
|
||||||
|
) TBLPROPERTIES ('comment' = 'This table has array of structs', 'another.comment' = 'This table has no partitions');;
|
||||||
|
|
||||||
|
WITH
|
||||||
|
test_data as (
|
||||||
|
SELECT 989 property_id, array(NAMED_STRUCT('type','Cleaning','provider', ARRAY(587, 887)),
|
||||||
|
NAMED_STRUCT('type','Pricing','provider', ARRAY(932))
|
||||||
|
) AS service
|
||||||
|
--, array(4,5,5) AS ratings
|
||||||
|
)
|
||||||
|
INSERT INTO TABLE db1.array_struct_test
|
||||||
|
select * from test_data;
|
||||||
|
|
||||||
|
CREATE MATERIALIZED VIEW db1.struct_test_view_materialized as select * from db1.struct_test;
|
||||||
|
CREATE VIEW db1.array_struct_test_view as select * from db1.array_struct_test;
|
||||||
|
|
||||||
|
|
||||||
44
metadata-ingestion/tests/integration/trino/setup/setup.sql
Normal file
44
metadata-ingestion/tests/integration/trino/setup/setup.sql
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
CREATE SCHEMA librarydb;
|
||||||
|
|
||||||
|
CREATE TABLE librarydb.book (
|
||||||
|
id INTEGER NOT NULL,
|
||||||
|
name VARCHAR ( 50 ) NOT NULL,
|
||||||
|
author VARCHAR ( 50 ),
|
||||||
|
publisher VARCHAR (50),
|
||||||
|
PRIMARY KEY (id)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE librarydb.member (
|
||||||
|
id INTEGER NOT NULL,
|
||||||
|
name VARCHAR ( 50 ) NOT NULL,
|
||||||
|
PRIMARY KEY (id)
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE librarydb.issue_history (
|
||||||
|
book_id INTEGER NOT NULL,
|
||||||
|
member_id INTEGER NOT NULL,
|
||||||
|
issue_date DATE,
|
||||||
|
return_date DATE,
|
||||||
|
CONSTRAINT fk_book FOREIGN KEY(book_id) REFERENCES librarydb.book(id),
|
||||||
|
CONSTRAINT fk_member FOREIGN KEY(member_id) REFERENCES librarydb.member(id)
|
||||||
|
);
|
||||||
|
|
||||||
|
INSERT INTO librarydb.book (id, name, author) VALUES (1, 'Book 1', 'ABC');
|
||||||
|
INSERT INTO librarydb.book (id, name, author) VALUES (2, 'Book 2', 'PQR');
|
||||||
|
INSERT INTO librarydb.book (id, name, author) VALUES (3, 'Book 3', 'XYZ');
|
||||||
|
|
||||||
|
INSERT INTO librarydb.member(id, name) VALUES (1, 'Member 1');
|
||||||
|
INSERT INTO librarydb.member(id, name) VALUES (2, 'Member 2');
|
||||||
|
|
||||||
|
INSERT INTO librarydb.issue_history VALUES (1, 1, TO_DATE('2021-09-27','YYYY-MM-DD'), TO_DATE('2021-09-27','YYYY-MM-DD'));
|
||||||
|
INSERT INTO librarydb.issue_history VALUES (2, 2, TO_DATE('2021-09-27','YYYY-MM-DD'), NULL);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE VIEW librarydb.book_in_circulation as
|
||||||
|
SELECT b.id, b.name, b.author, b.publisher, i.member_id, i.issue_date FROM
|
||||||
|
librarydb.book b
|
||||||
|
JOIN
|
||||||
|
librarydb.issue_history i
|
||||||
|
on b.id=i.book_id
|
||||||
|
where i.return_date is null;
|
||||||
82
metadata-ingestion/tests/integration/trino/test_trino.py
Normal file
82
metadata-ingestion/tests/integration/trino/test_trino.py
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import requests
|
||||||
|
from click.testing import CliRunner
|
||||||
|
from freezegun import freeze_time
|
||||||
|
|
||||||
|
from datahub.entrypoints import datahub
|
||||||
|
from tests.test_helpers import fs_helpers, mce_helpers
|
||||||
|
from tests.test_helpers.click_helpers import assert_result_ok
|
||||||
|
from tests.test_helpers.docker_helpers import wait_for_port
|
||||||
|
|
||||||
|
FROZEN_TIME = "2021-09-23 12:00:00"
|
||||||
|
|
||||||
|
|
||||||
|
@freeze_time(FROZEN_TIME)
@pytest.mark.skipif(sys.version_info < (3, 7), reason="trino requires Python 3.7+")
@pytest.mark.integration
def test_trino_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """End-to-end ingestion test against a dockerised Trino.

    Starts the compose stack, waits for Trino and HiveServer2, loads the hive
    fixture, then runs two ingestion recipes (one per Trino catalog) and
    compares each output file with its golden file.
    """
    test_resources_dir = pytestconfig.rootpath / "tests/integration/trino"

    def trino_is_ready() -> bool:
        # While Trino is still booting, the request may fail or return a
        # non-JSON body. Report "not ready" so wait_until_responsive keeps
        # polling instead of aborting the test on the first exception.
        try:
            info = requests.get("http://localhost:5300/v1/info", timeout=5).json()
        except (requests.exceptions.RequestException, ValueError):
            return False
        return info["starting"] is False

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "trino"
    ) as docker_services:
        wait_for_port(docker_services, "testtrino", 8080)
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)

        docker_services.wait_until_responsive(
            timeout=30,
            pause=1,
            check=trino_is_ready,
        )

        # Set up the hive db. The command is a fixed argv, so no shell is needed.
        command = [
            "docker",
            "exec",
            "testhiveserver2",
            "/opt/hive/bin/beeline",
            "-u",
            "jdbc:hive2://localhost:10000",
            "-f",
            "/hive_setup.sql",
        ]
        subprocess.run(command, check=True)

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            print(tmp_path)

            # Run the metadata ingestion pipeline for the trino catalog that refers to the postgres database.
            config_file = (test_resources_dir / "trino_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert_result_ok(result)

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_mces.json",
                golden_path=test_resources_dir / "trino_mces_golden.json",
            )

            # Limitation 1 - The MCE contains "nullable": true for all fields in the trino database, irrespective of
            # the NOT NULL constraints present in the underlying postgres database.
            # This is an issue with trino, also reported here - https://github.com/trinodb/trino/issues/6400, Related : https://github.com/trinodb/trino/issues/4070

            # Limitation 2 - Dataset properties for a postgres view (view definition, etc.) are not part of the MCE from trino.
            # Postgres views are exposed as tables in trino. This depends on the trino connector implementation - https://trino.io/episodes/18.html

            # Run the metadata ingestion pipeline for the trino catalog that refers to the hive database.
            config_file = (test_resources_dir / "trino_hive_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert_result_ok(result)

            # Verify the output. The ignored custom properties are excluded from the comparison.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_hive_mces.json",
                golden_path=test_resources_dir / "trino_hive_mces_golden.json",
                ignore_paths=[
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
                ],
            )

            # Limitation 3 - Fewer DatasetProperties are available through Trino than through the direct hive source - https://trino.io/docs/current/connector/hive.html#table-properties.
|
||||||
@ -0,0 +1,583 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"auditHeader": null,
|
||||||
|
"proposedSnapshot": {
|
||||||
|
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||||
|
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD)",
|
||||||
|
"aspects": [
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||||
|
"customProperties": {
|
||||||
|
"column_stats_accurate": "{\"BASIC_STATS\":\"true\"}",
|
||||||
|
"comment": "This table has array of structs",
|
||||||
|
"another.comment": "This table has no partitions",
|
||||||
|
"numfiles": "4",
|
||||||
|
"numrows": "1",
|
||||||
|
"rawdatasize": "32",
|
||||||
|
"totalsize": "138",
|
||||||
|
"transient_lastddltime": "1633434492"
|
||||||
|
},
|
||||||
|
"externalUrl": null,
|
||||||
|
"description": "This table has array of structs",
|
||||||
|
"uri": null,
|
||||||
|
"tags": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
|
||||||
|
"schemaName": "hivedb.db1.array_struct_test",
|
||||||
|
"platform": "urn:li:dataPlatform:trino",
|
||||||
|
"version": 0,
|
||||||
|
"created": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"lastModified": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"deleted": null,
|
||||||
|
"dataset": null,
|
||||||
|
"cluster": null,
|
||||||
|
"hash": "",
|
||||||
|
"platformSchema": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
|
||||||
|
"tableSchema": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"fieldPath": "property_id",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": "id of property",
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "INTEGER()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "service",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": "service types and providers",
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.ArrayType": {
|
||||||
|
"nestedType": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "ARRAY(ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))]))",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"primaryKeys": null,
|
||||||
|
"foreignKeysSpecs": null,
|
||||||
|
"foreignKeys": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"proposedDelta": null,
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1632398400000,
|
||||||
|
"runId": "trino-hive-test",
|
||||||
|
"properties": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"auditHeader": null,
|
||||||
|
"proposedSnapshot": {
|
||||||
|
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||||
|
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.pokes,PROD)",
|
||||||
|
"aspects": [
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||||
|
"customProperties": {
|
||||||
|
"transient_lastddltime": "1633435441"
|
||||||
|
},
|
||||||
|
"externalUrl": null,
|
||||||
|
"description": null,
|
||||||
|
"uri": null,
|
||||||
|
"tags": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
|
||||||
|
"schemaName": "hivedb.db1.pokes",
|
||||||
|
"platform": "urn:li:dataPlatform:trino",
|
||||||
|
"version": 0,
|
||||||
|
"created": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"lastModified": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"deleted": null,
|
||||||
|
"dataset": null,
|
||||||
|
"cluster": null,
|
||||||
|
"hash": "",
|
||||||
|
"platformSchema": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
|
||||||
|
"tableSchema": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"fieldPath": "foo",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "INTEGER()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "bar",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "VARCHAR()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "baz",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "VARCHAR()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"primaryKeys": null,
|
||||||
|
"foreignKeysSpecs": null,
|
||||||
|
"foreignKeys": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"proposedDelta": null,
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1632398400000,
|
||||||
|
"runId": "trino-hive-test",
|
||||||
|
"properties": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"auditHeader": null,
|
||||||
|
"proposedSnapshot": {
|
||||||
|
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||||
|
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.struct_test,PROD)",
|
||||||
|
"aspects": [
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||||
|
"customProperties": {
|
||||||
|
"column_stats_accurate": "{\"BASIC_STATS\":\"true\"}",
|
||||||
|
"numfiles": "0",
|
||||||
|
"numrows": "0",
|
||||||
|
"rawdatasize": "0",
|
||||||
|
"totalsize": "0",
|
||||||
|
"transient_lastddltime": "1633434486"
|
||||||
|
},
|
||||||
|
"externalUrl": null,
|
||||||
|
"description": null,
|
||||||
|
"uri": null,
|
||||||
|
"tags": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
|
||||||
|
"schemaName": "hivedb.db1.struct_test",
|
||||||
|
"platform": "urn:li:dataPlatform:trino",
|
||||||
|
"version": 0,
|
||||||
|
"created": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"lastModified": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"deleted": null,
|
||||||
|
"dataset": null,
|
||||||
|
"cluster": null,
|
||||||
|
"hash": "",
|
||||||
|
"platformSchema": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
|
||||||
|
"tableSchema": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"fieldPath": "property_id",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "INTEGER()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "service",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.NullType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))])",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"primaryKeys": null,
|
||||||
|
"foreignKeysSpecs": null,
|
||||||
|
"foreignKeys": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"proposedDelta": null,
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1632398400000,
|
||||||
|
"runId": "trino-hive-test",
|
||||||
|
"properties": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"auditHeader": null,
|
||||||
|
"proposedSnapshot": {
|
||||||
|
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||||
|
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.struct_test_view_materialized,PROD)",
|
||||||
|
"aspects": [
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||||
|
"customProperties": {
|
||||||
|
"numfiles": "0",
|
||||||
|
"totalsize": "0",
|
||||||
|
"transient_lastddltime": "1633434491"
|
||||||
|
},
|
||||||
|
"externalUrl": null,
|
||||||
|
"description": null,
|
||||||
|
"uri": null,
|
||||||
|
"tags": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
|
||||||
|
"schemaName": "hivedb.db1.struct_test_view_materialized",
|
||||||
|
"platform": "urn:li:dataPlatform:trino",
|
||||||
|
"version": 0,
|
||||||
|
"created": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"lastModified": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"deleted": null,
|
||||||
|
"dataset": null,
|
||||||
|
"cluster": null,
|
||||||
|
"hash": "",
|
||||||
|
"platformSchema": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
|
||||||
|
"tableSchema": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"fieldPath": "property_id",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "INTEGER()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "service",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.NullType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))])",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"primaryKeys": null,
|
||||||
|
"foreignKeysSpecs": null,
|
||||||
|
"foreignKeys": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"proposedDelta": null,
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1632398400000,
|
||||||
|
"runId": "trino-hive-test",
|
||||||
|
"properties": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"auditHeader": null,
|
||||||
|
"proposedSnapshot": {
|
||||||
|
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||||
|
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1._test_table_underscore,PROD)",
|
||||||
|
"aspects": [
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||||
|
"customProperties": {
|
||||||
|
"column_stats_accurate": "{\"BASIC_STATS\":\"true\"}",
|
||||||
|
"numfiles": "0",
|
||||||
|
"numrows": "0",
|
||||||
|
"rawdatasize": "0",
|
||||||
|
"totalsize": "0",
|
||||||
|
"transient_lastddltime": "1633434486"
|
||||||
|
},
|
||||||
|
"externalUrl": null,
|
||||||
|
"description": null,
|
||||||
|
"uri": null,
|
||||||
|
"tags": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
|
||||||
|
"schemaName": "hivedb.db1._test_table_underscore",
|
||||||
|
"platform": "urn:li:dataPlatform:trino",
|
||||||
|
"version": 0,
|
||||||
|
"created": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"lastModified": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"deleted": null,
|
||||||
|
"dataset": null,
|
||||||
|
"cluster": null,
|
||||||
|
"hash": "",
|
||||||
|
"platformSchema": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
|
||||||
|
"tableSchema": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"fieldPath": "foo",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "INTEGER()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "bar",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "VARCHAR()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"primaryKeys": null,
|
||||||
|
"foreignKeysSpecs": null,
|
||||||
|
"foreignKeys": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"proposedDelta": null,
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1632398400000,
|
||||||
|
"runId": "trino-hive-test",
|
||||||
|
"properties": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"auditHeader": null,
|
||||||
|
"proposedSnapshot": {
|
||||||
|
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||||
|
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD)",
|
||||||
|
"aspects": [
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||||
|
"customProperties": {
|
||||||
|
"transient_lastddltime": "1633434492",
|
||||||
|
"view_definition": "SELECT \"property_id\", \"service\"\nFROM \"db1\".\"array_struct_test\"",
|
||||||
|
"is_view": "True"
|
||||||
|
},
|
||||||
|
"externalUrl": null,
|
||||||
|
"description": null,
|
||||||
|
"uri": null,
|
||||||
|
"tags": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
|
||||||
|
"schemaName": "hivedb.db1.array_struct_test_view",
|
||||||
|
"platform": "urn:li:dataPlatform:trino",
|
||||||
|
"version": 0,
|
||||||
|
"created": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"lastModified": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"deleted": null,
|
||||||
|
"dataset": null,
|
||||||
|
"cluster": null,
|
||||||
|
"hash": "",
|
||||||
|
"platformSchema": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
|
||||||
|
"tableSchema": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"fieldPath": "property_id",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "INTEGER()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "service",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.ArrayType": {
|
||||||
|
"nestedType": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "ARRAY(ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))]))",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"primaryKeys": null,
|
||||||
|
"foreignKeysSpecs": null,
|
||||||
|
"foreignKeys": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"proposedDelta": null,
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1632398400000,
|
||||||
|
"runId": "trino-hive-test",
|
||||||
|
"properties": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
@ -0,0 +1,20 @@
|
|||||||
|
run_id: trino-hive-test
|
||||||
|
|
||||||
|
source:
|
||||||
|
type: trino
|
||||||
|
config:
|
||||||
|
# Coordinates
|
||||||
|
host_port: localhost:5300
|
||||||
|
database: hivedb
|
||||||
|
|
||||||
|
# Credentials
|
||||||
|
username: foo
|
||||||
|
|
||||||
|
schema_pattern:
|
||||||
|
allow:
|
||||||
|
- "^db1"
|
||||||
|
|
||||||
|
sink:
|
||||||
|
type: file
|
||||||
|
config:
|
||||||
|
filename: "./trino_hive_mces.json"
|
||||||
@ -0,0 +1,498 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"auditHeader": null,
|
||||||
|
"proposedSnapshot": {
|
||||||
|
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||||
|
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book,PROD)",
|
||||||
|
"aspects": [
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
|
||||||
|
"schemaName": "library_catalog.librarydb.book",
|
||||||
|
"platform": "urn:li:dataPlatform:trino",
|
||||||
|
"version": 0,
|
||||||
|
"created": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"lastModified": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"deleted": null,
|
||||||
|
"dataset": null,
|
||||||
|
"cluster": null,
|
||||||
|
"hash": "",
|
||||||
|
"platformSchema": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
|
||||||
|
"tableSchema": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"fieldPath": "id",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "INTEGER()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "name",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "VARCHAR(length=50)",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "author",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "VARCHAR(length=50)",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "publisher",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "VARCHAR(length=50)",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"primaryKeys": null,
|
||||||
|
"foreignKeysSpecs": null,
|
||||||
|
"foreignKeys": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"proposedDelta": null,
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1632398400000,
|
||||||
|
"runId": "trino-test",
|
||||||
|
"properties": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"auditHeader": null,
|
||||||
|
"proposedSnapshot": {
|
||||||
|
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||||
|
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.issue_history,PROD)",
|
||||||
|
"aspects": [
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
|
||||||
|
"schemaName": "library_catalog.librarydb.issue_history",
|
||||||
|
"platform": "urn:li:dataPlatform:trino",
|
||||||
|
"version": 0,
|
||||||
|
"created": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"lastModified": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"deleted": null,
|
||||||
|
"dataset": null,
|
||||||
|
"cluster": null,
|
||||||
|
"hash": "",
|
||||||
|
"platformSchema": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
|
||||||
|
"tableSchema": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"fieldPath": "book_id",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "INTEGER()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "member_id",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "INTEGER()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "issue_date",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.DateType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "DATE()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "return_date",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.DateType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "DATE()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"primaryKeys": null,
|
||||||
|
"foreignKeysSpecs": null,
|
||||||
|
"foreignKeys": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"proposedDelta": null,
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1632398400000,
|
||||||
|
"runId": "trino-test",
|
||||||
|
"properties": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"auditHeader": null,
|
||||||
|
"proposedSnapshot": {
|
||||||
|
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||||
|
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.member,PROD)",
|
||||||
|
"aspects": [
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
|
||||||
|
"schemaName": "library_catalog.librarydb.member",
|
||||||
|
"platform": "urn:li:dataPlatform:trino",
|
||||||
|
"version": 0,
|
||||||
|
"created": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"lastModified": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"deleted": null,
|
||||||
|
"dataset": null,
|
||||||
|
"cluster": null,
|
||||||
|
"hash": "",
|
||||||
|
"platformSchema": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
|
||||||
|
"tableSchema": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"fieldPath": "id",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "INTEGER()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "name",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "VARCHAR(length=50)",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"primaryKeys": null,
|
||||||
|
"foreignKeysSpecs": null,
|
||||||
|
"foreignKeys": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"proposedDelta": null,
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1632398400000,
|
||||||
|
"runId": "trino-test",
|
||||||
|
"properties": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"auditHeader": null,
|
||||||
|
"proposedSnapshot": {
|
||||||
|
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||||
|
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book_in_circulation,PROD)",
|
||||||
|
"aspects": [
|
||||||
|
{
|
||||||
|
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
|
||||||
|
"schemaName": "library_catalog.librarydb.book_in_circulation",
|
||||||
|
"platform": "urn:li:dataPlatform:trino",
|
||||||
|
"version": 0,
|
||||||
|
"created": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"lastModified": {
|
||||||
|
"time": 0,
|
||||||
|
"actor": "urn:li:corpuser:unknown",
|
||||||
|
"impersonator": null
|
||||||
|
},
|
||||||
|
"deleted": null,
|
||||||
|
"dataset": null,
|
||||||
|
"cluster": null,
|
||||||
|
"hash": "",
|
||||||
|
"platformSchema": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
|
||||||
|
"tableSchema": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"fieldPath": "id",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "INTEGER()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "name",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "VARCHAR(length=50)",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "author",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "VARCHAR(length=50)",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "publisher",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "VARCHAR(length=50)",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "member_id",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "INTEGER()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"fieldPath": "issue_date",
|
||||||
|
"jsonPath": null,
|
||||||
|
"nullable": true,
|
||||||
|
"description": null,
|
||||||
|
"type": {
|
||||||
|
"type": {
|
||||||
|
"com.linkedin.pegasus2avro.schema.DateType": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nativeDataType": "DATE()",
|
||||||
|
"recursive": false,
|
||||||
|
"globalTags": null,
|
||||||
|
"glossaryTerms": null,
|
||||||
|
"isPartOfKey": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"primaryKeys": null,
|
||||||
|
"foreignKeysSpecs": null,
|
||||||
|
"foreignKeys": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"proposedDelta": null,
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1632398400000,
|
||||||
|
"runId": "trino-test",
|
||||||
|
"properties": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"auditHeader": null,
|
||||||
|
"entityType": "dataset",
|
||||||
|
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book,PROD)",
|
||||||
|
"entityKeyAspect": null,
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "datasetProfile",
|
||||||
|
"aspect": {
|
||||||
|
"value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 3, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\", \"3\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 1\", \"Book 2\", \"Book 3\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"ABC\", \"PQR\", \"XYZ\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 3, \"nullProportion\": 1.0, \"sampleValues\": []}]}",
|
||||||
|
"contentType": "application/json"
|
||||||
|
},
|
||||||
|
"systemMetadata": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"auditHeader": null,
|
||||||
|
"entityType": "dataset",
|
||||||
|
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.issue_history,PROD)",
|
||||||
|
"entityKeyAspect": null,
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "datasetProfile",
|
||||||
|
"aspect": {
|
||||||
|
"value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 2, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"book_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 0.5, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\", \"2021-09-27\"]}, {\"fieldPath\": \"return_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 1, \"nullProportion\": 0.5, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}",
|
||||||
|
"contentType": "application/json"
|
||||||
|
},
|
||||||
|
"systemMetadata": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"auditHeader": null,
|
||||||
|
"entityType": "dataset",
|
||||||
|
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.member,PROD)",
|
||||||
|
"entityKeyAspect": null,
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "datasetProfile",
|
||||||
|
"aspect": {
|
||||||
|
"value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 2, \"columnCount\": 2, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Member 1\", \"Member 2\"]}]}",
|
||||||
|
"contentType": "application/json"
|
||||||
|
},
|
||||||
|
"systemMetadata": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"auditHeader": null,
|
||||||
|
"entityType": "dataset",
|
||||||
|
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book_in_circulation,PROD)",
|
||||||
|
"entityKeyAspect": null,
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "datasetProfile",
|
||||||
|
"aspect": {
|
||||||
|
"value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 1, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 2\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"PQR\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 1, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}",
|
||||||
|
"contentType": "application/json"
|
||||||
|
},
|
||||||
|
"systemMetadata": null
|
||||||
|
}
|
||||||
|
]
|
||||||
27
metadata-ingestion/tests/integration/trino/trino_to_file.yml
Normal file
27
metadata-ingestion/tests/integration/trino/trino_to_file.yml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
run_id: trino-test
|
||||||
|
|
||||||
|
source:
|
||||||
|
type: trino
|
||||||
|
config:
|
||||||
|
# Coordinates
|
||||||
|
host_port: localhost:5300
|
||||||
|
database: postgresqldb
|
||||||
|
database_alias: library_catalog
|
||||||
|
|
||||||
|
# Credentials
|
||||||
|
username: foo
|
||||||
|
|
||||||
|
schema_pattern:
|
||||||
|
allow:
|
||||||
|
- "^librarydb"
|
||||||
|
|
||||||
|
profile_pattern:
|
||||||
|
allow:
|
||||||
|
- "library_catalog.librarydb.*"
|
||||||
|
profiling:
|
||||||
|
enabled: True
|
||||||
|
|
||||||
|
sink:
|
||||||
|
type: file
|
||||||
|
config:
|
||||||
|
filename: "./trino_mces.json"
|
||||||
Loading…
x
Reference in New Issue
Block a user