feat(ingest): add trino source to datahub (#3307)

This commit is contained in:
mayurinehate 2021-10-07 00:27:06 +05:30 committed by GitHub
parent 3d8e22ea34
commit d22d1c2795
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 1641 additions and 1 deletions

View File

@ -62,6 +62,7 @@ Sources:
| [sql-profiles](./source_docs/sql_profiles.md) | `pip install 'acryl-datahub[sql-profiles]'` | Data profiles for SQL-based systems |
| [sqlalchemy](./source_docs/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source |
| [superset](./source_docs/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source |
| [trino](./source_docs/trino.md) | `pip install 'acryl-datahub[trino]'` | Trino source |
Sinks

View File

@ -117,6 +117,13 @@ plugins: Dict[str, Set[str]] = {
"sqlalchemy": sql_common,
"sql-profiles": sql_common | {"great-expectations"},
"superset": {"requests"},
"trino": sql_common
| {
# SQLAlchemy support is coming up in trino python client
# subject to PR merging - https://github.com/trinodb/trino-python-client/pull/81.
# PR is from same author as that of sqlalchemy-trino library below.
"sqlalchemy-trino"
},
}
all_exclude_plugins: Set[str] = {
@ -187,8 +194,10 @@ base_dev_requirements = {
if is_py37_or_newer:
# The lookml plugin only works on Python 3.7 or newer.
# The trino plugin only works on Python 3.7 or newer.
# The trino plugin can be supported on Python 3.6 with minimal changes to opensource sqlalchemy-trino sourcecode.
base_dev_requirements = base_dev_requirements.union(
{dependency for plugin in ["lookml"] for dependency in plugins[plugin]}
{dependency for plugin in ["lookml", "trino"] for dependency in plugins[plugin]}
)
dev_requirements = {
@ -255,6 +264,7 @@ entry_points = {
"snowflake = datahub.ingestion.source.sql.snowflake:SnowflakeSource",
"snowflake-usage = datahub.ingestion.source.usage.snowflake_usage:SnowflakeUsageSource",
"superset = datahub.ingestion.source.superset:SupersetSource",
"trino = datahub.ingestion.source.sql.trino:TrinoSource",
],
"datahub.ingestion.sink.plugins": [
"file = datahub.ingestion.sink.file:FileSink",

View File

@ -42,6 +42,7 @@ Supported SQL sources:
- [Redshift](./redshift.md)
- [Snowflake](./snowflake.md)
- [Generic SQLAlchemy source](./sqlalchemy.md)
- [Trino](./trino.md)
## Quickstart recipe

View File

@ -0,0 +1,74 @@
# Trino
For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md).
## Setup
To install this plugin, run `pip install 'acryl-datahub[trino]'`.
Note! This plugin uses a package that requires Python 3.7+!
## Capabilities
This plugin extracts the following:
- Metadata for databases, schemas, and tables
- Column types and schema associated with each table
- Table, row, and column statistics via optional [SQL profiling](./sql_profiles.md)
## Quickstart recipe
Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.
For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes).
```yml
source:
type: trino
config:
# Coordinates
host_port: localhost:5300
database: dbname
# Credentials
username: foo
password: datahub
sink:
# sink configs
```
## Config details
Note that a `.` is used to denote nested fields in the YAML recipe.
As a SQL-based service, the Trino integration is also supported by our SQL profiler. See [here](./sql_profiles.md) for more details on configuration.
| Field | Required | Default | Description |
| --------------------------- | -------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `username` | ✅ | | Trino username. |
| `password` | | | Trino password. |
| `host_port` | ✅ | | Trino host URL. |
| `database` | ✅ | | Trino database (catalog). |
| `database_alias` | | | Alias to apply to database when ingesting. |
| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. |
| `options.<option>` | | | Any options specified here will be passed to SQLAlchemy's `create_engine` as kwargs.<br />See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details. |
| `table_pattern.allow` | | | List of regex patterns for tables to include in ingestion. |
| `table_pattern.deny` | | | List of regex patterns for tables to exclude from ingestion. |
| `table_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. |
| `schema_pattern.allow` | | | List of regex patterns for schemas to include in ingestion. |
| `schema_pattern.deny` | | | List of regex patterns for schemas to exclude from ingestion. |
| `schema_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. |
| `view_pattern.allow` | | | List of regex patterns for views to include in ingestion. |
| `view_pattern.deny` | | | List of regex patterns for views to exclude from ingestion. |
| `view_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. |
| `include_tables` | | `True` | Whether tables should be ingested. |
| `include_views` | | `True` | Whether views should be ingested. |
## Compatibility
Coming soon!
## Questions
If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)!

View File

@ -0,0 +1,114 @@
import sys
from textwrap import dedent
from trino.exceptions import TrinoQueryError # noqa
from datahub.ingestion.source.sql.sql_common import (
BasicSQLAlchemyConfig,
SQLAlchemySource,
)
if sys.version_info >= (3, 7):
# This import verifies that the dependencies are available.
import sqlalchemy_trino # noqa: F401
from sqlalchemy import exc, sql
from sqlalchemy.engine import reflection
from sqlalchemy_trino import datatype, error
from sqlalchemy_trino.dialect import TrinoDialect
# Report only non-view tables here; view names are returned by get_view_names,
# so including them would list them twice.
@reflection.cache  # type: ignore
def get_table_names(self, connection, schema: str = None, **kw):  # type: ignore
    """Return the names of all non-view tables in a schema.

    When ``schema`` is not supplied the dialect's default schema for the
    connection is used.

    Raises:
        NoSuchTableError: if no schema was given and no default is available.
    """
    target_schema = schema or self._get_default_schema_name(connection)
    if target_schema is None:
        raise exc.NoSuchTableError("schema is required")

    statement = sql.text(
        dedent(
            """
            SELECT "table_name"
            FROM "information_schema"."tables"
            WHERE "table_schema" = :schema and "table_type" != 'VIEW'
            """
        ).strip()
    )
    result = connection.execute(statement, schema=target_schema)
    return [record.table_name for record in result]
# Include all table properties instead of only "comment" property
@reflection.cache  # type: ignore
def get_table_comment(self, connection, table_name: str, schema: str = None, **kw):  # type: ignore
    """Return the table comment along with all of the table's properties.

    Reads the first row of the ``<table_name>$properties`` table and exposes
    each of its columns as an entry of the ``properties`` dictionary.

    Returns:
        ``{"text": <comment or None>, "properties": {...}}``. When the
        properties table does not exist, ``{"text": None}`` is returned.
        When the properties table has no rows, ``properties`` is empty.

    Raises:
        TrinoQueryError: for any query failure other than TABLE_NOT_FOUND.
    """
    try:
        properties_table = self._get_full_table(f"{table_name}$properties", schema)
        query = f"SELECT * FROM {properties_table}"
        row = connection.execute(sql.text(query)).fetchone()
    except TrinoQueryError as e:
        # Compare for equality: "x in (CONST)" is not a tuple membership test
        # (the parentheses are just grouping), it is a substring test against
        # the string CONST and raises TypeError when error_name is None.
        if e.error_name == error.TABLE_NOT_FOUND:
            return dict(text=None)
        raise

    # fetchone() returns None when the properties table has no rows.
    properties = dict(row.items()) if row is not None else {}
    return {"text": properties.get("comment", None), "properties": properties}
# Same column reflection as the stock dialect, with the column comment added.
@reflection.cache  # type: ignore
def _get_columns(self, connection, table_name, schema: str = None, **kw):  # type: ignore
    """Describe the columns of a table, in ordinal order.

    Each column is returned as a dict with the keys ``name``, ``type``,
    ``nullable``, ``default`` and ``comment``, read from
    ``information_schema.columns``. The dialect's default schema is used when
    ``schema`` is not supplied.
    """
    target_schema = schema or self._get_default_schema_name(connection)
    statement = sql.text(
        dedent(
            """
            SELECT
            "column_name",
            "data_type",
            "column_default",
            UPPER("is_nullable") AS "is_nullable",
            "comment"
            FROM "information_schema"."columns"
            WHERE "table_schema" = :schema
            AND "table_name" = :table
            ORDER BY "ordinal_position" ASC
            """
        ).strip()
    )
    rows = connection.execute(statement, schema=target_schema, table=table_name)
    return [
        {
            "name": row.column_name,
            "type": datatype.parse_sqltype(row.data_type),
            # is_nullable is upper-cased by the query, so "YES" is the only truthy value.
            "nullable": row.is_nullable == "YES",
            "default": row.column_default,
            "comment": row.comment,
        }
        for row in rows
    ]
# Replace the dialect's reflection methods with the customised versions defined above.
TrinoDialect.get_table_names = get_table_names
TrinoDialect.get_table_comment = get_table_comment
TrinoDialect._get_columns = _get_columns
else:
# On interpreters older than Python 3.7, fail as soon as this module is imported.
raise ModuleNotFoundError("The trino plugin requires Python 3.7 or newer.")
class TrinoConfig(BasicSQLAlchemyConfig):
    # Configuration for the Trino source; only the URL scheme is overridden here.
    # defaults
    scheme = "trino"

    def get_identifier(self: BasicSQLAlchemyConfig, schema: str, table: str) -> str:
        """Return ``schema.table``, prefixed with the database alias or database.

        The alias takes precedence over the database name; when neither is set
        the bare ``schema.table`` is returned.
        """
        qualified = f"{schema}.{table}"
        prefix = self.database_alias or self.database
        return f"{prefix}.{qualified}" if prefix else qualified
class TrinoSource(SQLAlchemySource):
    """Ingestion source for Trino built on the shared SQLAlchemy source."""

    def __init__(self, config, ctx):
        # "trino" is passed through as the third argument of the base initialiser.
        super().__init__(config, ctx, "trino")

    @classmethod
    def create(cls, config_dict, ctx):
        """Parse ``config_dict`` into a ``TrinoConfig`` and build the source from it."""
        return cls(TrinoConfig.parse_obj(config_dict), ctx)

View File

@ -0,0 +1,78 @@
# Adapted from https://github.com/big-data-europe/docker-hive.
version: "3"
services:
testtrino:
image: trinodb/trino
container_name: "testtrino"
ports:
- 5300:8080
volumes:
- ./setup/etc:/etc/trino
depends_on:
- "trinodb_postgres"
- "hive-metastore"
trinodb_postgres:
image: postgres:alpine
container_name: "trinodb_postgres"
environment:
POSTGRES_PASSWORD: datahub
volumes:
- ./setup/setup.sql:/docker-entrypoint-initdb.d/postgres_setup.sql
ports:
- "5432:5432"
namenode:
image: bde2020/hadoop-namenode:2.0.0-hadoop2.7.4-java8
volumes:
- namenode:/hadoop/dfs/name
environment:
- CLUSTER_NAME=test
env_file:
- ./setup/hadoop-hive.env
ports:
- "50070:50070"
datanode:
image: bde2020/hadoop-datanode:2.0.0-hadoop2.7.4-java8
volumes:
- datanode:/hadoop/dfs/data
env_file:
- ./setup/hadoop-hive.env
environment:
SERVICE_PRECONDITION: "namenode:50070"
ports:
- "50075:50075"
hive-server:
image: bde2020/hive:2.3.2-postgresql-metastore
container_name: "testhiveserver2"
env_file:
- ./setup/hadoop-hive.env
environment:
HIVE_CORE_CONF_javax_jdo_option_ConnectionURL: "jdbc:postgresql://hive-metastore/metastore"
SERVICE_PRECONDITION: "hive-metastore:9083"
ports:
- "10000:10000"
volumes:
- ./setup/hive_setup.sql:/hive_setup.sql
hive-metastore:
image: bde2020/hive:2.3.2-postgresql-metastore
container_name: "hive-metastore"
env_file:
- ./setup/hadoop-hive.env
command: /opt/hive/bin/hive --service metastore
environment:
SERVICE_PRECONDITION: "namenode:50070 datanode:50075 hive-metastore-postgresql:5432"
ports:
- "9083:9083"
hive-metastore-postgresql:
image: bde2020/hive-metastore-postgresql:2.3.0
# presto-coordinator:
# image: shawnzhu/prestodb:0.181
# ports:
# - "8080:8080"
volumes:
namenode:
datanode:

View File

@ -0,0 +1,4 @@
connector.name=hive
hive.metastore.uri=thrift://hive-metastore:9083
hive.translate-hive-views=true
#hive.legacy-hive-view-translation=true

View File

@ -0,0 +1,4 @@
connector.name=postgresql
connection-url=jdbc:postgresql://trinodb_postgres:5432/postgres
connection-user=postgres
connection-password=datahub

View File

@ -0,0 +1,7 @@
coordinator=true
node-scheduler.include-coordinator=true
http-server.http.port=8080
query.max-memory=5GB
query.max-memory-per-node=1GB
query.max-total-memory-per-node=2GB
discovery.uri=http://localhost:8080

View File

@ -0,0 +1,14 @@
-server
-Xmx16G
-XX:-UseBiasedLocking
-XX:+UseG1GC
-XX:G1HeapRegionSize=32M
-XX:+ExplicitGCInvokesConcurrent
-XX:+ExitOnOutOfMemoryError
-XX:+HeapDumpOnOutOfMemoryError
-XX:-OmitStackTraceInFastThrow
-XX:ReservedCodeCacheSize=512M
-XX:PerMethodRecompilationCutoff=10000
-XX:PerBytecodeRecompilationCutoff=10000
-Djdk.attach.allowAttachSelf=true
-Djdk.nio.maxCachedBufferSize=2000000

View File

@ -0,0 +1 @@
io.trino=INFO

View File

@ -0,0 +1,2 @@
node.environment=production
node.id=ffffffff-ffff-ffff-ffff-ffffffffffff

View File

@ -0,0 +1,30 @@
HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hive-metastore:9083
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*
HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false
YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031

View File

@ -0,0 +1,46 @@
CREATE DATABASE IF NOT EXISTS db1;
CREATE DATABASE IF NOT EXISTS db2;
-- Setup a "pokes" example table.
CREATE TABLE IF NOT EXISTS db1.pokes (foo INT, bar STRING) PARTITIONED BY (baz STRING);
LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' OVERWRITE INTO TABLE db1.pokes PARTITION (baz='dummy');
CREATE TABLE IF NOT EXISTS db2.pokes (foo INT, bar STRING);
LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' OVERWRITE INTO TABLE db2.pokes;
-- Setup a table with a special character.
CREATE TABLE IF NOT EXISTS db1.`_test_table_underscore` (foo INT, bar STRING);
-- Create tables with struct and array types.
-- From https://stackoverflow.com/questions/57491644/correct-usage-of-a-struct-in-hive.
CREATE TABLE IF NOT EXISTS db1.struct_test
(
property_id INT,
service STRUCT<
type: STRING
,provider: ARRAY<INT>
>
);
-- Table with an array-of-struct column, column comments and table properties.
CREATE TABLE IF NOT EXISTS db1.array_struct_test
(
    property_id INT COMMENT 'id of property',
    service array<STRUCT<
        type: STRING
        ,provider: ARRAY<INT>
    >> COMMENT 'service types and providers'
) TBLPROPERTIES ('comment' = 'This table has array of structs', 'another.comment' = 'This table has no partitions');
WITH
test_data as (
SELECT 989 property_id, array(NAMED_STRUCT('type','Cleaning','provider', ARRAY(587, 887)),
NAMED_STRUCT('type','Pricing','provider', ARRAY(932))
) AS service
--, array(4,5,5) AS ratings
)
INSERT INTO TABLE db1.array_struct_test
select * from test_data;
CREATE MATERIALIZED VIEW db1.struct_test_view_materialized as select * from db1.struct_test;
CREATE VIEW db1.array_struct_test_view as select * from db1.array_struct_test;

View File

@ -0,0 +1,44 @@
CREATE SCHEMA librarydb;
CREATE TABLE librarydb.book (
id INTEGER NOT NULL,
name VARCHAR ( 50 ) NOT NULL,
author VARCHAR ( 50 ),
publisher VARCHAR (50),
PRIMARY KEY (id)
);
CREATE TABLE librarydb.member (
id INTEGER NOT NULL,
name VARCHAR ( 50 ) NOT NULL,
PRIMARY KEY (id)
);
CREATE TABLE librarydb.issue_history (
book_id INTEGER NOT NULL,
member_id INTEGER NOT NULL,
issue_date DATE,
return_date DATE,
CONSTRAINT fk_book FOREIGN KEY(book_id) REFERENCES librarydb.book(id),
CONSTRAINT fk_member FOREIGN KEY(member_id) REFERENCES librarydb.member(id)
);
INSERT INTO librarydb.book (id, name, author) VALUES (1, 'Book 1', 'ABC');
INSERT INTO librarydb.book (id, name, author) VALUES (2, 'Book 2', 'PQR');
INSERT INTO librarydb.book (id, name, author) VALUES (3, 'Book 3', 'XYZ');
INSERT INTO librarydb.member(id, name) VALUES (1, 'Member 1');
INSERT INTO librarydb.member(id, name) VALUES (2, 'Member 2');
INSERT INTO librarydb.issue_history VALUES (1, 1, TO_DATE('2021-09-27','YYYY-MM-DD'), TO_DATE('2021-09-27','YYYY-MM-DD'));
INSERT INTO librarydb.issue_history VALUES (2, 2, TO_DATE('2021-09-27','YYYY-MM-DD'), NULL);
CREATE VIEW librarydb.book_in_circulation as
SELECT b.id, b.name, b.author, b.publisher, i.member_id, i.issue_date FROM
librarydb.book b
JOIN
librarydb.issue_history i
on b.id=i.book_id
where i.return_date is null;

View File

@ -0,0 +1,82 @@
import subprocess
import sys
import pytest
import requests
from click.testing import CliRunner
from freezegun import freeze_time
from datahub.entrypoints import datahub
from tests.test_helpers import fs_helpers, mce_helpers
from tests.test_helpers.click_helpers import assert_result_ok
from tests.test_helpers.docker_helpers import wait_for_port
FROZEN_TIME = "2021-09-23 12:00:00"


@freeze_time(FROZEN_TIME)
@pytest.mark.skipif(sys.version_info < (3, 7), reason="trino requires Python 3.7+")
@pytest.mark.integration
def test_trino_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Ingest from the dockerised Trino instance and compare with the golden files.

    Two recipes are run against the same Trino server: ``trino_to_file.yml``
    (postgres-backed catalog) and ``trino_hive_to_file.yml`` (hive-backed
    catalog). Each output file is checked against its golden counterpart.
    """
    test_resources_dir = pytestconfig.rootpath / "tests/integration/trino"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "trino"
    ) as docker_services:
        wait_for_port(docker_services, "testtrino", 8080)
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)

        # An open port is not enough: wait until Trino reports it finished starting.
        docker_services.wait_until_responsive(
            timeout=30,
            pause=1,
            check=lambda: requests.get("http://localhost:5300/v1/info").json()[
                "starting"
            ]
            is False,
        )

        # Set up the hive db.
        # The command is passed as an argument list so that no shell is involved.
        subprocess.run(
            [
                "docker",
                "exec",
                "testhiveserver2",
                "/opt/hive/bin/beeline",
                "-u",
                "jdbc:hive2://localhost:10000",
                "-f",
                "/hive_setup.sql",
            ],
            check=True,
        )

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            print(tmp_path)

            # Run the metadata ingestion pipeline for the trino catalog referring to the postgres database.
            config_file = (test_resources_dir / "trino_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert_result_ok(result)

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_mces.json",
                golden_path=test_resources_dir / "trino_mces_golden.json",
            )
            # Limitation 1 - The MCE contains "nullable": true for all fields in the trino database,
            # irrespective of the NOT NULL constraints present in the underlying postgres database.
            # This is an issue with trino, also reported here - https://github.com/trinodb/trino/issues/6400, Related : https://github.com/trinodb/trino/issues/4070
            # Limitation 2 - Dataset properties for a postgres view (view definition, etc.) are not part of the MCE from trino.
            # Postgres views are exposed as tables in trino. This depends on the trino connector implementation - https://trino.io/episodes/18.html

            # Run the metadata ingestion pipeline for the trino catalog referring to the hive database.
            config_file = (test_resources_dir / "trino_hive_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert_result_ok(result)

            # Verify the output; the ignored properties change from run to run.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_hive_mces.json",
                golden_path=test_resources_dir / "trino_hive_mces_golden.json",
                ignore_paths=[
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
                ],
            )
            # Limitation 3 - Fewer DatasetProperties are available through Trino than through the direct hive source - https://trino.io/docs/current/connector/hive.html#table-properties.

View File

@ -0,0 +1,583 @@
[
{
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"column_stats_accurate": "{\"BASIC_STATS\":\"true\"}",
"comment": "This table has array of structs",
"another.comment": "This table has no partitions",
"numfiles": "4",
"numrows": "1",
"rawdatasize": "32",
"totalsize": "138",
"transient_lastddltime": "1633434492"
},
"externalUrl": null,
"description": "This table has array of structs",
"uri": null,
"tags": []
}
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "hivedb.db1.array_struct_test",
"platform": "urn:li:dataPlatform:trino",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"deleted": null,
"dataset": null,
"cluster": null,
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "property_id",
"jsonPath": null,
"nullable": true,
"description": "id of property",
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "service",
"jsonPath": null,
"nullable": true,
"description": "service types and providers",
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.ArrayType": {
"nestedType": null
}
}
},
"nativeDataType": "ARRAY(ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))]))",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
}
],
"primaryKeys": null,
"foreignKeysSpecs": null,
"foreignKeys": null
}
}
]
}
},
"proposedDelta": null,
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"properties": null
}
},
{
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.pokes,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"transient_lastddltime": "1633435441"
},
"externalUrl": null,
"description": null,
"uri": null,
"tags": []
}
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "hivedb.db1.pokes",
"platform": "urn:li:dataPlatform:trino",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"deleted": null,
"dataset": null,
"cluster": null,
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "foo",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "bar",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "baz",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
}
],
"primaryKeys": null,
"foreignKeysSpecs": null,
"foreignKeys": null
}
}
]
}
},
"proposedDelta": null,
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"properties": null
}
},
{
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.struct_test,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"column_stats_accurate": "{\"BASIC_STATS\":\"true\"}",
"numfiles": "0",
"numrows": "0",
"rawdatasize": "0",
"totalsize": "0",
"transient_lastddltime": "1633434486"
},
"externalUrl": null,
"description": null,
"uri": null,
"tags": []
}
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "hivedb.db1.struct_test",
"platform": "urn:li:dataPlatform:trino",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"deleted": null,
"dataset": null,
"cluster": null,
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "property_id",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "service",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NullType": {}
}
},
"nativeDataType": "ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))])",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
}
],
"primaryKeys": null,
"foreignKeysSpecs": null,
"foreignKeys": null
}
}
]
}
},
"proposedDelta": null,
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"properties": null
}
},
{
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.struct_test_view_materialized,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"numfiles": "0",
"totalsize": "0",
"transient_lastddltime": "1633434491"
},
"externalUrl": null,
"description": null,
"uri": null,
"tags": []
}
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "hivedb.db1.struct_test_view_materialized",
"platform": "urn:li:dataPlatform:trino",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"deleted": null,
"dataset": null,
"cluster": null,
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "property_id",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "service",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NullType": {}
}
},
"nativeDataType": "ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))])",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
}
],
"primaryKeys": null,
"foreignKeysSpecs": null,
"foreignKeys": null
}
}
]
}
},
"proposedDelta": null,
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"properties": null
}
},
{
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1._test_table_underscore,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"column_stats_accurate": "{\"BASIC_STATS\":\"true\"}",
"numfiles": "0",
"numrows": "0",
"rawdatasize": "0",
"totalsize": "0",
"transient_lastddltime": "1633434486"
},
"externalUrl": null,
"description": null,
"uri": null,
"tags": []
}
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "hivedb.db1._test_table_underscore",
"platform": "urn:li:dataPlatform:trino",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"deleted": null,
"dataset": null,
"cluster": null,
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "foo",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "bar",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
}
],
"primaryKeys": null,
"foreignKeysSpecs": null,
"foreignKeys": null
}
}
]
}
},
"proposedDelta": null,
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"properties": null
}
},
{
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"transient_lastddltime": "1633434492",
"view_definition": "SELECT \"property_id\", \"service\"\nFROM \"db1\".\"array_struct_test\"",
"is_view": "True"
},
"externalUrl": null,
"description": null,
"uri": null,
"tags": []
}
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "hivedb.db1.array_struct_test_view",
"platform": "urn:li:dataPlatform:trino",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"deleted": null,
"dataset": null,
"cluster": null,
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "property_id",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "service",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.ArrayType": {
"nestedType": null
}
}
},
"nativeDataType": "ARRAY(ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))]))",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
}
],
"primaryKeys": null,
"foreignKeysSpecs": null,
"foreignKeys": null
}
}
]
}
},
"proposedDelta": null,
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-hive-test",
"properties": null
}
}
]

View File

@ -0,0 +1,20 @@
run_id: trino-hive-test
source:
type: trino
config:
# Coordinates
host_port: localhost:5300
database: hivedb
# Credentials
username: foo
schema_pattern:
allow:
- "^db1"
sink:
type: file
config:
filename: "./trino_hive_mces.json"

View File

@ -0,0 +1,498 @@
[
{
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "library_catalog.librarydb.book",
"platform": "urn:li:dataPlatform:trino",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"deleted": null,
"dataset": null,
"cluster": null,
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "id",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "name",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR(length=50)",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "author",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR(length=50)",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "publisher",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR(length=50)",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
}
],
"primaryKeys": null,
"foreignKeysSpecs": null,
"foreignKeys": null
}
}
]
}
},
"proposedDelta": null,
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-test",
"properties": null
}
},
{
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.issue_history,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "library_catalog.librarydb.issue_history",
"platform": "urn:li:dataPlatform:trino",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"deleted": null,
"dataset": null,
"cluster": null,
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "book_id",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "member_id",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "issue_date",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.DateType": {}
}
},
"nativeDataType": "DATE()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "return_date",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.DateType": {}
}
},
"nativeDataType": "DATE()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
}
],
"primaryKeys": null,
"foreignKeysSpecs": null,
"foreignKeys": null
}
}
]
}
},
"proposedDelta": null,
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-test",
"properties": null
}
},
{
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.member,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "library_catalog.librarydb.member",
"platform": "urn:li:dataPlatform:trino",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"deleted": null,
"dataset": null,
"cluster": null,
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "id",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "name",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR(length=50)",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
}
],
"primaryKeys": null,
"foreignKeysSpecs": null,
"foreignKeys": null
}
}
]
}
},
"proposedDelta": null,
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-test",
"properties": null
}
},
{
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book_in_circulation,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "library_catalog.librarydb.book_in_circulation",
"platform": "urn:li:dataPlatform:trino",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown",
"impersonator": null
},
"deleted": null,
"dataset": null,
"cluster": null,
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "id",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "name",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR(length=50)",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "author",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR(length=50)",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "publisher",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR(length=50)",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "member_id",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
},
{
"fieldPath": "issue_date",
"jsonPath": null,
"nullable": true,
"description": null,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.DateType": {}
}
},
"nativeDataType": "DATE()",
"recursive": false,
"globalTags": null,
"glossaryTerms": null,
"isPartOfKey": false
}
],
"primaryKeys": null,
"foreignKeysSpecs": null,
"foreignKeys": null
}
}
]
}
},
"proposedDelta": null,
"systemMetadata": {
"lastObserved": 1632398400000,
"runId": "trino-test",
"properties": null
}
},
{
"auditHeader": null,
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book,PROD)",
"entityKeyAspect": null,
"changeType": "UPSERT",
"aspectName": "datasetProfile",
"aspect": {
"value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 3, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\", \"3\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 1\", \"Book 2\", \"Book 3\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"ABC\", \"PQR\", \"XYZ\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 3, \"nullProportion\": 1.0, \"sampleValues\": []}]}",
"contentType": "application/json"
},
"systemMetadata": null
},
{
"auditHeader": null,
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.issue_history,PROD)",
"entityKeyAspect": null,
"changeType": "UPSERT",
"aspectName": "datasetProfile",
"aspect": {
"value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 2, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"book_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 0.5, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\", \"2021-09-27\"]}, {\"fieldPath\": \"return_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 1, \"nullProportion\": 0.5, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}",
"contentType": "application/json"
},
"systemMetadata": null
},
{
"auditHeader": null,
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.member,PROD)",
"entityKeyAspect": null,
"changeType": "UPSERT",
"aspectName": "datasetProfile",
"aspect": {
"value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 2, \"columnCount\": 2, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Member 1\", \"Member 2\"]}]}",
"contentType": "application/json"
},
"systemMetadata": null
},
{
"auditHeader": null,
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book_in_circulation,PROD)",
"entityKeyAspect": null,
"changeType": "UPSERT",
"aspectName": "datasetProfile",
"aspect": {
"value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 1, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 2\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"PQR\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 1, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}",
"contentType": "application/json"
},
"systemMetadata": null
}
]

View File

@ -0,0 +1,27 @@
run_id: trino-test
source:
  type: trino
  config:
    # Coordinates
    host_port: localhost:5300
    database: postgresqldb
    database_alias: library_catalog
    # Credentials
    username: foo
    schema_pattern:
      allow:
        - "^librarydb"
    profile_pattern:
      allow:
        - "library_catalog.librarydb.*"
    profiling:
      enabled: True
sink:
  type: file
  config:
    filename: "./trino_mces.json"