From c2065bd7fe2767b10fd6300dfcdef37352d5762e Mon Sep 17 00:00:00 2001 From: Alexander Chashnikov <6350825+ne1r0n@users.noreply.github.com> Date: Mon, 21 Feb 2022 17:36:08 +0200 Subject: [PATCH] feat(ingest): clickhouse - add initial support (#4057) Co-authored-by: Shirshanka Das --- .../src/images/clickhouselogo.png | Bin 0 -> 27285 bytes metadata-ingestion/setup.py | 7 + metadata-ingestion/source_docs/clickhouse.md | 177 ++ .../ingestion/source/sql/clickhouse.py | 624 +++++ .../ingestion/source/sql/sql_common.py | 2 + .../source/usage/clickhouse_usage.py | 226 ++ .../clickhouse/clickhouse_mces_golden.json | 2135 +++++++++++++++++ .../clickhouse/clickhouse_to_file.yml | 32 + .../integration/clickhouse/docker-compose.yml | 14 + .../integration/clickhouse/setup/setup.sql | 144 ++ .../integration/clickhouse/test_clickhouse.py | 32 + .../tests/unit/test_clickhouse_source.py | 55 + .../main/resources/boot/data_platforms.json | 10 + 13 files changed, 3458 insertions(+) create mode 100644 datahub-web-react/src/images/clickhouselogo.png create mode 100644 metadata-ingestion/source_docs/clickhouse.md create mode 100644 metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py create mode 100644 metadata-ingestion/tests/integration/clickhouse/clickhouse_mces_golden.json create mode 100644 metadata-ingestion/tests/integration/clickhouse/clickhouse_to_file.yml create mode 100644 metadata-ingestion/tests/integration/clickhouse/docker-compose.yml create mode 100644 metadata-ingestion/tests/integration/clickhouse/setup/setup.sql create mode 100644 metadata-ingestion/tests/integration/clickhouse/test_clickhouse.py create mode 100644 metadata-ingestion/tests/unit/test_clickhouse_source.py diff --git a/datahub-web-react/src/images/clickhouselogo.png b/datahub-web-react/src/images/clickhouselogo.png new file mode 100644 index 
0000000000000000000000000000000000000000..89ee65329ea90d306477826f9200fb5dfb4290d5 GIT binary patch literal 27285 zcmeHQ?@Lor7(UL;=4x}=3>1qyk|YJ21)Gre%_J=keVA0Wq-o48(V1uzQ~SJm)#*J?}ZU zCL`S_mxam*A@bA|Llzc3E@bpL4WX6!{l)0Wl^Te&=qeMC$u?q zAyQF5bG~NgnmXc;?Du2q!#{CTb#d~}(57$IFN{}-c7DiI*tGaM-dyR= zqB@Ut1m|vd%Gw$%67s98+8DrJ27TFs#bqoe;nGh+oTBXUVQ*@=i@q%8^vZQC);Tx@ zi|t~47>kL&BUXrm@B=GY03xu6QUO8}x(9R>3<~t7FdWJSSSHg1EEASVxd6*#n!xB9 z6A;P;%+Ul*z%m&WuuNDc;}HAom}-qW!1Al*6j63?vxy2iVAQ5 z4&VSd01kiy-~c!P4uAuzl>?FOKeVhCz;%~a>viJVFwX)9zyWXo8~_LYjRVR`TZXS% zCDBiBj&1RO#|7;(i#D3{z5DAUC0m=Xuarn2$&VzzOKD`S)NJPE?4j)?#BMpgWE~u| z%qh6BO+t}s+d$l|#qmI0@0Xpy!Z%Tq%Ai2mE<=J~)m(rAtL6*}7%B{vasjL6OcP$d zYR;MoKVV1@_yL0gegHq9T!0@iO~4OOcuctf%M>)>VaxnbFr_%jiS3*;`g!}SbK4A! zn$iBZx9(p*8Czkt#*1lyU)n3BQD25&|7S2hag@fODSN7!mU)hJ(0z6xv`_}T^@zyb*{73mtXKIcok&=G|J=faiO?|f4IPs z>lp#duy{tVINhIeju5OtFeKK>enW3qlm2NLQn(&CLi`qToMTZ}gnO8+bU)_= 0.6.0"}, "bigquery-usage": bigquery_common | {"cachetools"}, + "clickhouse": sql_common | {"clickhouse-sqlalchemy==0.1.8"}, + "clickhouse-usage": sql_common | {"clickhouse-sqlalchemy==0.1.8"}, "datahub-business-glossary": set(), "data-lake": {*aws_common, "pydeequ==1.0.1", "pyspark==3.0.3", "parse==1.19.0"}, "dbt": {"requests"}, @@ -216,6 +218,8 @@ base_dev_requirements = { for plugin in [ "bigquery", "bigquery-usage", + "clickhouse", + "clickhouse-usage", "elasticsearch", "looker", "glue", @@ -268,6 +272,7 @@ full_test_dev_requirements = { for plugin in [ # Only include Athena for Python 3.7 or newer. 
*(["athena"] if is_py37_or_newer else []), + "clickhouse", "druid", "feast", "hive", @@ -293,6 +298,8 @@ entry_points = { "azure-ad = datahub.ingestion.source.identity.azure_ad:AzureADSource", "bigquery = datahub.ingestion.source.sql.bigquery:BigQuerySource", "bigquery-usage = datahub.ingestion.source.usage.bigquery_usage:BigQueryUsageSource", + "clickhouse = datahub.ingestion.source.sql.clickhouse:ClickHouseSource", + "clickhouse-usage = datahub.ingestion.source.usage.clickhouse_usage:ClickHouseUsageSource", "data-lake = datahub.ingestion.source.data_lake:DataLakeSource", "dbt = datahub.ingestion.source.dbt:DBTSource", "druid = datahub.ingestion.source.sql.druid:DruidSource", diff --git a/metadata-ingestion/source_docs/clickhouse.md b/metadata-ingestion/source_docs/clickhouse.md new file mode 100644 index 0000000000..fe3cd8429d --- /dev/null +++ b/metadata-ingestion/source_docs/clickhouse.md @@ -0,0 +1,177 @@ +# ClickHouse + +For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md). + +## Setup + +To install this plugin, run `pip install 'acryl-datahub[clickhouse]'`. + +## Capabilities + +This plugin extracts the following: + +- Metadata for tables, views, materialized views and dictionaries +- Column types associated with each table (except *AggregateFunction and DateTime with timezone) +- Table, row, and column statistics via optional [SQL profiling](./sql_profiles.md) +- Table, view, materialized view and dictionary (with CLICKHOUSE source_type) lineage + +:::tip + +You can also get fine-grained usage statistics for ClickHouse using the `clickhouse-usage` source described below. + +::: + +## Quickstart recipe + +Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options. + +For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes). 
+ +```yml +source: + type: clickhouse + config: + # Coordinates + host_port: localhost:9000 + + # Credentials + username: user + password: pass + + # Options + platform_instance: DatabaseNameToBeIngested + + include_views: True # whether to include views, defaults to True + include_tables: True # whether to include tables, defaults to True + +sink: + # sink configs +``` + +
+ Extra options to use an encrypted connection or a different interface + +For the HTTP interface: +```yml +source: + type: clickhouse + config: + host_port: localhost:8443 + protocol: https + +``` + +For the Native interface: +```yml +source: + type: clickhouse + config: + host_port: localhost:9440 + scheme: clickhouse+native + secure: True +``` + +
+ +## Config details + +Like all SQL-based sources, the ClickHouse integration supports: +- Stale Metadata Deletion: See [here](./stateful_ingestion.md) for more details on configuration. +- SQL Profiling: See [here](./sql_profiles.md) for more details on configuration. + +Note that a `.` is used to denote nested fields in the YAML recipe. + +| Field | Required | Default | Description | +|-----------------------------|----------|----------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `username` | | | ClickHouse username. | +| `password` | | | ClickHouse password. | +| `host_port` | ✅ | | ClickHouse host URL. | +| `database` | | | ClickHouse database to connect. | +| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. | +| `platform_instance` | | None | The Platform instance to use while constructing URNs. | +| `options.