diff --git a/datahub-web-react/src/images/clickhouselogo.png b/datahub-web-react/src/images/clickhouselogo.png new file mode 100644 index 0000000000..89ee65329e Binary files /dev/null and b/datahub-web-react/src/images/clickhouselogo.png differ diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 3bb01757a3..176e72b5fa 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -111,6 +111,8 @@ plugins: Dict[str, Set[str]] = { "azure-ad": set(), "bigquery": sql_common | bigquery_common | {"pybigquery >= 0.6.0"}, "bigquery-usage": bigquery_common | {"cachetools"}, + "clickhouse": sql_common | {"clickhouse-sqlalchemy==0.1.8"}, + "clickhouse-usage": sql_common | {"clickhouse-sqlalchemy==0.1.8"}, "datahub-business-glossary": set(), "data-lake": {*aws_common, "pydeequ==1.0.1", "pyspark==3.0.3", "parse==1.19.0"}, "dbt": {"requests"}, @@ -216,6 +218,8 @@ base_dev_requirements = { for plugin in [ "bigquery", "bigquery-usage", + "clickhouse", + "clickhouse-usage", "elasticsearch", "looker", "glue", @@ -268,6 +272,7 @@ full_test_dev_requirements = { for plugin in [ # Only include Athena for Python 3.7 or newer. 
*(["athena"] if is_py37_or_newer else []), + "clickhouse", "druid", "feast", "hive", @@ -293,6 +298,8 @@ entry_points = { "azure-ad = datahub.ingestion.source.identity.azure_ad:AzureADSource", "bigquery = datahub.ingestion.source.sql.bigquery:BigQuerySource", "bigquery-usage = datahub.ingestion.source.usage.bigquery_usage:BigQueryUsageSource", + "clickhouse = datahub.ingestion.source.sql.clickhouse:ClickHouseSource", + "clickhouse-usage = datahub.ingestion.source.usage.clickhouse_usage:ClickHouseUsageSource", "data-lake = datahub.ingestion.source.data_lake:DataLakeSource", "dbt = datahub.ingestion.source.dbt:DBTSource", "druid = datahub.ingestion.source.sql.druid:DruidSource", diff --git a/metadata-ingestion/source_docs/clickhouse.md b/metadata-ingestion/source_docs/clickhouse.md new file mode 100644 index 0000000000..fe3cd8429d --- /dev/null +++ b/metadata-ingestion/source_docs/clickhouse.md @@ -0,0 +1,177 @@ +# ClickHouse + +For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md). + +## Setup + +To install this plugin, run `pip install 'acryl-datahub[clickhouse]'`. + +## Capabilities + +This plugin extracts the following: + +- Metadata for tables, views, materialized views and dictionaries +- Column types associated with each table(except *AggregateFunction and DateTime with timezone) +- Table, row, and column statistics via optional [SQL profiling](./sql_profiles.md) +- Table, view, materialized view and dictionary(with CLICKHOUSE source_type) lineage + +:::tip + +You can also get fine-grained usage statistics for ClickHouse using the `clickhouse-usage` source described below. + +::: + +## Quickstart recipe + +Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options. + +For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes). 
+ +```yml +source: + type: clickhouse + config: + # Coordinates + host_port: localhost:9000 + + # Credentials + username: user + password: pass + + # Options + platform_instance: DatabaseNameToBeIngested + + include_views: True # whether to include views, defaults to True + include_tables: True # whether to include tables, defaults to True + +sink: + # sink configs +``` + +
+ Extra options to use an encrypted connection or a different interface + +For the HTTP interface: +```yml +source: + type: clickhouse + config: + host_port: localhost:8443 + protocol: https + +``` + +For the Native interface: +```yml +source: + type: clickhouse + config: + host_port: localhost:9440 + scheme: clickhouse+native + secure: True +``` + +
+ +## Config details + +Like all SQL-based sources, the ClickHouse integration supports: +- Stale Metadata Deletion: See [here](./stateful_ingestion.md) for more details on configuration. +- SQL Profiling: See [here](./sql_profiles.md) for more details on configuration. + +Note that a `.` is used to denote nested fields in the YAML recipe. + +| Field | Required | Default | Description | +|-----------------------------|----------|----------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `username` | | | ClickHouse username. | +| `password` | | | ClickHouse password. | +| `host_port` | ✅ | | ClickHouse host URL. | +| `database` | | | ClickHouse database to connect. | +| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. | +| `platform_instance` | | None | The Platform instance to use while constructing URNs. | +| `options.