From 03e3d49445786d7c13e2ece4f85a8a0fced2fbb3 Mon Sep 17 00:00:00 2001
From: Mars Lan
Date: Wed, 12 Aug 2020 08:51:39 -0700
Subject: [PATCH] feat(ingest): add example crawler for MS SQL (#1803)

Also fix the incorrect assumption on column comments & add sample docker-compose file
---
Review notes (free text between the "---" separator and the first
"diff --git" header is not part of any hunk and is skipped on apply):

- docs/features.md: adds "MS SQL" to the RDBMS bullet of the example
  ETL script list.
- common.py: removes the module-level URL and OPTIONS constants, and
  reads the column description with column.get("comment", None) instead
  of column["comment"], so a column mapping without a "comment" key
  yields None (assuming `column` is a dict, it previously raised KeyError).
- mssql_etl.py: defines URL, OPTIONS and PLATFORM itself and calls
  run(URL, OPTIONS, PLATFORM) at module top level. URL is an empty
  placeholder; the inline example shows the expected mssql+pytds form.
- mssql.yml: the only service is keyed "postgres" although its image is
  mcr.microsoft.com/mssql/server -- NOTE(review): looks like a leftover
  name from another compose file; confirm before reuse.
- mssql.yml, mssql_etl.py and mssql_etl.txt are all added without a
  trailing newline (see the "\ No newline at end of file" markers).
- NOTE(review): line breaks and leading whitespace inside the hunks below
  were restored from a whitespace-collapsed copy of this patch; verify
  them against commit 03e3d49 before applying.

 docs/features.md                         |  2 +-
 metadata-ingestion/sql-etl/common.py     |  6 +-----
 metadata-ingestion/sql-etl/mssql.yml     | 12 ++++++++++++
 metadata-ingestion/sql-etl/mssql_etl.py  |  8 ++++++++
 metadata-ingestion/sql-etl/mssql_etl.txt |  1 +
 5 files changed, 23 insertions(+), 6 deletions(-)
 create mode 100644 metadata-ingestion/sql-etl/mssql.yml
 create mode 100644 metadata-ingestion/sql-etl/mssql_etl.py
 create mode 100644 metadata-ingestion/sql-etl/mssql_etl.txt

diff --git a/docs/features.md b/docs/features.md
index 5f3b235a4a..b5b0a50ff8 100644
--- a/docs/features.md
+++ b/docs/features.md
@@ -54,6 +54,6 @@ You can integrate any data platform to DataHub easily. As long as you have a way
 We have provided example [ETL ingestion](architecture/metadata-ingestion.md) scripts for:
  - Hive
  - Kafka
- - RDBMS (MySQL, Oracle, Postgres etc)
+ - RDBMS (MySQL, Oracle, Postgres, MS SQL etc)
  - Data warehouse (Snowflake, BigQuery etc)
  - LDAP
diff --git a/metadata-ingestion/sql-etl/common.py b/metadata-ingestion/sql-etl/common.py
index 43d8bd1f6b..d26c099dea 100644
--- a/metadata-ingestion/sql-etl/common.py
+++ b/metadata-ingestion/sql-etl/common.py
@@ -8,10 +8,6 @@ from sqlalchemy import create_engine
 from sqlalchemy import types
 from sqlalchemy.engine import reflection
 
-URL = 'mysql+pymysql://datahub:datahub@localhost:3306' # e.g. mysql+pymysql://username:password@hostname:port
-OPTIONS = {} # e.g. {"encoding": "latin1"}
-
-
 @dataclass
 class KafkaConfig:
     avsc_path = '../../metadata-events/mxe-schemas/src/renamed/avro/com/linkedin/mxe/MetadataChangeEvent.avsc'
@@ -57,7 +53,7 @@ def build_dataset_mce(platform, dataset_name, columns):
             "fieldPath": column["name"],
             "nativeDataType": repr(column["type"]),
             "type": { "type":get_column_type(column["type"]) },
-            "description": column["comment"]
+            "description": column.get("comment", None)
         })
 
     schema_metadata = {
diff --git a/metadata-ingestion/sql-etl/mssql.yml b/metadata-ingestion/sql-etl/mssql.yml
new file mode 100644
index 0000000000..6761fdbe73
--- /dev/null
+++ b/metadata-ingestion/sql-etl/mssql.yml
@@ -0,0 +1,12 @@
+version: '3.1'
+
+services:
+
+  postgres:
+    image: mcr.microsoft.com/mssql/server
+    restart: always
+    environment:
+      ACCEPT_EULA: Y
+      SA_PASSWORD: DatahubR0cks
+    ports:
+      - "1433:1433"
\ No newline at end of file
diff --git a/metadata-ingestion/sql-etl/mssql_etl.py b/metadata-ingestion/sql-etl/mssql_etl.py
new file mode 100644
index 0000000000..4426e9a5c9
--- /dev/null
+++ b/metadata-ingestion/sql-etl/mssql_etl.py
@@ -0,0 +1,8 @@
+from common import run
+
+# See https://github.com/m32/sqlalchemy-tds for more details
+URL = '' # e.g. mssql+pytds://username:password@hostname:port
+OPTIONS = {}
+PLATFORM = 'mssql'
+
+run(URL, OPTIONS, PLATFORM)
\ No newline at end of file
diff --git a/metadata-ingestion/sql-etl/mssql_etl.txt b/metadata-ingestion/sql-etl/mssql_etl.txt
new file mode 100644
index 0000000000..29eb6c4d19
--- /dev/null
+++ b/metadata-ingestion/sql-etl/mssql_etl.txt
@@ -0,0 +1 @@
+sqlalchemy-pytds==0.3
\ No newline at end of file