diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md
index fcca0f80ad..b9b3ec56d8 100644
--- a/metadata-ingestion/README.md
+++ b/metadata-ingestion/README.md
@@ -41,8 +41,9 @@ We use a plugin architecture so that you can install only the dependencies you a
 | hive | `pip install 'acryl-datahub[hive]'` | Hive source |
 | mssql | `pip install 'acryl-datahub[mssql]'` | SQL Server source |
 | mysql | `pip install 'acryl-datahub[mysql]'` | MySQL source |
-| postgres | `pip install 'acryl-datahub[postgres]'` | Postgres source |
 | oracle | `pip install 'acryl-datahub[oracle]'` | Oracle source |
+| postgres | `pip install 'acryl-datahub[postgres]'` | Postgres source |
+| sqlalchemy | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source |
 | snowflake | `pip install 'acryl-datahub[snowflake]'` | Snowflake source |
 | mongodb | `pip install 'acryl-datahub[mongodb]'` | MongoDB source |
 | ldap | `pip install 'acryl-datahub[ldap]'` ([extra requirements]) | LDAP source |
@@ -369,6 +370,29 @@ source:
     # options is same as above
 ```
 
+### Other databases using SQLAlchemy `sqlalchemy`
+
+The `sqlalchemy` source is useful if we don't have a pre-built source for your chosen
+database system, but there is an [SQLAlchemy dialect](https://docs.sqlalchemy.org/en/14/dialects/)
+defined elsewhere. In order to use this, you must `pip install` the required dialect packages yourself.
+
+Extracts:
+
+- List of schemas and tables
+- Column types associated with each table
+
+```yml
+source:
+  type: sqlalchemy
+  config:
+    # See https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls
+    connect_uri: "dialect+driver://username:password@host:port/database"
+    platform: "my_platform" # required: platform name passed to the SQLAlchemy source
+    options: {} # same as above
+    schema_pattern: {} # same as above
+    table_pattern: {} # same as above
+```
+
 ### MongoDB `mongodb`
 
 Extracts:
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index db2b077af7..12f7cebd26 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -161,6 +161,7 @@ setuptools.setup(
         "console_scripts": ["datahub = datahub.entrypoints:datahub"],
         "datahub.ingestion.source.plugins": [
             "file = datahub.ingestion.source.mce_file:MetadataFileSource",
+            "sqlalchemy = datahub.ingestion.source.sql_generic:SQLAlchemyGenericSource",
             "athena = datahub.ingestion.source.athena:AthenaSource",
             "bigquery = datahub.ingestion.source.bigquery:BigQuerySource",
             "dbt = datahub.ingestion.source.dbt:DBTSource",
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql_generic.py b/metadata-ingestion/src/datahub/ingestion/source/sql_generic.py
new file mode 100644
index 0000000000..9e48513150
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql_generic.py
@@ -0,0 +1,29 @@
+from datahub.ingestion.api.common import PipelineContext
+from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+
+
+class SQLAlchemyGenericConfig(SQLAlchemyConfig):
+    """Config for the generic `sqlalchemy` source."""
+
+    # No default is declared, so the recipe must supply it; it is forwarded to
+    # SQLAlchemySource.__init__ as the platform argument.
+    platform: str
+    # Connection URI returned unchanged by get_sql_alchemy_url().
+    connect_uri: str
+
+    def get_sql_alchemy_url(self) -> str:
+        """Return the user-supplied connect_uri unchanged."""
+        return self.connect_uri
+
+
+class SQLAlchemyGenericSource(SQLAlchemySource):
+    """Source whose platform and connection URI come from its config."""
+
+    def __init__(self, config: SQLAlchemyGenericConfig, ctx: PipelineContext):
+        super().__init__(config, ctx, config.platform)
+
+    @classmethod
+    def create(cls, config_dict, ctx):
+        """Validate config_dict as a SQLAlchemyGenericConfig and build the source."""
+        config = SQLAlchemyGenericConfig.parse_obj(config_dict)
+        return cls(config, ctx)