feat(ingest): add generic sqlalchemy source (#2389)

This commit is contained in:
Harshal Sheth 2021-04-13 08:01:38 -07:00 committed by GitHub
parent 3ddf163a91
commit fb6f74b1da
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 1 deletions

View File

@ -41,8 +41,9 @@ We use a plugin architecture so that you can install only the dependencies you a
| hive | `pip install 'acryl-datahub[hive]'` | Hive source |
| mssql | `pip install 'acryl-datahub[mssql]'` | SQL Server source |
| mysql | `pip install 'acryl-datahub[mysql]'` | MySQL source |
| postgres | `pip install 'acryl-datahub[postgres]'` | Postgres source |
| oracle | `pip install 'acryl-datahub[oracle]'` | Oracle source |
| postgres | `pip install 'acryl-datahub[postgres]'` | Postgres source |
| sqlalchemy | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source |
| snowflake | `pip install 'acryl-datahub[snowflake]'` | Snowflake source |
| mongodb | `pip install 'acryl-datahub[mongodb]'` | MongoDB source |
| ldap | `pip install 'acryl-datahub[ldap]'` ([extra requirements]) | LDAP source |
@ -369,6 +370,28 @@ source:
# options is same as above
```
### Other databases using SQLAlchemy `sqlalchemy`
The `sqlalchemy` source is useful if we don't have a pre-built source for your chosen
database system, but there is an [SQLAlchemy dialect](https://docs.sqlalchemy.org/en/14/dialects/)
defined elsewhere. In order to use this, you must `pip install` the required dialect packages yourself.
Extracts:
- List of schemas and tables
- Column types associated with each table
```yml
source:
  type: sqlalchemy
  config:
    # Name of the platform (database system) being ingested; this field has no default
    platform: my_platform
    # See https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls
    connect_uri: "dialect+driver://username:password@host:port/database"
    options: {} # same as above
    schema_pattern: {} # same as above
    table_pattern: {} # same as above
```
### MongoDB `mongodb`
Extracts:

View File

@ -161,6 +161,7 @@ setuptools.setup(
"console_scripts": ["datahub = datahub.entrypoints:datahub"],
"datahub.ingestion.source.plugins": [
"file = datahub.ingestion.source.mce_file:MetadataFileSource",
"sqlalchemy = datahub.ingestion.source.sql_generic:SQLAlchemyGenericSource",
"athena = datahub.ingestion.source.athena:AthenaSource",
"bigquery = datahub.ingestion.source.bigquery:BigQuerySource",
"dbt = datahub.ingestion.source.dbt:DBTSource",

View File

@ -0,0 +1,20 @@
from datahub.ingestion.api.common import PipelineContext
from .sql_common import SQLAlchemyConfig, SQLAlchemySource
class SQLAlchemyGenericConfig(SQLAlchemyConfig):
    """Configuration for the generic SQLAlchemy source.

    Extends ``SQLAlchemyConfig`` with a platform name and a ready-made
    connection URI, so any database with an SQLAlchemy dialect can be
    targeted without a dedicated source.
    """

    # Platform name; SQLAlchemyGenericSource forwards it to its base class.
    platform: str
    # Full SQLAlchemy connection URI, handed back verbatim by
    # get_sql_alchemy_url().
    connect_uri: str

    def get_sql_alchemy_url(self):
        """Return the configured ``connect_uri`` unchanged."""
        uri = self.connect_uri
        return uri
class SQLAlchemyGenericSource(SQLAlchemySource):
    """Generic SQLAlchemy-backed source whose platform comes from its config."""

    def __init__(self, config: SQLAlchemyGenericConfig, ctx: PipelineContext):
        """Initialise the base source with ``config.platform`` as its third argument."""
        platform = config.platform
        super().__init__(config, ctx, platform)

    @classmethod
    def create(cls, config_dict, ctx):
        """Parse ``config_dict`` into a ``SQLAlchemyGenericConfig`` and build the source."""
        return cls(SQLAlchemyGenericConfig.parse_obj(config_dict), ctx)