Mirror of https://github.com/datahub-project/datahub.git (synced 2025-09-05 07:12:27 +00:00)
feat(ingest): add aws athena ingestion source (#2213)
Co-authored-by: thomas.larsson <thomas.larsson@klarna.com>
parent 94e485e4cb
commit 2b470c2d85
@@ -295,6 +295,29 @@ source:
     # table_pattern/schema_pattern is same as above
 ```
+
+### AWS Athena `athena`
+
+Extracts:
+
+- List of databases and tables
+- Column types associated with each table
+
+Extra requirements: `pip install PyAthena[SQLAlchemy]`
+
+```yml
+source:
+  type: athena
+  config:
+    username: aws_access_key_id # Optional. If not specified, credentials are picked up according to boto3 rules
+    # See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
+    password: aws_secret_access_key # Optional.
+    database: database # Optional, defaults to "default"
+    aws_region: aws_region_name # e.g. "eu-west-1"
+    s3_output_location: s3_location # e.g. "s3://<bucket-name>/prefix/"
+    work_group: athena_workgroup # e.g. "primary"
+    # table_pattern/schema_pattern is same as above
+```
 
 ### LDAP `ldap`
 
 Extracts:
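As a quick orientation for the docs above, here is a minimal, illustrative sketch (not part of this commit) of the SQLAlchemy connection URL that such a config boils down to. The region, bucket, and work group values are placeholders, and no credentials are embedded, so boto3's normal credential resolution applies:

```python
# Illustrative only; placeholder region/bucket/work group values.
from urllib.parse import quote_plus

from sqlalchemy import create_engine, inspect  # needs: pip install "PyAthena[SQLAlchemy]"

url = (
    "awsathena+rest://:@athena.eu-west-1.amazonaws.com:443/default"
    f"?s3_staging_dir={quote_plus('s3://my-bucket/athena-results/')}"
    "&work_group=primary"
)
engine = create_engine(url)
print(inspect(engine).get_table_names(schema="default"))  # the tables the source would scan
```

The ingestion source builds this same style of URL internally; see the new `athena.py` file further down.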
@@ -5,4 +5,5 @@ psycopg2-binary # Driver for Postgres
 snowflake-sqlalchemy # Driver for Snowflake
 pybigquery # Driver for BigQuery
 python-ldap>=2.4 # LDAP client library
+PyAthena[SQLAlchemy]<2.0.0 # Driver for AWS Athena
@@ -15,6 +15,7 @@ logger = logging.getLogger(__name__)
 # Set to debug on the root logger.
 logging.getLogger(None).setLevel(logging.DEBUG)
 logging.getLogger("urllib3").setLevel(logging.WARN)
+logging.getLogger("botocore").setLevel(logging.INFO)
 
 # Configure logger.
 BASE_LOGGING_FORMAT = (
metadata-ingestion/src/datahub/ingestion/source/athena.py (new file, 40 lines)
@@ -0,0 +1,40 @@

```python
from typing import Optional
from urllib.parse import quote_plus

from .sql_common import SQLAlchemyConfig, SQLAlchemySource


class AthenaConfig(SQLAlchemyConfig):
    scheme: str = "awsathena+rest"
    username: Optional[str] = None
    password: Optional[str] = None
    database: Optional[str] = None
    aws_region: str
    s3_output_location: str
    work_group: str

    def get_sql_alchemy_url(self):
        url = f"{self.scheme}://"
        if self.username:
            url += f"{quote_plus(self.username)}"
            if self.password:
                url += f":{quote_plus(self.password)}"
        else:
            url += ":"
        url += f"@athena.{self.aws_region}.amazonaws.com:443/"
        if self.database:
            url += f"{self.database}"
        url += f"?s3_staging_dir={quote_plus(self.s3_output_location)}"
        url += f"&work_group={self.work_group}"

        return url


class AthenaSource(SQLAlchemySource):
    def __init__(self, config, ctx):
        super().__init__(config, ctx, "athena")

    @classmethod
    def create(cls, config_dict, ctx):
        config = AthenaConfig.parse_obj(config_dict)
        return cls(config, ctx)
```
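A hypothetical quick check (not part of the commit) of how `AthenaConfig.get_sql_alchemy_url()` composes that URL; the values are placeholders and the import path assumes the file location added above:

```python
from datahub.ingestion.source.athena import AthenaConfig  # assumed import path for the new file

# Only aws_region, s3_output_location, and work_group lack defaults here;
# username, password, and database are optional.
config = AthenaConfig.parse_obj(
    {
        "aws_region": "eu-west-1",
        "s3_output_location": "s3://my-athena-results/prefix/",
        "work_group": "primary",
        "database": "default",
    }
)
print(config.get_sql_alchemy_url())
# awsathena+rest://:@athena.eu-west-1.amazonaws.com:443/default?s3_staging_dir=s3%3A%2F%2Fmy-athena-results%2Fprefix%2F&work_group=primary
```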
@@ -1,6 +1,7 @@
 from datahub.ingestion.api.registry import Registry
 from datahub.ingestion.api.source import Source
 
+from .athena import AthenaSource
 from .bigquery import BigQuerySource
 from .hive import HiveSource
 from .kafka import KafkaSource
@@ -21,6 +22,7 @@ source_registry.register("postgres", PostgresSource)
 source_registry.register("snowflake", SnowflakeSource)
 source_registry.register("bigquery", BigQuerySource)
 source_registry.register("kafka", KafkaSource)
+source_registry.register("athena", AthenaSource)
 
 # Attempt to enable the LDAP source. Because it has some imports that we don't
 # want to install by default, we instead use this approach.
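For context, a hedged sketch of how the registration above is used downstream, assuming the registry exposes a `get()` lookup by key and that a `PipelineContext` can be built from a run id (neither is shown in this diff):

```python
from datahub.ingestion.api.common import PipelineContext  # assumed import path
from datahub.ingestion.source import source_registry

# A recipe with `type: athena` resolves to AthenaSource through the key
# registered above; create() then parses the recipe's config block.
source_cls = source_registry.get("athena")  # assumed Registry.get() lookup
source = source_cls.create(
    {
        "aws_region": "eu-west-1",
        "s3_output_location": "s3://my-athena-results/prefix/",
        "work_group": "primary",
    },
    PipelineContext(run_id="athena-demo"),  # hypothetical run id
)
```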