mirror of https://github.com/datahub-project/datahub.git
synced 2025-10-04 05:26:24 +00:00

Add bigquery and refactor others
parent cbbdf0930a
commit b91d0cf63b
metadata-ingestion/CHANGELOG (new file, 3 lines)
@@ -0,0 +1,3 @@
+0.0.1
+-----
+* Modernizing python scripts and creating first package
@@ -160,7 +160,7 @@ Extracts:
 - List of databases, schema, and tables
 - Column types associated with each table

-Extra requirements: `pip install psycopg2-binary`
+Extra requirements: `pip install psycopg2-binary` or `pip install psycopg2`

 ```yml
 source:
@@ -190,6 +190,23 @@ source:
   # table_pattern is same as above
 ```

+## Google BigQuery `bigquery`
+Extracts:
+- List of databases, schema, and tables
+- Column types associated with each table
+
+Extra requirements: `pip install pybigquery`
+
+```yml
+source:
+  type: bigquery
+  config:
+    project_id: project
+    options:
+      credential_path: "/path/to/keyfile.json"
+  # table_pattern is same as above
+```
+
 ## File `file`
 Pulls metadata from a previously generated file. Note that the file sink
 can produce such files, and a number of samples are included in the
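Editor's note: as a quick orientation, here is a minimal sketch (not part of the commit) of how the `config` block in the recipe above is consumed by the `BigQueryConfig` class added later in this diff; the values mirror the yml example and are illustrative only.

```python
# Sketch only: feed the recipe's source.config block into BigQueryConfig
# (added in metadata-ingestion/src/gometa/ingestion/source/bigquery.py below).
from gometa.ingestion.source.bigquery import BigQueryConfig

recipe_config = {
    "project_id": "project",  # mirrors the yml example above
    "options": {"credential_path": "/path/to/keyfile.json"},
}

config = BigQueryConfig.parse_obj(recipe_config)  # pydantic BaseModel parsing
assert config.get_sql_alchemy_url() == "bigquery://project"
```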
@@ -265,9 +282,10 @@ pytest tests/integration

 ## Sanity check code before checkin
 ```sh
-flake8 src tests
-mypy -p gometa
+# Requires test_requirements.txt to have been installed.
 black --exclude 'gometa/metadata' -S -t py36 src tests
 isort src tests
+flake8 src tests
+mypy -p gometa
 pytest
 ```
@@ -71,12 +71,13 @@ setuptools.setup(
 "toml>=0.10.0",
 "pydantic>=1.5.1",
 "requests>=2.25.1",
-"confluent_kafka[avro]>=1.5.0",
 "avro_gen @ https://api.github.com/repos/hsheth2/avro_gen/tarball/master",
 # Note: we currently require both Avro libraries. The codegen uses avro-python3
 # schema parsers at runtime for generating and reading JSON into Python objects.
 # At the same time, we use Kafka's AvroSerializer, which internally relies on
-# fastavro for serialization.
+# fastavro for serialization. We do not use confluent_kafka[avro], since it
+# is incompatible with its own dep on avro-python3.
+"confluent_kafka>=1.5.0",
 "fastavro>=1.3.0",
 "avro-python3>=1.8.2",
 "sqlalchemy>=1.3.23", # Required for SQL sources
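Editor's note: a short illustrative sketch of the fastavro side of the comment above; the schema and record are made up, but this is the kind of serialization the Kafka AvroSerializer performs internally (avro-python3, by contrast, is only exercised by the codegen'd classes at runtime).

```python
# Illustrative only: fastavro serializing a record against an Avro schema,
# which is what confluent_kafka's AvroSerializer relies on under the hood.
import io

import fastavro

schema = fastavro.parse_schema(
    {
        "type": "record",
        "name": "Example",
        "fields": [{"name": "urn", "type": "string"}],
    }
)
buf = io.BytesIO()
fastavro.schemaless_writer(buf, schema, {"urn": "urn:li:example:1"})
print(len(buf.getvalue()), "bytes serialized")
```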
@@ -2,13 +2,13 @@ from typing import Dict, Type

 from gometa.ingestion.api.source import Source

-from .kafka import KafkaSource
-
 # from .ldap import LDAPSource
+from .bigquery import BigQuerySource
+from .hive import HiveSource
+from .kafka import KafkaSource
 from .mce_file import MetadataFileSource
 from .mssql import SQLServerSource
 from .mysql import MySQLSource
-from .hive import HiveSource
 from .postgres import PostgresSource
 from .snowflake import SnowflakeSource

@@ -18,6 +18,7 @@ source_class_mapping: Dict[str, Type[Source]] = {
     "hive": HiveSource,
     "postgres": PostgresSource,
     "snowflake": SnowflakeSource,
+    "bigquery": BigQuerySource,
     "kafka": KafkaSource,
     # "ldap": LDAPSource,
     "file": MetadataFileSource,
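Editor's note: a minimal sketch (not part of the commit) of how this registry is typically consumed: the recipe's `type` string selects a class from `source_class_mapping`, and that class's `create` classmethod builds the source; `ctx` stands in for whatever pipeline context object the framework passes.

```python
# Sketch: dispatching a recipe's source section through the registry above.
from gometa.ingestion.source import source_class_mapping


def build_source(source_type: str, config_dict: dict, ctx):
    source_cls = source_class_mapping[source_type]  # e.g. "bigquery" -> BigQuerySource
    return source_cls.create(config_dict, ctx)  # each source exposes create(config_dict, ctx)
```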
metadata-ingestion/src/gometa/ingestion/source/bigquery.py (new file, 21 lines)
@@ -0,0 +1,21 @@
+from typing import Optional
+
+from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+
+
+class BigQueryConfig(SQLAlchemyConfig):
+    scheme = "bigquery"
+    project_id: Optional[str]
+
+    def get_sql_alchemy_url(self):
+        return f"{self.scheme}://{self.project_id}"
+
+
+class BigQuerySource(SQLAlchemySource):
+    def __init__(self, config, ctx):
+        super().__init__(config, ctx, "bigquery")
+
+    @classmethod
+    def create(cls, config_dict, ctx):
+        config = BigQueryConfig.parse_obj(config_dict)
+        return cls(config, ctx)
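Editor's note: `project_id` is declared `Optional`, so pydantic defaults it to `None` when a recipe omits it; a tiny sketch (not part of the commit) of what `get_sql_alchemy_url` then returns, since the f-string simply interpolates whatever value is present.

```python
# Sketch: URL construction with and without an explicit project_id.
from gometa.ingestion.source.bigquery import BigQueryConfig

config = BigQueryConfig.parse_obj({"project_id": "my-project"})
print(config.get_sql_alchemy_url())  # bigquery://my-project

config = BigQueryConfig.parse_obj({})
print(config.get_sql_alchemy_url())  # bigquery://None -- pydantic defaults the Optional field to None
```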
@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource


-class HiveConfig(SQLAlchemyConfig):
+class HiveConfig(BasicSQLAlchemyConfig):
     # defaults
     scheme = "hive"

@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource


-class SQLServerConfig(SQLAlchemyConfig):
+class SQLServerConfig(BasicSQLAlchemyConfig):
     # defaults
     host_port = "localhost:1433"
     scheme = "mssql+pytds"
@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource


-class MySQLConfig(SQLAlchemyConfig):
+class MySQLConfig(BasicSQLAlchemyConfig):
     # defaults
     host_port = "localhost:3306"
     scheme = "mysql+pymysql"
@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource


-class PostgresConfig(SQLAlchemyConfig):
+class PostgresConfig(BasicSQLAlchemyConfig):
     # defaults
     scheme = "postgresql+psycopg2"

@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource


-class SnowflakeConfig(SQLAlchemyConfig):
+class SnowflakeConfig(BasicSQLAlchemyConfig):
     # defaults
     scheme = "snowflake"

@@ -1,5 +1,6 @@
 import logging
 import time
+from abc import abstractmethod
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional

@@ -48,17 +49,23 @@ class SQLSourceReport(SourceReport):


 class SQLAlchemyConfig(BaseModel):
+    options: Optional[dict] = {}
+    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
+
+    @abstractmethod
+    def get_sql_alchemy_url(self):
+        pass
+
+
+class BasicSQLAlchemyConfig(SQLAlchemyConfig):
     username: str
     password: str
     host_port: str
     database: str = ""
     scheme: str
-    options: Optional[dict] = {}
-    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

     def get_sql_alchemy_url(self):
         url = f"{self.scheme}://{self.username}:{self.password}@{self.host_port}/{self.database}"
         logger.debug(f"sql_alchemy_url={url}")
         return url

-
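Editor's note: to make the refactor concrete, a small sketch (not part of the commit) of the URL `BasicSQLAlchemyConfig.get_sql_alchemy_url` produces for one of the config subclasses changed earlier in this diff; the credentials are placeholders, and `host_port`/`scheme` come from the defaults in `mysql.py`.

```python
# Sketch: BasicSQLAlchemyConfig URL construction via the MySQL subclass.
from gometa.ingestion.source.mysql import MySQLConfig

config = MySQLConfig.parse_obj(
    {"username": "user", "password": "pass", "database": "db"}  # placeholder credentials
)
print(config.get_sql_alchemy_url())  # mysql+pymysql://user:pass@localhost:3306/db
```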
@@ -145,6 +152,7 @@ class SQLAlchemySource(Source):
         sql_config = self.config
         platform = self.platform
         url = sql_config.get_sql_alchemy_url()
+        logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **sql_config.options)
         inspector = reflection.Inspector.from_engine(engine)
         database = sql_config.database
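Editor's note: the `**sql_config.options` expansion above is how a recipe's `options` mapping reaches SQLAlchemy; a minimal sketch (not part of the commit) using a stock SQLAlchemy keyword argument and an in-memory SQLite URL as a stand-in for a real source.

```python
# Sketch: recipe "options" become keyword arguments to create_engine.
from sqlalchemy import create_engine

options = {"echo": True}  # e.g. taken from the recipe's options block; echo is a standard SQLAlchemy flag
engine = create_engine("sqlite://", **options)  # in-memory SQLite stands in for a real source URL
```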