Add bigquery and refactor others
parent cbbdf0930a
commit b91d0cf63b
metadata-ingestion/CHANGELOG (new file)
@@ -0,0 +1,3 @@
+0.0.1
+-----
+* Modernizing python scripts and creating first package
@@ -160,7 +160,7 @@ Extracts:
 - List of databases, schema, and tables
 - Column types associated with each table
 
-Extra requirements: `pip install psycopg2-binary`
+Extra requirements: `pip install psycopg2-binary` or `pip install psycopg2`
 
 ```yml
 source:
@@ -190,6 +190,23 @@ source:
     # table_pattern is same as above
 ```
 
+## Google BigQuery `bigquery`
+Extracts:
+- List of databases, schema, and tables
+- Column types associated with each table
+
+Extra requirements: `pip install pybigquery`
+
+```yml
+source:
+  type: bigquery
+  config:
+    project_id: project
+    options:
+      credential_path: "/path/to/keyfile.json"
+    # table_pattern is same as above
+```
+
 ## File `file`
 Pulls metadata from a previously generated file. Note that the file sink
 can produce such files, and a number of samples are included in the
@@ -265,9 +282,10 @@ pytest tests/integration
 
 ## Sanity check code before checkin
 ```sh
-flake8 src tests
-mypy -p gometa
+# Requires test_requirements.txt to have been installed.
 black --exclude 'gometa/metadata' -S -t py36 src tests
 isort src tests
+flake8 src tests
+mypy -p gometa
 pytest
 ```
@@ -71,12 +71,13 @@ setuptools.setup(
         "toml>=0.10.0",
         "pydantic>=1.5.1",
         "requests>=2.25.1",
-        "confluent_kafka[avro]>=1.5.0",
         "avro_gen @ https://api.github.com/repos/hsheth2/avro_gen/tarball/master",
         # Note: we currently require both Avro libraries. The codegen uses avro-python3
         # schema parsers at runtime for generating and reading JSON into Python objects.
         # At the same time, we use Kafka's AvroSerializer, which internally relies on
-        # fastavro for serialization.
+        # fastavro for serialization. We do not use confluent_kafka[avro], since it
+        # is incompatible with its own dep on avro-python3.
+        "confluent_kafka>=1.5.0",
         "fastavro>=1.3.0",
         "avro-python3>=1.8.2",
         "sqlalchemy>=1.3.23", # Required for SQL sources
@@ -2,13 +2,13 @@ from typing import Dict, Type
 
 from gometa.ingestion.api.source import Source
 
-from .kafka import KafkaSource
-
 # from .ldap import LDAPSource
+from .bigquery import BigQuerySource
+from .hive import HiveSource
+from .kafka import KafkaSource
 from .mce_file import MetadataFileSource
 from .mssql import SQLServerSource
 from .mysql import MySQLSource
-from .hive import HiveSource
 from .postgres import PostgresSource
 from .snowflake import SnowflakeSource
 
@@ -18,6 +18,7 @@ source_class_mapping: Dict[str, Type[Source]] = {
     "hive": HiveSource,
     "postgres": PostgresSource,
     "snowflake": SnowflakeSource,
+    "bigquery": BigQuerySource,
     "kafka": KafkaSource,
     # "ldap": LDAPSource,
     "file": MetadataFileSource,
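With this entry in place, a recipe's `type: bigquery` resolves to `BigQuerySource` through `source_class_mapping`. A minimal sketch of that lookup, assuming the mapping is importable from `gometa.ingestion.source` and using a placeholder pipeline context `ctx` (both assumptions, not shown in this diff):

```python
# Sketch only: resolve a recipe's source "type" to a Source class and build it.
# Assumes source_class_mapping is exposed by gometa.ingestion.source and that
# `ctx` is whatever pipeline context object the framework normally passes in.
from gometa.ingestion.source import source_class_mapping

def build_source(recipe_source: dict, ctx):
    source_cls = source_class_mapping[recipe_source["type"]]  # e.g. "bigquery" -> BigQuerySource
    return source_cls.create(recipe_source.get("config", {}), ctx)

# build_source({"type": "bigquery", "config": {"project_id": "project"}}, ctx)
```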
metadata-ingestion/src/gometa/ingestion/source/bigquery.py (new file)
@@ -0,0 +1,21 @@
+from typing import Optional
+
+from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+
+
+class BigQueryConfig(SQLAlchemyConfig):
+    scheme = "bigquery"
+    project_id: Optional[str]
+
+    def get_sql_alchemy_url(self):
+        return f"{self.scheme}://{self.project_id}"
+
+
+class BigQuerySource(SQLAlchemySource):
+    def __init__(self, config, ctx):
+        super().__init__(config, ctx, "bigquery")
+
+    @classmethod
+    def create(cls, config_dict, ctx):
+        config = BigQueryConfig.parse_obj(config_dict)
+        return cls(config, ctx)
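For reference, a hedged sketch of what this config produces when parsed from the README recipe above (the project id is illustrative; `parse_obj` comes from the pydantic `BaseModel` base of `SQLAlchemyConfig`):

```python
# Illustrative only: BigQueryConfig is a pydantic model, so parse_obj builds it
# from the recipe's `config` block, and get_sql_alchemy_url yields the dialect URL.
config = BigQueryConfig.parse_obj({"project_id": "my-project"})
print(config.get_sql_alchemy_url())  # -> "bigquery://my-project"
```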
@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource
 
 
-class HiveConfig(SQLAlchemyConfig):
+class HiveConfig(BasicSQLAlchemyConfig):
     # defaults
     scheme = "hive"
 
@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource
 
 
-class SQLServerConfig(SQLAlchemyConfig):
+class SQLServerConfig(BasicSQLAlchemyConfig):
     # defaults
     host_port = "localhost:1433"
     scheme = "mssql+pytds"
@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource
 
 
-class MySQLConfig(SQLAlchemyConfig):
+class MySQLConfig(BasicSQLAlchemyConfig):
     # defaults
     host_port = "localhost:3306"
     scheme = "mysql+pymysql"
@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource
 
 
-class PostgresConfig(SQLAlchemyConfig):
+class PostgresConfig(BasicSQLAlchemyConfig):
     # defaults
     scheme = "postgresql+psycopg2"
 
@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource
 
 
-class SnowflakeConfig(SQLAlchemyConfig):
+class SnowflakeConfig(BasicSQLAlchemyConfig):
     # defaults
     scheme = "snowflake"
 
@@ -1,5 +1,6 @@
 import logging
 import time
+from abc import abstractmethod
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional
 
@@ -48,17 +49,23 @@ class SQLSourceReport(SourceReport):
 
 
 class SQLAlchemyConfig(BaseModel):
+    options: Optional[dict] = {}
+    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
+
+    @abstractmethod
+    def get_sql_alchemy_url(self):
+        pass
+
+
+class BasicSQLAlchemyConfig(SQLAlchemyConfig):
     username: str
     password: str
     host_port: str
     database: str = ""
     scheme: str
-    options: Optional[dict] = {}
-    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
 
     def get_sql_alchemy_url(self):
         url = f"{self.scheme}://{self.username}:{self.password}@{self.host_port}/{self.database}"
-        logger.debug("sql_alchemy_url={url}")
         return url
 
 
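To make the split concrete, a small sketch of the URL a `BasicSQLAlchemyConfig` subclass now builds (values are illustrative; `PostgresConfig` defaults `scheme` to `postgresql+psycopg2` earlier in this diff):

```python
# Illustrative only: connection-style configs inherit the username/password/host URL
# builder from BasicSQLAlchemyConfig, while dialect-specific configs such as
# BigQueryConfig override get_sql_alchemy_url directly.
config = PostgresConfig.parse_obj(
    {"username": "user", "password": "pass", "host_port": "localhost:5432", "database": "db"}
)
print(config.get_sql_alchemy_url())  # -> "postgresql+psycopg2://user:pass@localhost:5432/db"
```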
@@ -145,6 +152,7 @@ class SQLAlchemySource(Source):
         sql_config = self.config
         platform = self.platform
         url = sql_config.get_sql_alchemy_url()
+        logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **sql_config.options)
         inspector = reflection.Inspector.from_engine(engine)
         database = sql_config.database