Add bigquery and refactor others

Harshal Sheth 2021-02-15 14:39:59 -08:00 committed by Shirshanka Das
parent cbbdf0930a
commit b91d0cf63b
11 changed files with 73 additions and 21 deletions


@@ -0,0 +1,3 @@
+0.0.1
+-----
+* Modernizing python scripts and creating first package


@@ -160,7 +160,7 @@ Extracts:
 - List of databases, schema, and tables
 - Column types associated with each table
-Extra requirements: `pip install psycopg2-binary`
+Extra requirements: `pip install psycopg2-binary` or `pip install psycopg2`
 ```yml
 source:
@@ -190,6 +190,23 @@ source:
   # table_pattern is same as above
 ```
+
+## Google BigQuery `bigquery`
+Extracts:
+- List of databases, schema, and tables
+- Column types associated with each table
+
+Extra requirements: `pip install pybigquery`
+
+```yml
+source:
+  type: bigquery
+  config:
+    project_id: project
+    options:
+      credential_path: "/path/to/keyfile.json"
+  # table_pattern is same as above
+```
+
 ## File `file`
 Pulls metadata from a previously generated file. Note that the file sink
 can produce such files, and a number of samples are included in the
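
Returning to the new BigQuery section above: per sql_common.py, the recipe boils down to a plain SQLAlchemy engine, since `get_sql_alchemy_url()` yields `bigquery://project` and the `options` mapping is splatted into `create_engine`. A rough sketch of the equivalent call (the keyfile path is the docs' placeholder, and `credential_path` is presumably consumed by the pybigquery dialect):

```python
from sqlalchemy import create_engine

# Equivalent of the yml recipe above, as sql_common.py wires it up:
# create_engine(url, **options).
engine = create_engine("bigquery://project", credential_path="/path/to/keyfile.json")
```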
@@ -265,9 +282,10 @@ pytest tests/integration
 ## Sanity check code before checkin
 ```sh
-flake8 src tests  # Requires test_requirements.txt to have been installed.
-mypy -p gometa
 black --exclude 'gometa/metadata' -S -t py36 src tests
 isort src tests
+flake8 src tests
+mypy -p gometa
 pytest
 ```


@@ -71,12 +71,13 @@ setuptools.setup(
     "toml>=0.10.0",
     "pydantic>=1.5.1",
     "requests>=2.25.1",
-    "confluent_kafka[avro]>=1.5.0",
     "avro_gen @ https://api.github.com/repos/hsheth2/avro_gen/tarball/master",
     # Note: we currently require both Avro libraries. The codegen uses avro-python3
     # schema parsers at runtime for generating and reading JSON into Python objects.
     # At the same time, we use Kafka's AvroSerializer, which internally relies on
-    # fastavro for serialization.
+    # fastavro for serialization. We do not use confluent_kafka[avro], since it
+    # is incompatible with its own dep on avro-python3.
+    "confluent_kafka>=1.5.0",
     "fastavro>=1.3.0",
     "avro-python3>=1.8.2",
     "sqlalchemy>=1.3.23",  # Required for SQL sources


@@ -2,13 +2,13 @@ from typing import Dict, Type

 from gometa.ingestion.api.source import Source

-from .kafka import KafkaSource
 # from .ldap import LDAPSource
+from .bigquery import BigQuerySource
+from .hive import HiveSource
+from .kafka import KafkaSource
 from .mce_file import MetadataFileSource
 from .mssql import SQLServerSource
 from .mysql import MySQLSource
-from .hive import HiveSource
 from .postgres import PostgresSource
 from .snowflake import SnowflakeSource
@@ -18,6 +18,7 @@ source_class_mapping: Dict[str, Type[Source]] = {
     "hive": HiveSource,
     "postgres": PostgresSource,
     "snowflake": SnowflakeSource,
+    "bigquery": BigQuerySource,
     "kafka": KafkaSource,
     # "ldap": LDAPSource,
     "file": MetadataFileSource,


@@ -0,0 +1,21 @@
+from typing import Optional
+
+from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+
+
+class BigQueryConfig(SQLAlchemyConfig):
+    scheme = "bigquery"
+    project_id: Optional[str]
+
+    def get_sql_alchemy_url(self):
+        return f"{self.scheme}://{self.project_id}"
+
+
+class BigQuerySource(SQLAlchemySource):
+    def __init__(self, config, ctx):
+        super().__init__(config, ctx, "bigquery")
+
+    @classmethod
+    def create(cls, config_dict, ctx):
+        config = BigQueryConfig.parse_obj(config_dict)
+        return cls(config, ctx)
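
Unlike the host/port-based sources, the BigQuery URL carries only the project id. A minimal sketch of what this new config produces, with a made-up project id; note that `options` is not part of the URL, since `SQLAlchemySource` forwards it verbatim via `create_engine(url, **config.options)`:

```python
# Assumes BigQueryConfig is in scope (it lives in the new bigquery module).
config = BigQueryConfig.parse_obj(
    {
        "project_id": "my-gcp-project",
        "options": {"credential_path": "/path/to/keyfile.json"},
    }
)
print(config.get_sql_alchemy_url())  # -> bigquery://my-gcp-project
```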


@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource


-class HiveConfig(SQLAlchemyConfig):
+class HiveConfig(BasicSQLAlchemyConfig):
     # defaults
     scheme = "hive"


@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource


-class SQLServerConfig(SQLAlchemyConfig):
+class SQLServerConfig(BasicSQLAlchemyConfig):
     # defaults
     host_port = "localhost:1433"
     scheme = "mssql+pytds"


@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource


-class MySQLConfig(SQLAlchemyConfig):
+class MySQLConfig(BasicSQLAlchemyConfig):
     # defaults
     host_port = "localhost:3306"
     scheme = "mysql+pymysql"


@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource


-class PostgresConfig(SQLAlchemyConfig):
+class PostgresConfig(BasicSQLAlchemyConfig):
     # defaults
     scheme = "postgresql+psycopg2"


@@ -1,7 +1,7 @@
-from .sql_common import SQLAlchemyConfig, SQLAlchemySource
+from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource


-class SnowflakeConfig(SQLAlchemyConfig):
+class SnowflakeConfig(BasicSQLAlchemyConfig):
     # defaults
     scheme = "snowflake"


@@ -1,5 +1,6 @@
 import logging
 import time
+from abc import abstractmethod
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional
@@ -48,17 +49,23 @@ class SQLSourceReport(SourceReport):
 class SQLAlchemyConfig(BaseModel):
+    options: Optional[dict] = {}
+    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
+
+    @abstractmethod
+    def get_sql_alchemy_url(self):
+        pass
+
+
+class BasicSQLAlchemyConfig(SQLAlchemyConfig):
     username: str
     password: str
     host_port: str
     database: str = ""
     scheme: str
-    options: Optional[dict] = {}
-    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

     def get_sql_alchemy_url(self):
         url = f"{self.scheme}://{self.username}:{self.password}@{self.host_port}/{self.database}"
-        logger.debug("sql_alchemy_url={url}")
         return url
@@ -145,6 +152,7 @@ class SQLAlchemySource(Source):
         sql_config = self.config
         platform = self.platform
         url = sql_config.get_sql_alchemy_url()
+        logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **sql_config.options)
         inspector = reflection.Inspector.from_engine(engine)
         database = sql_config.database
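
The split above turns `SQLAlchemyConfig` into an abstract base that keeps only the cross-cutting fields (`options`, `table_pattern`) and leaves URL construction to subclasses: `BasicSQLAlchemyConfig` for the usual `scheme://user:pass@host:port/db` shape, and `BigQueryConfig` for its project-only URL. The debug log also moves into `get_workunits`, gaining the `f` prefix that the old `logger.debug("sql_alchemy_url={url}")` was missing, so the URL is now actually interpolated. A small sketch of the two URL shapes after the refactor, with made-up credentials:

```python
# Assumes both config classes are in scope; all values are placeholders.
basic = BasicSQLAlchemyConfig(
    username="user",
    password="pw",
    host_port="localhost:5432",
    scheme="postgresql+psycopg2",
    database="db",
)
print(basic.get_sql_alchemy_url())
# -> postgresql+psycopg2://user:pw@localhost:5432/db

bq = BigQueryConfig(project_id="my-gcp-project")
print(bq.get_sql_alchemy_url())
# -> bigquery://my-gcp-project
```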