Add bigquery and refactor others

Harshal Sheth 2021-02-15 14:39:59 -08:00 committed by Shirshanka Das
parent cbbdf0930a
commit b91d0cf63b
11 changed files with 73 additions and 21 deletions

View File

@ -0,0 +1,3 @@
0.0.1
-----
* Modernizing python scripts and creating first package

View File

@ -160,7 +160,7 @@ Extracts:
- List of databases, schema, and tables
- Column types associated with each table
Extra requirements: `pip install psycopg2-binary`
Extra requirements: `pip install psycopg2-binary` or `pip install psycopg2`
```yml
source:
@ -190,6 +190,23 @@ source:
# table_pattern is same as above
```
## Google BigQuery `bigquery`
Extracts:
- List of databases, schema, and tables
- Column types associated with each table
Extra requirements: `pip install pybigquery`
```yml
source:
  type: bigquery
  config:
    project_id: project
    options:
      credential_path: "/path/to/keyfile.json"
    # table_pattern is same as above
```
## File `file`
Pulls metadata from a previously generated file. Note that the file sink
can produce such files, and a number of samples are included in the
@ -265,9 +282,10 @@ pytest tests/integration
## Sanity check code before checkin
```sh
flake8 src tests
mypy -p gometa
# Requires test_requirements.txt to have been installed.
black --exclude 'gometa/metadata' -S -t py36 src tests
isort src tests
flake8 src tests
mypy -p gometa
pytest
```

View File

@ -71,12 +71,13 @@ setuptools.setup(
"toml>=0.10.0",
"pydantic>=1.5.1",
"requests>=2.25.1",
"confluent_kafka[avro]>=1.5.0",
"avro_gen @ https://api.github.com/repos/hsheth2/avro_gen/tarball/master",
# Note: we currently require both Avro libraries. The codegen uses avro-python3
# schema parsers at runtime for generating and reading JSON into Python objects.
# At the same time, we use Kafka's AvroSerializer, which internally relies on
# fastavro for serialization.
# fastavro for serialization. We do not use confluent_kafka[avro], since it
# is incompatible with its own dep on avro-python3.
"confluent_kafka>=1.5.0",
"fastavro>=1.3.0",
"avro-python3>=1.8.2",
"sqlalchemy>=1.3.23", # Required for SQL sources

View File

@ -2,13 +2,13 @@ from typing import Dict, Type
from gometa.ingestion.api.source import Source
from .kafka import KafkaSource
# from .ldap import LDAPSource
from .bigquery import BigQuerySource
from .hive import HiveSource
from .kafka import KafkaSource
from .mce_file import MetadataFileSource
from .mssql import SQLServerSource
from .mysql import MySQLSource
from .hive import HiveSource
from .postgres import PostgresSource
from .snowflake import SnowflakeSource
@ -18,6 +18,7 @@ source_class_mapping: Dict[str, Type[Source]] = {
"hive": HiveSource,
"postgres": PostgresSource,
"snowflake": SnowflakeSource,
"bigquery": BigQuerySource,
"kafka": KafkaSource,
# "ldap": LDAPSource,
"file": MetadataFileSource,

View File

@ -0,0 +1,21 @@
from typing import Optional

from .sql_common import SQLAlchemyConfig, SQLAlchemySource


class BigQueryConfig(SQLAlchemyConfig):
    scheme = "bigquery"
    project_id: Optional[str]

    def get_sql_alchemy_url(self):
        return f"{self.scheme}://{self.project_id}"


class BigQuerySource(SQLAlchemySource):
    def __init__(self, config, ctx):
        super().__init__(config, ctx, "bigquery")

    @classmethod
    def create(cls, config_dict, ctx):
        config = BigQueryConfig.parse_obj(config_dict)
        return cls(config, ctx)
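
Based purely on the class above, a quick sanity check of what the README's recipe turns into. Pydantic's `parse_obj` does the field mapping; the `options` dict is not part of the URL, it is forwarded to `create_engine` by the shared SQLAlchemy source.

```python
# Sketch: the README's bigquery recipe, expressed as the config dict that
# BigQueryConfig would receive.
config = BigQueryConfig.parse_obj(
    {
        "project_id": "project",
        "options": {"credential_path": "/path/to/keyfile.json"},
    }
)
assert config.get_sql_alchemy_url() == "bigquery://project"
```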

View File

@ -1,7 +1,7 @@
from .sql_common import SQLAlchemyConfig, SQLAlchemySource
from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource
class HiveConfig(SQLAlchemyConfig):
class HiveConfig(BasicSQLAlchemyConfig):
    # defaults
    scheme = "hive"

View File

@ -1,7 +1,7 @@
from .sql_common import SQLAlchemyConfig, SQLAlchemySource
from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource
class SQLServerConfig(SQLAlchemyConfig):
class SQLServerConfig(BasicSQLAlchemyConfig):
    # defaults
    host_port = "localhost:1433"
    scheme = "mssql+pytds"

View File

@ -1,7 +1,7 @@
from .sql_common import SQLAlchemyConfig, SQLAlchemySource
from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource
class MySQLConfig(SQLAlchemyConfig):
class MySQLConfig(BasicSQLAlchemyConfig):
    # defaults
    host_port = "localhost:3306"
    scheme = "mysql+pymysql"

View File

@ -1,7 +1,7 @@
from .sql_common import SQLAlchemyConfig, SQLAlchemySource
from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource
class PostgresConfig(SQLAlchemyConfig):
class PostgresConfig(BasicSQLAlchemyConfig):
    # defaults
    scheme = "postgresql+psycopg2"

View File

@ -1,7 +1,7 @@
from .sql_common import SQLAlchemyConfig, SQLAlchemySource
from .sql_common import BasicSQLAlchemyConfig, SQLAlchemySource
class SnowflakeConfig(SQLAlchemyConfig):
class SnowflakeConfig(BasicSQLAlchemyConfig):
    # defaults
    scheme = "snowflake"

View File

@ -1,5 +1,6 @@
import logging
import time
from abc import abstractmethod
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
@ -48,17 +49,23 @@ class SQLSourceReport(SourceReport):
class SQLAlchemyConfig(BaseModel):
    options: Optional[dict] = {}
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

    @abstractmethod
    def get_sql_alchemy_url(self):
        pass


class BasicSQLAlchemyConfig(SQLAlchemyConfig):
    username: str
    password: str
    host_port: str
    database: str = ""
    scheme: str
    options: Optional[dict] = {}
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

    def get_sql_alchemy_url(self):
        url = f"{self.scheme}://{self.username}:{self.password}@{self.host_port}/{self.database}"
        logger.debug("sql_alchemy_url={url}")
        return url
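
To illustrate the refactor, a small sketch of how one of the subclasses defined in this commit (MySQLConfig, with its `mysql+pymysql` scheme and `localhost:3306` default) resolves to a connection URL via the inherited `get_sql_alchemy_url`. The credential values are illustrative only.

```python
# Sketch: BasicSQLAlchemyConfig supplies the URL construction; the subclass
# only contributes defaults such as scheme and host_port.
config = MySQLConfig.parse_obj(
    {"username": "datahub", "password": "datahub", "database": "db"}
)
assert config.get_sql_alchemy_url() == "mysql+pymysql://datahub:datahub@localhost:3306/db"
```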
@ -145,6 +152,7 @@ class SQLAlchemySource(Source):
        sql_config = self.config
        platform = self.platform
        url = sql_config.get_sql_alchemy_url()
        logger.debug(f"sql_alchemy_url={url}")
        engine = create_engine(url, **sql_config.options)
        inspector = reflection.Inspector.from_engine(engine)
        database = sql_config.database
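
The inspector created here is what ultimately yields the databases, schemas, tables, and column types that the README advertises for each SQL source. The rest of the method is outside this hunk, so the following is only a sketch using standard SQLAlchemy inspection calls.

```python
# Sketch of the reflection pass (standard SQLAlchemy Inspector API); the real
# loop below this hunk presumably also applies sql_config.table_pattern.
for schema in inspector.get_schema_names():
    for table in inspector.get_table_names(schema):
        columns = inspector.get_columns(table, schema)  # list of {"name", "type", ...} dicts
        # ... each table/column is then converted into metadata events
```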