feat(ingest): Add config option to set Bigquery credential in source config (#3786)

Parent: 0f8458ad74 · Commit: 5df5150e51
```diff
@@ -19,4 +19,4 @@ source:
 sink:
   type: "datahub-rest"
   config:
-    server: "https://autotrader.acryl.io/gms"
+    server: "http://localhost:8080"
```
metadata-ingestion/examples/recipes/bigquery_to_datahub.yml (new file, 47 lines)
```yml
---
# see https://datahubproject.io/docs/metadata-ingestion/source_docs/bigquery for complete documentation
source:
  type: "bigquery"
  config:
    ## Coordinates
    project_id: project-id-1234567

    ## Credentials
    ## If the GOOGLE_APPLICATION_CREDENTIALS environment variable is not set, you can specify credentials here
    #credential:
    #  project_id: project-id-1234567
    #  private_key_id: "d0121d0000882411234e11166c6aaa23ed5d74e0"
    #  private_key: "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----\n"
    #  client_email: "test@suppproject-id-1234567.iam.gserviceaccount.com"
    #  client_id: "123456678890"

    #include_tables: true
    #include_views: true
    #include_table_lineage: true
    #start_time: 2021-12-15T20:08:23.091Z
    #end_time: 2023-12-15T20:08:23.091Z

    #profiling:
    #  enabled: true
    #  turn_off_expensive_profiling_metrics: false
    #  query_combiner_enabled: true
    #  max_number_of_fields_to_profile: 8
    #  profile_table_level_only: false
    #  include_field_null_count: true
    #  include_field_min_value: true
    #  include_field_max_value: true
    #  include_field_mean_value: true
    #  include_field_median_value: true
    #  include_field_stddev_value: false
    #  include_field_quantiles: false
    #  include_field_distinct_value_frequencies: false
    #  include_field_histogram: false
    #  include_field_sample_values: false

    #profile_pattern:
    #  allow:
    #    - "schema.table.column"
    #  deny:
    #    - "*.*.*"

## see https://datahubproject.io/docs/metadata-ingestion/sink_docs/datahub for complete documentation
sink:
  type: "datahub-rest"
  config:
    server: "http://localhost:8080"
```
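To try the recipe, save it locally and hand it to the DataHub CLI's `ingest` command (a minimal sketch; the path is simply wherever you saved the file):

```
datahub ingest -c ./bigquery_to_datahub.yml
```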
@@ -6,6 +6,64 @@ For context on getting started with ingestion, check out our [metadata ingestion

To install this plugin, run `pip install 'acryl-datahub[bigquery]'`.
## Prerequisites

### Create a datahub profile in GCP:

1. Create a custom role for datahub (https://cloud.google.com/iam/docs/creating-custom-roles#creating_a_custom_role).
2. Grant the following permissions to this role:

```
bigquery.datasets.get
bigquery.datasets.getIamPolicy
bigquery.jobs.create
bigquery.jobs.list
bigquery.jobs.listAll
bigquery.models.getMetadata
bigquery.models.list
bigquery.routines.get
bigquery.routines.list
bigquery.tables.create   # Needed for profiling
bigquery.tables.get
bigquery.tables.getData  # Needed for profiling
bigquery.tables.list
logging.logEntries.list  # Needed for lineage generation
resourcemanager.projects.get
```
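Equivalently, such a role can be created from the command line (a sketch; the role ID `datahub_ingestion` and the project ID are placeholders):

```
gcloud iam roles create datahub_ingestion --project=project-id-1234567 \
  --title="DataHub ingestion" \
  --permissions=bigquery.datasets.get,bigquery.datasets.getIamPolicy,bigquery.jobs.create,bigquery.jobs.list,bigquery.jobs.listAll,bigquery.models.getMetadata,bigquery.models.list,bigquery.routines.get,bigquery.routines.list,bigquery.tables.create,bigquery.tables.get,bigquery.tables.getData,bigquery.tables.list,logging.logEntries.list,resourcemanager.projects.get
```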
### Create a service account:

1. Set up a service account (https://cloud.google.com/iam/docs/creating-managing-service-accounts#iam-service-accounts-create-console)
   and assign the previously created role to this service account.
2. Download a service account JSON keyfile.
   Example credential file:

```json
{
  "type": "service_account",
  "project_id": "project-id-1234567",
  "private_key_id": "d0121d0000882411234e11166c6aaa23ed5d74e0",
  "private_key": "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----",
  "client_email": "test@suppproject-id-1234567.iam.gserviceaccount.com",
  "client_id": "113545814931671546333",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/test%suppproject-id-1234567.iam.gserviceaccount.com"
}
```
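Steps 1 and 2 can also be done with `gcloud` (a sketch; the account name, project ID, and role ID are placeholders matching the examples above):

```
gcloud iam service-accounts create datahub-ingestion --display-name="DataHub ingestion"
gcloud projects add-iam-policy-binding project-id-1234567 \
  --member="serviceAccount:datahub-ingestion@project-id-1234567.iam.gserviceaccount.com" \
  --role="projects/project-id-1234567/roles/datahub_ingestion"
gcloud iam service-accounts keys create /path/to/keyfile.json \
  --iam-account=datahub-ingestion@project-id-1234567.iam.gserviceaccount.com
```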
3. To provide credentials to the source, you can either:

   Set an environment variable:

   ```
   $ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"
   ```

   *or*

   Set the credential config in your source based on the credential json file. For example:

   ```yml
   credential:
     project_id: project-id-1234567
     private_key_id: "d0121d0000882411234e11166c6aaa23ed5d74e0"
     private_key: "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----\n"
     client_email: "test@suppproject-id-1234567.iam.gserviceaccount.com"
     client_id: "123456678890"
   ```
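Either way, the Google client libraries resolve the keyfile through Application Default Credentials. A minimal sanity check (a sketch; assumes the `google-auth` package, which the BigQuery client depends on, is installed):

```python
import google.auth

# Picks up GOOGLE_APPLICATION_CREDENTIALS (or another ADC source) automatically.
credentials, project_id = google.auth.default()
print(project_id)  # e.g. project-id-1234567
```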
## Capabilities

This plugin extracts the following:
@ -44,30 +102,34 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
|
||||
|
||||
As a SQL-based service, the Athena integration is also supported by our SQL profiler. See [here](./sql_profiles.md) for more details on configuration.
|
||||
|
||||
(The previous config-options table was replaced by the updated version below, which notably adds the `credential.*` fields.)
| Field                                   | Required                                                                   | Default                                                                  | Description                                                                                                                                                                                                                                                                               |
| --------------------------------------- | -------------------------------------------------------------------------- | ------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `project_id`                            |                                                                            | Autodetected                                                             | Project ID to ingest from. If not specified, will infer from environment.                                                                                                                                                                                                                 |
| `env`                                   |                                                                            | `"PROD"`                                                                 | Environment to use in namespace when constructing URNs.                                                                                                                                                                                                                                   |
| `credential.project_id`                 | Required if the GOOGLE_APPLICATION_CREDENTIALS environment variable is not set |                                                                          | Project ID from the service account credential file.                                                                                                                                                                                                                                      |
| `credential.private_key_id`             | Required if the GOOGLE_APPLICATION_CREDENTIALS environment variable is not set |                                                                          | Private key ID from the service account credential file.                                                                                                                                                                                                                                  |
| `credential.private_key`                | Required if the GOOGLE_APPLICATION_CREDENTIALS environment variable is not set |                                                                          | Private key from the service account credential file, including the `-----BEGIN PRIVATE KEY-----` header and footer.                                                                                                                                                                      |
| `credential.client_email`               | Required if the GOOGLE_APPLICATION_CREDENTIALS environment variable is not set |                                                                          | Client email from the service account credential file.                                                                                                                                                                                                                                    |
| `credential.client_id`                  | Required if the GOOGLE_APPLICATION_CREDENTIALS environment variable is not set |                                                                          | Client ID from the service account credential file.                                                                                                                                                                                                                                       |
| `table_pattern.allow`                   |                                                                            |                                                                          | List of regex patterns for tables to include in ingestion.                                                                                                                                                                                                                                |
| `table_pattern.deny`                    |                                                                            |                                                                          | List of regex patterns for tables to exclude from ingestion.                                                                                                                                                                                                                              |
| `table_pattern.ignoreCase`              |                                                                            | `True`                                                                   | Whether to ignore case sensitivity during pattern matching.                                                                                                                                                                                                                               |
| `schema_pattern.allow`                  |                                                                            |                                                                          | List of regex patterns for schemas to include in ingestion.                                                                                                                                                                                                                               |
| `schema_pattern.deny`                   |                                                                            |                                                                          | List of regex patterns for schemas to exclude from ingestion.                                                                                                                                                                                                                             |
| `schema_pattern.ignoreCase`             |                                                                            | `True`                                                                   | Whether to ignore case sensitivity during pattern matching.                                                                                                                                                                                                                               |
| `view_pattern.allow`                    |                                                                            |                                                                          | List of regex patterns for views to include in ingestion.                                                                                                                                                                                                                                 |
| `view_pattern.deny`                     |                                                                            |                                                                          | List of regex patterns for views to exclude from ingestion.                                                                                                                                                                                                                               |
| `view_pattern.ignoreCase`               |                                                                            | `True`                                                                   | Whether to ignore case sensitivity during pattern matching.                                                                                                                                                                                                                               |
| `include_tables`                        |                                                                            | `True`                                                                   | Whether tables should be ingested.                                                                                                                                                                                                                                                        |
| `include_views`                         |                                                                            | `True`                                                                   | Whether views should be ingested.                                                                                                                                                                                                                                                         |
| `include_table_lineage`                 |                                                                            | `True`                                                                   | Whether table-level lineage should be ingested and processed.                                                                                                                                                                                                                             |
| `max_query_duration`                    |                                                                            | `15`                                                                     | A time buffer in minutes to adjust start_time and end_time while querying BigQuery audit logs.                                                                                                                                                                                            |
| `start_time`                            |                                                                            | Start of last full day in UTC (or hour, depending on `bucket_duration`)  | Earliest time of lineage data to consider.                                                                                                                                                                                                                                                |
| `end_time`                              |                                                                            | End of last full day in UTC (or hour, depending on `bucket_duration`)    | Latest time of lineage data to consider.                                                                                                                                                                                                                                                  |
| `extra_client_options`                  |                                                                            |                                                                          | Additional options to pass to `google.cloud.logging_v2.client.Client`.                                                                                                                                                                                                                    |
| `use_exported_bigquery_audit_metadata`  |                                                                            | `False`                                                                  | When configured, use `BigQueryAuditMetadata` in `bigquery_audit_metadata_datasets` to compute lineage information.                                                                                                                                                                        |
| `use_date_sharded_audit_log_tables`     |                                                                            | `False`                                                                  | Whether to read date-sharded tables or time-partitioned tables when extracting lineage from exported audit logs.                                                                                                                                                                          |
| `bigquery_audit_metadata_datasets`      |                                                                            | None                                                                     | A list of datasets that contain a table named `cloudaudit_googleapis_com_data_access` which contain BigQuery audit logs, specifically, those containing `BigQueryAuditMetadata`. It is recommended that the project of the dataset is also specified, for example, `projectA.datasetB`.   |
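For example, a recipe that pulls lineage from exported audit logs rather than the Cloud Logging API might combine these fields as follows (a sketch; the dataset name is a placeholder):

```yml
source:
  type: "bigquery"
  config:
    project_id: project-id-1234567
    include_table_lineage: true
    use_exported_bigquery_audit_metadata: true
    use_date_sharded_audit_log_tables: false
    bigquery_audit_metadata_datasets:
      - projectA.datasetB
```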
@@ -1,6 +1,9 @@
```python
import collections
import functools
import json
import logging
import os
import tempfile
import textwrap
from datetime import timedelta
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
```

@@ -14,6 +17,7 @@ from google.cloud.bigquery import Client as BigQueryClient
```python
from google.cloud.logging_v2.client import Client as GCPLoggingClient
from sqlalchemy.engine.reflection import Inspector

from datahub.configuration import ConfigModel
from datahub.configuration.time_window_config import BaseTimeWindowConfig
from datahub.emitter import mce_builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
```

@@ -140,20 +144,59 @@ register_custom_type(GEOGRAPHY)
```python
assert pybigquery.sqlalchemy_bigquery._type_map


class BigQueryCredential(ConfigModel):
    """Config model mirroring the fields of a GCP service account keyfile."""

    project_id: str
    private_key_id: str
    private_key: str
    client_email: str
    client_id: str
    auth_uri: str = "https://accounts.google.com/o/oauth2/auth"
    token_uri: str = "https://oauth2.googleapis.com/token"
    auth_provider_x509_cert_url: str = "https://www.googleapis.com/oauth2/v1/certs"
    type: str = "service_account"
    client_x509_cert_url: Optional[str]

    def __init__(self, **data: Any):
        super().__init__(**data)  # type: ignore
        # Derive the cert URL from the client email when it is not supplied.
        if not self.client_x509_cert_url:
            self.client_x509_cert_url = (
                f"https://www.googleapis.com/robot/v1/metadata/x509/{self.client_email}"
            )


def create_credential_temp_file(credential: BigQueryCredential) -> str:
    # Write the credential to a temporary JSON keyfile so the Google client
    # libraries can pick it up via GOOGLE_APPLICATION_CREDENTIALS.
    with tempfile.NamedTemporaryFile(delete=False) as fp:
        cred_json = json.dumps(credential.dict(), indent=4, separators=(",", ": "))
        fp.write(cred_json.encode())
        return fp.name


class BigQueryConfig(BaseTimeWindowConfig, SQLAlchemyConfig):
    scheme: str = "bigquery"
    project_id: Optional[str] = None

    log_page_size: Optional[pydantic.PositiveInt] = 1000
    credential: Optional[BigQueryCredential]
    # extra_client_options, include_table_lineage and max_query_duration are
    # relevant only when computing the lineage.
    extra_client_options: Dict[str, Any] = {}
    include_table_lineage: Optional[bool] = True
    max_query_duration: timedelta = timedelta(minutes=15)

    credentials_path: Optional[str] = None
    bigquery_audit_metadata_datasets: Optional[List[str]] = None
    use_exported_bigquery_audit_metadata: bool = False
    use_date_sharded_audit_log_tables: bool = False

    def __init__(self, **data: Any):
        super().__init__(**data)

        if self.credential:
            # Materialize the inline credential as a keyfile and point the
            # standard environment variable at it.
            self.credentials_path = create_credential_temp_file(self.credential)
            logger.debug(
                f"Creating temporary credential file at {self.credentials_path}"
            )
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.credentials_path

    def get_sql_alchemy_url(self):
        if self.project_id:
            return f"{self.scheme}://{self.project_id}"
```
@@ -479,3 +522,11 @@ class BigQuerySource(SQLAlchemySource):
```python
        if segments[0] != schema:
            raise ValueError(f"schema {schema} does not match table {entity}")
        return segments[0], segments[1]

    # We can't use close() for this, as it is not called if the ingestion is not successful.
    def __del__(self):
        if self.config.credentials_path:
            logger.debug(
                f"Deleting temporary credential file at {self.config.credentials_path}"
            )
            os.unlink(self.config.credentials_path)
```
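The lifecycle of that temporary keyfile can be seen in isolation with a small stdlib-only sketch (not DataHub code; it just mirrors the `__init__`/`__del__` logic above):

```python
import json
import os
import tempfile

# Stand-in for the dict produced by BigQueryCredential.dict().
cred = {"type": "service_account", "project_id": "project-id-1234567"}

# __init__ side: materialize the credential and export the standard variable.
with tempfile.NamedTemporaryFile(delete=False) as fp:
    fp.write(json.dumps(cred, indent=4).encode())
    path = fp.name
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = path

# ... ingestion would run here ...

# __del__ side: remove the keyfile even if ingestion failed.
os.unlink(path)
```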
metadata-ingestion/tests/unit/test_bigquery_source.py (new file, 65 lines)
```python
import json
import os

import pytest


@pytest.mark.integration
def test_bigquery_uri():
    from datahub.ingestion.source.sql.bigquery import BigQueryConfig

    config = BigQueryConfig.parse_obj(
        {
            "project_id": "test-project",
        }
    )
    assert config.get_sql_alchemy_url() == "bigquery://test-project"


@pytest.mark.integration
def test_bigquery_uri_with_credential():
    from datahub.ingestion.source.sql.bigquery import BigQueryConfig

    expected_credential_json = {
        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "client_email": "test@acryl.io",
        "client_id": "test_client-id",
        "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/test@acryl.io",
        "private_key": "random_private_key",
        "private_key_id": "test-private-key",
        "project_id": "test-project",
        "token_uri": "https://oauth2.googleapis.com/token",
        "type": "service_account",
    }

    config = BigQueryConfig.parse_obj(
        {
            "project_id": "test-project",
            "credential": {
                "project_id": "test-project",
                "private_key_id": "test-private-key",
                "private_key": "random_private_key",
                "client_email": "test@acryl.io",
                "client_id": "test_client-id",
            },
        }
    )

    try:
        assert config.get_sql_alchemy_url() == "bigquery://test-project"
        assert config.credentials_path

        # The temporary keyfile written by the config should match the
        # credential we passed in, plus the defaulted fields.
        with open(config.credentials_path) as json_file:
            json_credential = json.load(json_file)

        credential = json.dumps(json_credential, sort_keys=True)
        expected_credential = json.dumps(expected_credential_json, sort_keys=True)
        assert expected_credential == credential
    except AssertionError as e:
        if config.credentials_path:
            os.unlink(str(config.credentials_path))
        raise e
```
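Assuming a working checkout, these tests can presumably be run with pytest directly (they are marked `integration`, so your pytest configuration may need to select that marker):

```
pytest metadata-ingestion/tests/unit/test_bigquery_source.py
```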