Mirror of https://github.com/open-metadata/OpenMetadata.git, synced 2025-12-26 23:18:25 +00:00

Merge pull request #307 from open-metadata/bigqueryusage-pipeline

BigQuery Usage Pipeline added

Commit c36727aaf9
ingestion/examples/workflows/bigquery_usage.json (new file, 48 lines)
@@ -0,0 +1,48 @@
{
  "source": {
    "type": "bigquery-usage",
    "config": {
      "project_id": "project_id",
      "host_port": "https://bigquery.googleapis.com",
      "username": "gcpuser@project_id.iam.gserviceaccount.com",
      "service_name": "gcp_bigquery",
      "duration": 2,
      "options": {
        "credentials_path": "examples/creds/bigquery-cred.json"
      },
      "service_type": "BigQuery"
    }
  },
  "processor": {
    "type": "query-parser",
    "config": {
      "filter": ""
    }
  },
  "stage": {
    "type": "table-usage",
    "config": {
      "filename": "/tmp/bigquery_usage"
    }
  },
  "bulk_sink": {
    "type": "metadata-usage",
    "config": {
      "filename": "/tmp/bigquery_usage"
    }
  },
  "metadata_server": {
    "type": "metadata-server",
    "config": {
      "api_endpoint": "http://localhost:8585/api",
      "auth_provider_type": "no-auth"
    }
  },
  "cron": {
    "minute": "*/5",
    "hour": null,
    "day": null,
    "month": null,
    "day_of_week": null
  }
}
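For orientation, here is a minimal sketch of how a workflow file like this is typically run through the ingestion framework's Python runner. The `Workflow` import path and its `create`/`execute`/`raise_from_status`/`stop` methods are assumptions about the framework's layout at the time and may differ between versions.

    # Hypothetical runner sketch for the bigquery_usage.json pipeline;
    # the Workflow import path below is an assumption, not taken from this PR.
    import json

    from metadata.ingestion.api.workflow import Workflow  # assumed import path

    with open("ingestion/examples/workflows/bigquery_usage.json") as f:
        config = json.load(f)

    # Wires source -> processor -> stage -> bulk_sink as declared in the config.
    workflow = Workflow.create(config)
    workflow.execute()
    workflow.raise_from_status()
    workflow.stop()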
@@ -9,7 +9,7 @@
     "service_name": "aws_redshift",
     "service_type": "Redshift",
     "filter_pattern": {
-      "excludes": ["information_schema.*"]
+      "excludes": ["information_schema.*","[\\w]*event_vw.*"]
     }
   }
 },
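The added exclude pattern drops any Redshift relation whose name contains `event_vw`. A minimal sketch of how an exclude list like this is usually evaluated; the real `filter_pattern.included` logic lives in the ingestion framework, and this standalone version is only illustrative:

    import re

    # The two exclude regexes from the example above.
    excludes = [r"information_schema.*", r"[\w]*event_vw.*"]

    def included(name: str) -> bool:
        """Illustrative stand-in for filter_pattern.included()."""
        return not any(re.match(pattern, name) for pattern in excludes)

    print(included("public.orders"))              # True  - kept
    print(included("information_schema.tables"))  # False - excluded
    print(included("daily_event_vw_snapshot"))    # False - excluded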
@@ -78,9 +78,9 @@ class MetadataRestTablesSink(Sink):
                '{}.{}'.format(table_and_db.database.name.__root__, created_table.name.__root__))
        except (APIError, ValidationError) as err:
            logger.error(
-               "Failed to ingest table {} in database {} ".format(table_and_db.table.name, table_and_db.database.name))
+               "Failed to ingest table {} in database {} ".format(table_and_db.table.name.__root__, table_and_db.database.name.__root__))
            logger.error(err)
-           self.status.failure(table_and_db.table.name)
+           self.status.failure(table_and_db.table.name.__root__)

    def get_status(self):
        return self.status
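The fix above is needed because entity names in the generated models are pydantic custom-root types, so `table_and_db.table.name` is a model object and the raw string sits under `__root__`. A minimal sketch of the wrapping, assuming pydantic v1 and a hypothetical `EntityName` model:

    from pydantic import BaseModel  # pydantic v1 custom-root model

    class EntityName(BaseModel):
        """Hypothetical stand-in for the generated name types."""
        __root__: str

    name = EntityName(__root__="fact_sales")
    print(str(name))      # "__root__='fact_sales'" - the model, not the bare string
    print(name.__root__)  # "fact_sales" - what the log message and status need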
@@ -26,6 +26,7 @@ from ..ometa.auth_provider import MetadataServerConfig
 class BigQueryConfig(SQLConnectionConfig, SQLSource):
     scheme = "bigquery"
     project_id: Optional[str] = None
+    duration: int = 1

     def get_connection_url(self):
         if self.project_id:
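The new `duration` field is the lookback window, in days, that `get_start_and_end` turns into a start/end pair for filtering log entries; the example workflow above sets it to 2. The real helper lives in `metadata.utils.helpers`; the sketch below is a plausible reading of its semantics, not the actual implementation:

    from datetime import datetime, timedelta

    def get_start_and_end(duration: int):
        """Plausible sketch: a [today - duration, today] window at day precision."""
        end = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
        start = end - timedelta(days=duration)
        return start, end

    start, end = get_start_and_end(2)  # e.g. the example workflow's setting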
ingestion/src/metadata/ingestion/source/bigquery_usage.py (new file, 87 lines)
@@ -0,0 +1,87 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This import verifies that the dependencies are available.
import collections
import logging as log  # aliased so it does not clash with google.cloud.logging
from datetime import datetime
from typing import Iterable

from google.cloud import logging

from metadata.ingestion.api.source import Source, SourceStatus
from metadata.ingestion.models.table_queries import TableQuery
from metadata.ingestion.ometa.auth_provider import MetadataServerConfig
from metadata.ingestion.source.bigquery import BigQueryConfig
from metadata.ingestion.source.sql_alchemy_helper import SQLSourceStatus
from metadata.utils.helpers import get_start_and_end

logger = log.getLogger(__name__)


class BigqueryUsageSource(Source):
    SERVICE_TYPE = 'Bigquery'
    scheme = "bigquery"

    def __init__(self, config, metadata_config, ctx):
        super().__init__(ctx)
        self.config = config
        self.project_id = self.config.project_id
        # BigQuery job metadata arrives through the Cloud Audit data-access logs.
        self.logger_name = "cloudaudit.googleapis.com%2Fdata_access"
        self.status = SQLSourceStatus()

    def get_connection_url(self):
        if self.project_id:
            return f"{self.scheme}://{self.project_id}"
        return f"{self.scheme}://"

    @classmethod
    def create(cls, config_dict, metadata_config_dict, ctx):
        config = BigQueryConfig.parse_obj(config_dict)
        metadata_config = MetadataServerConfig.parse_obj(metadata_config_dict)
        return cls(config, metadata_config, ctx)

    def prepare(self):
        pass

    def next_record(self) -> Iterable[TableQuery]:
        logging_client = logging.Client()
        # Do not shadow the module-level logger with the Cloud Logging handle.
        usage_logger = logging_client.logger(self.logger_name)
        logger.debug("Listing entries for logger {}".format(usage_logger.name))
        start, end = get_start_and_end(self.config.duration)
        for entry in usage_logger.list_entries():
            timestamp = entry.timestamp.isoformat()
            timestamp = datetime.strptime(timestamp[0:10], "%Y-%m-%d")
            if start <= timestamp <= end:
                if "query" in str(entry.payload) and type(entry.payload) == collections.OrderedDict:
                    payload = list(entry.payload.items())[-1][1]
                    if "jobChange" in payload:
                        queryConfig = payload['jobChange']['job']['jobConfig']['queryConfig']
                        jobStats = payload['jobChange']['job']['jobStats']
                        # Dict lookups, not hasattr(): the payload is a mapping,
                        # so hasattr() checks would never find these keys.
                        statementType = queryConfig.get('statementType', '')
                        database = queryConfig.get('destinationTable', '')
                        analysis_date = str(datetime.strptime(jobStats['startTime'][0:19],
                                                              "%Y-%m-%dT%H:%M:%S").strftime('%Y-%m-%d %H:%M:%S'))
                        tq = TableQuery(statementType, queryConfig['priority'], 0, 0, 0,
                                        str(jobStats['startTime']), str(jobStats['endTime']),
                                        analysis_date, self.config.duration,
                                        str(database), 0, queryConfig['query'])
                        yield tq

    def close(self):
        pass

    def get_status(self) -> SourceStatus:
        return self.status
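For readers unfamiliar with Cloud Audit BigQuery logs: `next_record` keys off the `jobChange` event nested inside each audit log entry's payload. A hand-written sketch of the payload shape it navigates; the field names follow the parsing above, while the values are illustrative and not captured from a real log:

    # Illustrative payload shape only; all values are made up.
    sample_payload = {
        "jobChange": {
            "job": {
                "jobConfig": {
                    "queryConfig": {
                        "query": "SELECT * FROM shop.orders",
                        "statementType": "SELECT",
                        "priority": "QUERY_INTERACTIVE",
                        "destinationTable": "projects/p/datasets/d/tables/t",
                    },
                },
                "jobStats": {
                    "startTime": "2021-08-19T10:00:00.000Z",
                    "endTime": "2021-08-19T10:00:03.000Z",
                },
            },
        },
    }

    query_config = sample_payload["jobChange"]["job"]["jobConfig"]["queryConfig"]
    job_stats = sample_payload["jobChange"]["job"]["jobStats"]
    print(query_config["statementType"], job_stats["startTime"])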
@@ -48,13 +48,14 @@ class SQLSourceStatus(SourceStatus):
     success: List[str] = field(default_factory=list)
     failures: List[str] = field(default_factory=list)
     warnings: List[str] = field(default_factory=list)
+    filtered: List[str] = field(default_factory=list)

     def scanned(self, table_name: str) -> None:
         self.success.append(table_name)
         logger.info('Table Scanned: {}'.format(table_name))

-    def filtered(self, table_name: str, err: str, dataset_name: str = None, col_type: str = None) -> None:
-        self.warnings.append(table_name)
+    def filter(self, table_name: str, err: str, dataset_name: str = None, col_type: str = None) -> None:
+        self.filtered.append(table_name)
         logger.warning("Dropped Table {} due to {}".format(table_name, err))
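The rename to `filter` is not cosmetic: once the dataclass grows a `filtered` list, a method of the same name would rebind the class attribute and silently replace the `field(default_factory=list)` default with the function object. A minimal reproduction of the clash, assuming a hypothetical `Status` dataclass:

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class Status:
        filtered: List[str] = field(default_factory=list)

        def filtered(self, name: str) -> None:  # rebinds the class attribute
            self.filtered.append(name)

    s = Status()
    print(s.filtered)  # <function Status.filtered ...> - the list default is gone
    # s.filtered.append("t")  # AttributeError: 'function' has no attribute 'append'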
@@ -194,7 +195,7 @@ class SQLSource(Source):
         inspector = inspect(self.engine)
         for schema in inspector.get_schema_names():
             if not self.sql_config.filter_pattern.included(schema):
-                self.status.filtered(schema, "Schema pattern not allowed")
+                self.status.filter(schema, "Schema pattern not allowed")
                 continue
             logger.debug("total tables {}".format(inspector.get_table_names(schema)))
             if self.config.include_tables:
@@ -209,7 +210,7 @@ class SQLSource(Source):
             try:
                 schema, table_name = self.standardize_schema_table_names(schema, table_name)
                 if not self.sql_config.filter_pattern.included(table_name):
-                    self.status.filtered('{}.{}'.format(self.config.get_service_name(), table_name),
+                    self.status.filter('{}.{}'.format(self.config.get_service_name(), table_name),
                                          "Table pattern not allowed")
                     continue
                 self.status.scanned('{}.{}'.format(self.config.get_service_name(), table_name))
@@ -230,7 +231,7 @@ class SQLSource(Source):
                 yield table_and_db
             except ValidationError as err:
                 logger.error(err)
-                self.status.filtered('{}.{}'.format(self.config.service_name, table_name),
+                self.status.filter('{}.{}'.format(self.config.service_name, table_name),
                                      "Validation error")
                 continue
@@ -240,7 +241,7 @@ class SQLSource(Source):
         for view_name in inspector.get_view_names(schema):
             try:
                 if not self.sql_config.filter_pattern.included(view_name):
-                    self.status.filtered('{}.{}'.format(self.config.get_service_name(), view_name),
+                    self.status.filter('{}.{}'.format(self.config.get_service_name(), view_name),
                                          "View pattern not allowed")
                     continue
                 try:
@@ -32,7 +32,11 @@ def get_table_column_join(table, table_aliases, joins):
     for join in joins:
         try:
             if "." in join:
-                jtable, column = join.split(".")
+                if join.count(".") < 3:
+                    jtable, column = join.split(".")
+                else:
+                    jtable, column = join.split(".")[2:]

                 if table == jtable or jtable in table_aliases:
                     table_column = TableColumn(table=table_aliases[jtable] if jtable in table_aliases else jtable,
                                                column=column)
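The new branch handles fully qualified joins: a plain `table.column` string keeps the old two-way split, while a `database.schema.table.column` string (three dots) would previously crash the unpack with "too many values". The added path keeps the last two segments:

    # Walk-through of the two branches added above.
    for join in ["orders.customer_id", "warehouse.public.orders.customer_id"]:
        if join.count(".") < 3:
            jtable, column = join.split(".")
        else:
            jtable, column = join.split(".")[2:]
        print(jtable, column)
    # orders customer_id
    # orders customer_id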