Merge pull request #307 from open-metadata/bigqueryusage-pipeline

Bigquery Usage Pipeline added
2025-12-26 23:18:25 +00:00 · 2021-08-26 22:24:09 -07:00 · 2021-08-26 22:24:09 -07:00 · c36727aaf9
commit c36727aaf9
parent aa0ab362e6 3f4b938b20
7 changed files with 151 additions and 10 deletions
--- a/ingestion/examples/workflows/bigquery_usage.json
+++ b/ingestion/examples/workflows/bigquery_usage.json
@ -0,0 +1,48 @@
+{
+  "source": {
+    "type": "bigquery-usage",
+    "config": {
+      "project_id": "project_id",
+      "host_port": "https://bigquery.googleapis.com",
+      "username": "gcpuser@project_id.iam.gserviceaccount.com",
+      "service_name": "gcp_bigquery",
+      "duration": 2,
+      "options": {
+        "credentials_path": "examples/creds/bigquery-cred.json"
+      },
+      "service_type": "BigQuery"
+    }
+  },
+  "processor": {
+    "type": "query-parser",
+    "config": {
+      "filter": ""
+    }
+  },
+  "stage": {
+    "type": "table-usage",
+    "config": {
+      "filename": "/tmp/bigquery_usage"
+    }
+  },
+  "bulk_sink": {
+    "type": "metadata-usage",
+    "config": {
+      "filename": "/tmp/bigquery_usage"
+    }
+  },
+  "metadata_server": {
+    "type": "metadata-server",
+    "config": {
+      "api_endpoint": "http://localhost:8585/api",
+      "auth_provider_type": "no-auth"
+    }
+  },
+  "cron": {
+    "minute": "*/5",
+    "hour": null,
+    "day": null,
+    "month": null,
+    "day_of_week": null
+  }
+}
--- a/ingestion/examples/workflows/redshift.json
+++ b/ingestion/examples/workflows/redshift.json
@ -9,7 +9,7 @@
      "service_name": "aws_redshift",
      "service_type": "Redshift",
      "filter_pattern": {
-        "excludes": ["information_schema.*"]
+        "excludes": ["information_schema.*","[\\w]*event_vw.*"]
      }
    }
  },
--- a/ingestion/src/metadata/ingestion/sink/metadata_rest_tables.py
+++ b/ingestion/src/metadata/ingestion/sink/metadata_rest_tables.py
@ -78,9 +78,9 @@ class MetadataRestTablesSink(Sink):
                '{}.{}'.format(table_and_db.database.name.__root__, created_table.name.__root__))
        except (APIError, ValidationError) as err:
            logger.error(
-                "Failed to ingest table {} in database {} ".format(table_and_db.table.name, table_and_db.database.name))
+                "Failed to ingest table {} in database {} ".format(table_and_db.table.name.__root__, table_and_db.database.name.__root__))
            logger.error(err)
-            self.status.failure(table_and_db.table.name)
+            self.status.failure(table_and_db.table.name.__root__)

    def get_status(self):
        return self.status
--- a/ingestion/src/metadata/ingestion/source/bigquery.py
+++ b/ingestion/src/metadata/ingestion/source/bigquery.py
@ -26,6 +26,7 @@ from ..ometa.auth_provider import MetadataServerConfig
 class BigQueryConfig(SQLConnectionConfig, SQLSource):
    scheme = "bigquery"
    project_id: Optional[str] = None
+    duration: int = 1

    def get_connection_url(self):
        if self.project_id:
--- a/ingestion/src/metadata/ingestion/source/bigquery_usage.py
+++ b/ingestion/src/metadata/ingestion/source/bigquery_usage.py
@ -0,0 +1,87 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements. See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License. You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+# This import verifies that the dependencies are available.
+import logging as log
+from metadata.ingestion.models.table_queries import TableQuery
+from google.cloud import logging
+import collections
+from datetime import datetime
+from metadata.ingestion.ometa.auth_provider import MetadataServerConfig
+from metadata.ingestion.api.source import Source, SourceStatus
+from typing import Dict, Any, Iterable
+from metadata.ingestion.source.sql_alchemy_helper import SQLSourceStatus
+from metadata.utils.helpers import get_start_and_end
+from metadata.ingestion.source.bigquery import BigQueryConfig
+
+logger = log.getLogger(__name__)
+
+
+class BigqueryUsageSource(Source):
+    SERVICE_TYPE = 'Bigquery'
+    scheme = "bigquery"
+
+    def __init__(self, config, metadata_config, ctx):
+        super().__init__(ctx)
+
+        self.config = config
+        self.project_id = self.config.project_id
+        self.logger_name = "cloudaudit.googleapis.com%2Fdata_access"
+        self.status = SQLSourceStatus()
+
+    def get_connection_url(self):
+        if self.project_id:
+            print(f"{self.scheme}://{self.project_id}")
+            return f"{self.scheme}://{self.project_id}"
+        return f"{self.scheme}://"
+
+    @classmethod
+    def create(cls, config_dict, metadata_config_dict, ctx):
+        config = BigQueryConfig.parse_obj(config_dict)
+        metadata_config = MetadataServerConfig.parse_obj(metadata_config_dict)
+        return cls(config, metadata_config, ctx)
+
+    def prepare(self):
+        pass
+
+    def next_record(self) -> Iterable[TableQuery]:
+        logging_client = logging.Client()
+        logger = logging_client.logger(self.logger_name)
+        print("Listing entries for logger {}:".format(logger.name))
+        start, end = get_start_and_end(self.config.duration)
+        for entry in logger.list_entries():
+            timestamp = entry.timestamp.isoformat()
+            timestamp = datetime.strptime(timestamp[0:10], "%Y-%m-%d")
+            if timestamp >= start and timestamp <= end:
+                if("query" in str(entry.payload)) and type(entry.payload) == collections.OrderedDict:
+                    payload = list(entry.payload.items())[-1][1]
+                    if "jobChange" in payload:
+                        print(f"\nEntries: {payload}")
+                        queryConfig = payload['jobChange']['job']['jobConfig']['queryConfig']
+                        jobStats = payload['jobChange']['job']['jobStats']
+                        statementType = queryConfig['statementType'] if hasattr(queryConfig, 'statementType') else ''
+                        database = queryConfig['destinationTable'] if hasattr(queryConfig, 'destinationTable') is not None else ''
+                        analysis_date = str(datetime.strptime(jobStats['startTime'][0:19], "%Y-%m-%dT%H:%M:%S").strftime('%Y-%m-%d %H:%M:%S'))
+                        tq = TableQuery(statementType,
+                                        queryConfig['priority'], 0, 0, 0, str(jobStats['startTime']),
+                                        str(jobStats['endTime']), analysis_date, self.config.duration, str(
+                                            database), 0, queryConfig['query'])
+                        yield tq
+
+    def close(self):
+        pass
+
+    def get_status(self) -> SourceStatus:
+        return self.status
--- a/ingestion/src/metadata/ingestion/source/sql_source.py
+++ b/ingestion/src/metadata/ingestion/source/sql_source.py
@ -48,13 +48,14 @@ class SQLSourceStatus(SourceStatus):
    success: List[str] = field(default_factory=list)
    failures: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
+    filtered: List[str] = field(default_factory=list)

    def scanned(self, table_name: str) -> None:
        self.success.append(table_name)
        logger.info('Table Scanned: {}'.format(table_name))

-    def filtered(self, table_name: str, err: str, dataset_name: str = None, col_type: str = None) -> None:
-        self.warnings.append(table_name)
+    def filter(self, table_name: str, err: str, dataset_name: str = None, col_type: str = None) -> None:
+        self.filtered.append(table_name)
        logger.warning("Dropped Table {} due to {}".format(table_name, err))


@ -194,7 +195,7 @@ class SQLSource(Source):
        inspector = inspect(self.engine)
        for schema in inspector.get_schema_names():
            if not self.sql_config.filter_pattern.included(schema):
-                self.status.filtered(schema, "Schema pattern not allowed")
+                self.status.filter(schema, "Schema pattern not allowed")
                continue
            logger.debug("total tables {}".format(inspector.get_table_names(schema)))
            if self.config.include_tables:
@ -209,7 +210,7 @@ class SQLSource(Source):
            try:
                schema, table_name = self.standardize_schema_table_names(schema, table_name)
                if not self.sql_config.filter_pattern.included(table_name):
-                    self.status.filtered('{}.{}'.format(self.config.get_service_name(), table_name),
+                    self.status.filter('{}.{}'.format(self.config.get_service_name(), table_name),
                                         "Table pattern not allowed")
                    continue
                self.status.scanned('{}.{}'.format(self.config.get_service_name(), table_name))
@ -230,7 +231,7 @@ class SQLSource(Source):
                yield table_and_db
            except ValidationError as err:
                logger.error(err)
-                self.status.filtered('{}.{}'.format(self.config.service_name, table_name),
+                self.status.filter('{}.{}'.format(self.config.service_name, table_name),
                                     "Validation error")
                continue

@ -240,7 +241,7 @@ class SQLSource(Source):
        for view_name in inspector.get_view_names(schema):
            try:
                if not self.sql_config.filter_pattern.included(view_name):
-                    self.status.filtered('{}.{}'.format(self.config.get_service_name(), view_name),
+                    self.status.filter('{}.{}'.format(self.config.get_service_name(), view_name),
                                         "View pattern not allowed")
                    continue
                try:
--- a/ingestion/src/metadata/ingestion/stage/table_usage.py
+++ b/ingestion/src/metadata/ingestion/stage/table_usage.py
@ -32,7 +32,11 @@ def get_table_column_join(table, table_aliases, joins):
    for join in joins:
        try:
            if "." in join:
-                jtable, column = join.split(".")
+                if join.count(".") < 3:
+                    jtable, column = join.split(".")
+                else:
+                    jtable, column = join.split(".")[2:]
+
            if table == jtable or jtable in table_aliases:
                table_column = TableColumn(table=table_aliases[jtable] if jtable in table_aliases else jtable,
                                           column=column)