Mirror of https://github.com/datahub-project/datahub.git (synced 2025-08-15 20:57:15 +00:00)
fix(bigquery): incorrect lineage when views are present (#4568)

* fix(bigquery): incorrect lineage when views are present

Co-authored-by: Ravindra Lanka <rlanka@acryl.io>

parent 3a5cf8eded
commit 5ebb37ab4c
@@ -145,15 +145,21 @@ plugins: Dict[str, Set[str]] = {
     "airflow": {
         "apache-airflow >= 1.10.2",
     },
-    "great-expectations": sql_common | {"sqllineage==1.3.3"},
+    "great-expectations": sql_common | {"sqllineage==1.3.4"},
     # Source plugins
     # PyAthena is pinned with exact version because we use private method in PyAthena
     "athena": sql_common | {"PyAthena[SQLAlchemy]==2.4.1"},
     "azure-ad": set(),
-    "bigquery": sql_common | bigquery_common | {"sqlalchemy-bigquery>=1.4.1", "sqlparse"},
+    "bigquery": sql_common
+    | bigquery_common
+    | {"sqlalchemy-bigquery>=1.4.1", "sqllineage==1.3.4", "sqlparse"},
     "bigquery-usage": bigquery_common | usage_common | {"cachetools"},
     "clickhouse": sql_common | {"clickhouse-sqlalchemy==0.1.8"},
-    "clickhouse-usage": sql_common | usage_common | {"clickhouse-sqlalchemy==0.1.8", },
+    "clickhouse-usage": sql_common
+    | usage_common
+    | {
+        "clickhouse-sqlalchemy==0.1.8",
+    },
     "datahub-lineage-file": set(),
     "datahub-business-glossary": set(),
     "data-lake": {*data_lake_base, *data_lake_profiling},
@@ -181,9 +187,9 @@ plugins: Dict[str, Set[str]] = {
     "looker": looker_common,
     # lkml>=1.1.2 is required to support the sql_preamble expression in LookML
     "lookml": looker_common
-    | {"lkml>=1.1.2", "sql-metadata==2.2.2", "sqllineage==1.3.3"},
-    "metabase": {"requests", "sqllineage==1.3.3"},
-    "mode": {"requests", "sqllineage==1.3.3", "tenacity>=8.0.1"},
+    | {"lkml>=1.1.2", "sql-metadata==2.2.2", "sqllineage==1.3.4"},
+    "metabase": {"requests", "sqllineage==1.3.4"},
+    "mode": {"requests", "sqllineage==1.3.4", "tenacity>=8.0.1"},
     "mongodb": {"pymongo>=3.11", "packaging"},
     "mssql": sql_common | {"sqlalchemy-pytds>=0.3"},
     "mssql-odbc": sql_common | {"pyodbc"},
@@ -193,20 +199,24 @@ plugins: Dict[str, Set[str]] = {
     "okta": {"okta~=1.7.0"},
     "oracle": sql_common | {"cx_Oracle"},
     "postgres": sql_common | {"psycopg2-binary", "GeoAlchemy2"},
-    "redash": {"redash-toolbelt", "sql-metadata", "sqllineage==1.3.3"},
+    "redash": {"redash-toolbelt", "sql-metadata", "sqllineage==1.3.4"},
     "redshift": sql_common
-    | {"sqlalchemy-redshift", "psycopg2-binary", "GeoAlchemy2", "sqllineage==1.3.3"},
+    | {"sqlalchemy-redshift", "psycopg2-binary", "GeoAlchemy2", "sqllineage==1.3.4"},
     "redshift-usage": sql_common
     | usage_common
     | {
         "sqlalchemy-redshift",
         "psycopg2-binary",
         "GeoAlchemy2",
-        "sqllineage==1.3.3",
+        "sqllineage==1.3.4",
     },
     "sagemaker": aws_common,
     "snowflake": snowflake_common,
-    "snowflake-usage": snowflake_common | usage_common | {"more-itertools>=8.12.0", },
+    "snowflake-usage": snowflake_common
+    | usage_common
+    | {
+        "more-itertools>=8.12.0",
+    },
     "sqlalchemy": sql_common,
     "superset": {
         "requests",
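The plugin extras above are plain Python sets combined with the `|` union operator, which is why the bigquery entry can be reformatted across several lines while remaining a single expression. A minimal sketch of that composition; the contents of sql_common and bigquery_common below are placeholders, not the real dependency lists:

    # Placeholder stand-ins for the shared dependency sets defined earlier in setup.py.
    sql_common = {"sqlalchemy>=1.3.24"}
    bigquery_common = {"google-cloud-logging"}

    # Same union shape as the diff above: the bigquery extra now also pins sqllineage==1.3.4.
    bigquery_extra = (
        sql_common
        | bigquery_common
        | {"sqlalchemy-bigquery>=1.4.1", "sqllineage==1.3.4", "sqlparse"}
    )
    print(sorted(bigquery_extra))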
@@ -157,6 +157,7 @@ As a SQL-based service, the Athena integration is also supported by our SQL prof
 | `domain.domain_key.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching.There can be multiple domain key specified. |
 | `lineage_client_project_id` | | None | The project to use when creating the BigQuery Client. If left empty, the required `project_id` will be used. This is helpful in case the default project_id is not used for querying. |
 | `use_v2_audit_metadata` | | `False` | Whether to use `BigQuery audit logs` to get the lineage or not |
+| `upstream_lineage_in_report` | | `False` | Useful for debugging lineage information. Set to `True` to see the raw lineage created internally. |
 
 
 The following parameters are only relevant if include_table_lineage is set to true:
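For reference, a hypothetical source configuration exercising the new flag, written as a Python dict whose keys mirror the option names documented in the table above (the project id is a placeholder):

    bigquery_source_config = {
        "project_id": "my-gcp-project",       # placeholder project
        "include_table_lineage": True,        # lineage extraction must be on for the options below to matter
        "use_v2_audit_metadata": False,
        "upstream_lineage_in_report": True,   # surface the raw lineage map in the ingestion report for debugging
    }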
@@ -53,6 +53,7 @@ from datahub.metadata.schema_classes import (
     UpstreamClass,
     UpstreamLineageClass,
 )
+from datahub.utilities.sql_parser import DefaultSQLParser
 
 logger = logging.getLogger(__name__)
 
@@ -69,7 +70,11 @@ AND
         AND NOT
         protoPayload.serviceData.jobCompletedEvent.job.jobStatus.error.code:*
         AND
-        protoPayload.serviceData.jobCompletedEvent.job.jobStatistics.referencedTables:*
+        (
+            protoPayload.serviceData.jobCompletedEvent.job.jobStatistics.referencedTables:*
+            OR
+            protoPayload.serviceData.jobCompletedEvent.job.jobStatistics.referencedViews:*
+        )
     )
 )
 AND
@@ -91,7 +96,11 @@ AND
     AND
     protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE"
     AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:*
-    AND protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:*
+    AND (
+        protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:*
+        OR
+        protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedViews:*
+    )
 )
 AND
 timestamp >= "{start_time}"
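Both audit-log filters are Python template strings with a {start_time} placeholder (and a matching end-time bound further down, outside these hunks) substituted at run time; the new parenthesised OR also keeps jobs whose statistics list only referencedViews. A sketch of how such a template is filled, with the template name and time window invented here purely for illustration:

    from datetime import datetime, timedelta, timezone

    # Hypothetical stand-in for the module-level filter template; the real one is much longer.
    FILTER_TEMPLATE = 'timestamp >= "{start_time}" AND timestamp < "{end_time}"'

    end = datetime.now(timezone.utc)
    start = end - timedelta(days=1)
    log_filter = FILTER_TEMPLATE.format(
        start_time=start.strftime("%Y-%m-%dT%H:%M:%SZ"),
        end_time=end.strftime("%Y-%m-%dT%H:%M:%SZ"),
    )
    print(log_filter)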
@@ -493,25 +502,57 @@ class BigQuerySource(SQLAlchemySource):
 
     def _create_lineage_map(self, entries: Iterable[QueryEvent]) -> Dict[str, Set[str]]:
         lineage_map: Dict[str, Set[str]] = collections.defaultdict(set)
-        num_entries: int = 0
-        num_skipped_entries: int = 0
+        self.report.num_total_lineage_entries = 0
+        self.report.num_skipped_lineage_entries_missing_data = 0
+        self.report.num_skipped_lineage_entries_not_allowed = 0
+        self.report.num_skipped_lineage_entries_other = 0
         for e in entries:
-            num_entries += 1
-            if e.destinationTable is None or not e.referencedTables:
-                num_skipped_entries += 1
+            self.report.num_total_lineage_entries += 1
+            if e.destinationTable is None or not (
+                e.referencedTables or e.referencedViews
+            ):
+                self.report.num_skipped_lineage_entries_missing_data += 1
                 continue
-            entry_consumed: bool = False
+            # Skip if schema/table pattern don't allow the destination table
+            destination_table_str = str(e.destinationTable.remove_extras())
+            destination_table_str_parts = destination_table_str.split("/")
+            if not self.config.schema_pattern.allowed(
+                destination_table_str_parts[3]
+            ) or not self.config.table_pattern.allowed(destination_table_str_parts[-1]):
+                self.report.num_skipped_lineage_entries_not_allowed += 1
+                continue
+            has_table = False
             for ref_table in e.referencedTables:
-                destination_table_str = str(e.destinationTable.remove_extras())
                 ref_table_str = str(ref_table.remove_extras())
                 if ref_table_str != destination_table_str:
                     lineage_map[destination_table_str].add(ref_table_str)
-                    entry_consumed = True
-            if not entry_consumed:
-                num_skipped_entries += 1
-        logger.info(
-            f"Creating lineage map: total number of entries={num_entries}, number skipped={num_skipped_entries}."
-        )
+                    has_table = True
+            has_view = False
+            for ref_view in e.referencedViews:
+                ref_view_str = str(ref_view.remove_extras())
+                if ref_view_str != destination_table_str:
+                    lineage_map[destination_table_str].add(ref_view_str)
+                    has_view = True
+            if has_table and has_view:
+                # If there is a view being referenced then bigquery sends both the view as well as underlying table
+                # in the references. There is no distinction between direct/base objects accessed. So doing sql parsing
+                # to ensure we only use direct objects accessed for lineage
+                parser = DefaultSQLParser(e.query)
+                referenced_objs = set(
+                    map(lambda x: x.split(".")[-1], parser.get_tables())
+                )
+                curr_lineage_str = lineage_map[destination_table_str]
+                new_lineage_str = set()
+                for lineage_str in curr_lineage_str:
+                    name = lineage_str.split("/")[-1]
+                    if name in referenced_objs:
+                        new_lineage_str.add(lineage_str)
+                lineage_map[destination_table_str] = new_lineage_str
+            if not (has_table or has_view):
+                self.report.num_skipped_lineage_entries_other += 1
+
+        if self.config.upstream_lineage_in_report:
+            self.report.upstream_lineage = lineage_map
         return lineage_map
 
     def get_latest_partition(
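The comment inside the new branch is the crux of the fix: when a query selects from a view, BigQuery reports both the view and its underlying tables, so the SQL text is parsed and only objects the query names directly are kept. A standalone sketch of that filtering step, using the DefaultSQLParser import added above and plain strings in place of BigQueryTableRef values (the sample query and refs are hypothetical):

    from datahub.utilities.sql_parser import DefaultSQLParser

    def direct_upstreams(query: str, candidate_refs: set) -> set:
        # Objects the SQL text references directly, keyed by their bare table/view name.
        referenced_objs = {t.split(".")[-1] for t in DefaultSQLParser(query).get_tables()}
        # Keep only candidates whose last path component matches a directly referenced name.
        return {ref for ref in candidate_refs if ref.split("/")[-1] in referenced_objs}

    refs = {
        "projects/p/datasets/d/tables/my_view",     # the view the query actually reads
        "projects/p/datasets/d/tables/base_table",  # its underlying table, also reported by BigQuery
    }
    print(direct_upstreams("SELECT * FROM d.my_view", refs))  # keeps only the view ref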
@@ -688,7 +729,7 @@ WHERE
         for ref_table in self.lineage_metadata[str(bq_table)]:
             upstream_table = BigQueryTableRef.from_string_name(ref_table)
             if upstream_table.is_temporary_table():
-                # making sure we don't process a table twice and not get into a recurisve loop
+                # making sure we don't process a table twice and not get into a recursive loop
                 if ref_table in tables_seen:
                     logger.debug(
                         f"Skipping table {ref_table} because it was seen already"
@@ -4,7 +4,7 @@ import json
 import logging
 import os
 import re
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import datetime
 from typing import Any, Dict, Iterable, List, MutableMapping, Optional, Union, cast
 
@@ -300,14 +300,13 @@ class QueryEvent:
 
     timestamp: datetime
     actor_email: str
 
     query: str
-    statementType: Optional[str]
-    destinationTable: Optional[BigQueryTableRef]
-    referencedTables: Optional[List[BigQueryTableRef]]
-    jobName: Optional[str]
-    payload: Any
+    statementType: Optional[str] = None
+    destinationTable: Optional[BigQueryTableRef] = None
+    referencedTables: List[BigQueryTableRef] = field(default_factory=list)
+    referencedViews: List[BigQueryTableRef] = field(default_factory=list)
+    jobName: Optional[str] = None
+    payload: Optional[Dict] = None
 
     @staticmethod
     def get_missing_key_entry(entry: AuditLogEntry) -> Optional[str]:
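With these defaults in place a QueryEvent needs only its three required fields at construction time and can be enriched afterwards, which is the pattern the rewritten from_entry / from_exported_bigquery_audit_metadata / from_entry_v2 constructors below follow. A small illustration with placeholder values, assuming QueryEvent is in scope as it is inside this module:

    from datetime import datetime, timezone

    event = QueryEvent(
        timestamp=datetime.now(timezone.utc),   # placeholder timestamp
        actor_email="analyst@example.com",      # placeholder principal
        query="SELECT * FROM d.my_view",        # placeholder SQL text
    )
    event.statementType = "SELECT"
    # referencedTables / referencedViews start as empty lists thanks to field(default_factory=list)
    assert event.referencedTables == [] and event.referencedViews == []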
@@ -323,48 +322,49 @@ class QueryEvent:
 
     @classmethod
     def from_entry(cls, entry: AuditLogEntry) -> "QueryEvent":
-        user = entry.payload["authenticationInfo"]["principalEmail"]
-
-        job = entry.payload["serviceData"]["jobCompletedEvent"]["job"]
-        jobName = _job_name_ref(
+        job: Dict = entry.payload["serviceData"]["jobCompletedEvent"]["job"]
+        job_query_conf: Dict = job["jobConfiguration"]["query"]
+        # basic query_event
+        query_event = QueryEvent(
+            timestamp=entry.timestamp,
+            actor_email=entry.payload["authenticationInfo"]["principalEmail"],
+            query=job_query_conf["query"],
+        )
+        # jobName
+        query_event.jobName = _job_name_ref(
             job.get("jobName", {}).get("projectId"), job.get("jobName", {}).get("jobId")
         )
-        rawQuery = job["jobConfiguration"]["query"]["query"]
-
-        rawDestTable = job["jobConfiguration"]["query"]["destinationTable"]
-        destinationTable = None
-        if rawDestTable:
-            destinationTable = BigQueryTableRef.from_spec_obj(rawDestTable)
-
-        try:
-            statementType = job["jobConfiguration"]["query"]["statementType"]
-        except KeyError:
-            statementType = None
-
-        rawRefTables = job["jobStatistics"].get("referencedTables")
-        referencedTables = None
-        if rawRefTables:
-            referencedTables = [
-                BigQueryTableRef.from_spec_obj(spec) for spec in rawRefTables
+        # destinationTable
+        raw_dest_table = job_query_conf.get("destinationTable")
+        if raw_dest_table:
+            query_event.destinationTable = BigQueryTableRef.from_spec_obj(
+                raw_dest_table
+            )
+        # statementType
+        query_event.statementType = job_query_conf.get("statementType")
+        # referencedTables
+        job_stats: Dict = job["jobStatistics"]
+        raw_ref_tables = job_stats.get("referencedTables")
+        if raw_ref_tables:
+            query_event.referencedTables = [
+                BigQueryTableRef.from_spec_obj(spec) for spec in raw_ref_tables
             ]
-
-        queryEvent = QueryEvent(
-            timestamp=entry.timestamp,
-            actor_email=user,
-            query=rawQuery,
-            statementType=statementType,
-            destinationTable=destinationTable,
-            referencedTables=referencedTables,
-            jobName=jobName,
-            payload=entry.payload if DEBUG_INCLUDE_FULL_PAYLOADS else None,
-        )
-        if not jobName:
+        # referencedViews
+        raw_ref_views = job_stats.get("referencedViews")
+        if raw_ref_views:
+            query_event.referencedViews = [
+                BigQueryTableRef.from_spec_obj(spec) for spec in raw_ref_views
+            ]
+        # payload
+        query_event.payload = entry.payload if DEBUG_INCLUDE_FULL_PAYLOADS else None
+
+        if not query_event.jobName:
             logger.debug(
                 "jobName from query events is absent. "
                 "Auditlog entry - {logEntry}".format(logEntry=entry)
             )
 
-        return queryEvent
+        return query_event
 
     @staticmethod
     def get_missing_key_exported_bigquery_audit_metadata(
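For orientation, the key paths the rewritten from_entry reads from a v1 audit-log payload are sketched below as a minimal hypothetical dict; real AuditData entries carry many more fields, and the table/view specs (elided with ...) are the objects handed to BigQueryTableRef.from_spec_obj:

    minimal_payload = {
        "authenticationInfo": {"principalEmail": "analyst@example.com"},
        "serviceData": {
            "jobCompletedEvent": {
                "job": {
                    "jobName": {"projectId": "my-gcp-project", "jobId": "job_123"},
                    "jobConfiguration": {
                        "query": {
                            "query": "SELECT * FROM d.my_view",
                            "statementType": "SELECT",
                        }
                    },
                    "jobStatistics": {
                        "referencedTables": [...],  # table specs, elided
                        "referencedViews": [...],   # view specs, elided
                    },
                }
            }
        },
    }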
@@ -376,45 +376,44 @@ class QueryEvent:
     def from_exported_bigquery_audit_metadata(
         cls, row: BigQueryAuditMetadata
     ) -> "QueryEvent":
-        timestamp = row["timestamp"]
-        payload = row["protoPayload"]
-        metadata = json.loads(row["metadata"])
-
-        user = payload["authenticationInfo"]["principalEmail"]
-
-        job = metadata["jobChange"]["job"]
-
-        job_name = job.get("jobName")
-        raw_query = job["jobConfig"]["queryConfig"]["query"]
-
-        raw_dest_table = job["jobConfig"]["queryConfig"].get("destinationTable")
-        destination_table = None
+
+        payload: Dict = row["protoPayload"]
+        metadata: Dict = json.loads(row["metadata"])
+        job: Dict = metadata["jobChange"]["job"]
+        query_config: Dict = job["jobConfig"]["queryConfig"]
+        # basic query_event
+        query_event = QueryEvent(
+            timestamp=row["timestamp"],
+            actor_email=payload["authenticationInfo"]["principalEmail"],
+            query=query_config["query"],
+        )
+        # jobName
+        query_event.jobName = job.get("jobName")
+        # destinationTable
+        raw_dest_table = query_config.get("destinationTable")
         if raw_dest_table:
-            destination_table = BigQueryTableRef.from_string_name(raw_dest_table)
-
-        raw_ref_tables = job["jobStats"]["queryStats"].get("referencedTables")
-        referenced_tables = None
+            query_event.destinationTable = BigQueryTableRef.from_string_name(
+                raw_dest_table
+            )
+        # referencedTables
+        query_stats: Dict = job["jobStats"]["queryStats"]
+        raw_ref_tables = query_stats.get("referencedTables")
         if raw_ref_tables:
-            referenced_tables = [
+            query_event.referencedTables = [
                 BigQueryTableRef.from_string_name(spec) for spec in raw_ref_tables
             ]
-
-        try:
-            statementType = job["jobConfiguration"]["query"]["statementType"]
-        except KeyError:
-            statementType = None
-
-        query_event = QueryEvent(
-            timestamp=timestamp,
-            actor_email=user,
-            query=raw_query,
-            statementType=statementType,
-            destinationTable=destination_table,
-            referencedTables=referenced_tables,
-            jobName=job_name,
-            payload=payload if DEBUG_INCLUDE_FULL_PAYLOADS else None,
-        )
-        if not job_name:
+        # referencedViews
+        raw_ref_views = query_stats.get("referencedViews")
+        if raw_ref_views:
+            query_event.referencedViews = [
+                BigQueryTableRef.from_string_name(spec) for spec in raw_ref_views
+            ]
+        # statementType
+        query_event.statementType = query_config.get("statementType")
+        # payload
+        query_event.payload = payload if DEBUG_INCLUDE_FULL_PAYLOADS else None
+
+        if not query_event.jobName:
             logger.debug(
                 "jobName from query events is absent. "
                 "BigQueryAuditMetadata entry - {logEntry}".format(logEntry=row)
@@ -424,45 +423,42 @@ class QueryEvent:
     @classmethod
     def from_entry_v2(cls, row: BigQueryAuditMetadata) -> "QueryEvent":
-        timestamp = row.timestamp
-        payload = row.payload
-        metadata = payload["metadata"]
-
-        user = payload["authenticationInfo"]["principalEmail"]
-
-        job = metadata["jobChange"]["job"]
-
-        job_name = job.get("jobName")
-        raw_query = job["jobConfig"]["queryConfig"]["query"]
-
-        raw_dest_table = job["jobConfig"]["queryConfig"].get("destinationTable")
-        destination_table = None
+        payload: Dict = row.payload
+        metadata: Dict = payload["metadata"]
+        job: Dict = metadata["jobChange"]["job"]
+        query_config: Dict = job["jobConfig"]["queryConfig"]
+        # basic query_event
+        query_event = QueryEvent(
+            timestamp=row.timestamp,
+            actor_email=payload["authenticationInfo"]["principalEmail"],
+            query=query_config["query"],
+        )
+        query_event.jobName = job.get("jobName")
+        # destinationTable
+        raw_dest_table = query_config.get("destinationTable")
         if raw_dest_table:
-            destination_table = BigQueryTableRef.from_string_name(raw_dest_table)
-
-        raw_ref_tables = job["jobStats"]["queryStats"].get("referencedTables")
-        referenced_tables = None
+            query_event.destinationTable = BigQueryTableRef.from_string_name(
+                raw_dest_table
+            )
+        # statementType
+        query_event.statementType = query_config.get("statementType")
+        # referencedTables
+        query_stats: Dict = job["jobStats"]["queryStats"]
+        raw_ref_tables = query_stats.get("referencedTables")
         if raw_ref_tables:
-            referenced_tables = [
+            query_event.referencedTables = [
                 BigQueryTableRef.from_string_name(spec) for spec in raw_ref_tables
             ]
-
-        try:
-            statementType = job["jobConfig"]["queryConfig"]["statementType"]
-        except KeyError:
-            statementType = None
-
-        query_event = QueryEvent(
-            timestamp=timestamp,
-            actor_email=user,
-            query=raw_query,
-            statementType=statementType,
-            destinationTable=destination_table,
-            referencedTables=referenced_tables,
-            jobName=job_name,
-            payload=payload if DEBUG_INCLUDE_FULL_PAYLOADS else None,
-        )
-        if not job_name:
+        # referencedViews
+        raw_ref_views = query_stats.get("referencedViews")
+        if raw_ref_views:
+            query_event.referencedViews = [
+                BigQueryTableRef.from_string_name(spec) for spec in raw_ref_views
+            ]
+        # payload
+        query_event.payload = payload if DEBUG_INCLUDE_FULL_PAYLOADS else None
+
+        if not query_event.jobName:
             logger.debug(
                 "jobName from query events is absent. "
                 "BigQueryAuditMetadata entry - {logEntry}".format(logEntry=row)
@@ -511,17 +507,22 @@ class BigQueryUsageSource(Source):
             Iterable[MetadataWorkUnit], last_updated_work_units_uncasted
         )
         if self.config.include_operational_stats:
+            self.report.num_operational_stats_workunits_emitted = 0
             for wu in last_updated_work_units:
                 self.report.report_workunit(wu)
                 yield wu
+                self.report.num_operational_stats_workunits_emitted += 1
 
         hydrated_read_events = self._join_events_by_job_id(parsed_events)
         aggregated_info = self._aggregate_enriched_read_events(hydrated_read_events)
+
+        self.report.num_usage_workunits_emitted = 0
         for time_bucket in aggregated_info.values():
             for aggregate in time_bucket.values():
                 wu = self._make_usage_stat(aggregate)
                 self.report.report_workunit(wu)
                 yield wu
+                self.report.num_usage_workunits_emitted += 1
 
     def _make_bigquery_clients(self) -> List[GCPLoggingClient]:
         # See https://github.com/googleapis/google-cloud-python/issues/2674 for
@@ -17,20 +17,19 @@ class BigQueryConfig(BaseTimeWindowConfig, SQLAlchemyConfig):
     scheme: str = "bigquery"
     project_id: Optional[str] = None
     lineage_client_project_id: Optional[str] = None
-
     log_page_size: Optional[pydantic.PositiveInt] = 1000
     credential: Optional[BigQueryCredential]
     # extra_client_options, include_table_lineage and max_query_duration are relevant only when computing the lineage.
     extra_client_options: Dict[str, Any] = {}
     include_table_lineage: Optional[bool] = True
     max_query_duration: timedelta = timedelta(minutes=15)
-
     credentials_path: Optional[str] = None
     bigquery_audit_metadata_datasets: Optional[List[str]] = None
     use_exported_bigquery_audit_metadata: bool = False
     use_date_sharded_audit_log_tables: bool = False
     _credentials_path: Optional[str] = pydantic.PrivateAttr(None)
     use_v2_audit_metadata: Optional[bool] = False
+    upstream_lineage_in_report: bool = False
 
     def __init__(self, **data: Any):
         super().__init__(**data)
@@ -1,12 +1,16 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Optional
+from typing import Dict, Optional
 
 from datahub.ingestion.source.sql.sql_common import SQLSourceReport
 
 
 @dataclass
 class BigQueryReport(SQLSourceReport):
+    num_total_lineage_entries: Optional[int] = None
+    num_skipped_lineage_entries_missing_data: Optional[int] = None
+    num_skipped_lineage_entries_not_allowed: Optional[int] = None
+    num_skipped_lineage_entries_other: Optional[int] = None
     num_total_log_entries: Optional[int] = None
     num_parsed_log_entires: Optional[int] = None
     num_total_audit_entries: Optional[int] = None
@@ -20,3 +24,4 @@ class BigQueryReport(SQLSourceReport):
     log_entry_end_time: Optional[str] = None
     audit_start_time: Optional[str] = None
     audit_end_time: Optional[str] = None
+    upstream_lineage: Dict = field(default_factory=dict)
@@ -21,6 +21,8 @@ class BigQueryUsageSourceReport(SourceReport):
     deny_pattern: Optional[str] = None
     log_entry_start_time: Optional[str] = None
     log_entry_end_time: Optional[str] = None
+    num_usage_workunits_emitted: Optional[int] = None
+    num_operational_stats_workunits_emitted: Optional[int] = None
 
     def report_dropped(self, key: str) -> None:
         self.dropped_table[key] += 1