fix(ingestion): bigquery - extract temp table prefix as config, fix reporting, logging (#4766)

Aseem Bansal 2022-05-05 13:32:10 +05:30 committed by GitHub
parent 040e72af6b
commit bddfd89002
5 changed files with 29 additions and 12 deletions

View File

@@ -357,6 +357,7 @@ class BigQuerySource(SQLAlchemySource):
         logger.info(
             f"Built lineage map containing {len(self.lineage_metadata)} entries."
         )
+        logger.debug(f"lineage metadata is {self.lineage_metadata}")
 
     def _compute_bigquery_lineage_via_gcp_logging(
         self, lineage_client_project_id: Optional[str]
@@ -631,9 +632,6 @@ class BigQuerySource(SQLAlchemySource):
                     lineage_map[destination_table_str] = new_lineage_str
                 if not (has_table or has_view):
                     self.report.num_skipped_lineage_entries_other += 1
-        if self.config.upstream_lineage_in_report:
-            self.report.upstream_lineage = lineage_map
-
         return lineage_map
 
     def get_latest_partition(
@@ -823,7 +821,7 @@ WHERE
         assert self.lineage_metadata
         for ref_table in self.lineage_metadata[str(bq_table)]:
             upstream_table = BigQueryTableRef.from_string_name(ref_table)
-            if upstream_table.is_temporary_table():
+            if upstream_table.is_temporary_table(self.config.temp_table_dataset_prefix):
                 # make sure we don't process a table twice and don't get into a recursive loop
                 if ref_table in tables_seen:
                     logger.debug(
@@ -843,9 +841,11 @@ WHERE
         self, dataset_urn: str
     ) -> Optional[MetadataChangeProposalWrapper]:
         if self.lineage_metadata is None:
+            logger.debug("No lineage metadata so skipping getting mcp")
             return None
         dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(dataset_urn)
         if dataset_key is None:
+            logger.debug(f"No dataset_key for {dataset_urn} so skipping getting mcp")
             return None
         project_id, dataset_name, tablename = dataset_key.name.split(".")
         bq_table = BigQueryTableRef(project_id, dataset_name, tablename)
@@ -869,6 +869,12 @@ WHERE
                 ),
                 DatasetLineageTypeClass.TRANSFORMED,
             )
+            if self.config.upstream_lineage_in_report:
+                current_lineage_map: Set = self.report.upstream_lineage.get(
+                    str(bq_table), set()
+                )
+                current_lineage_map.add(str(upstream_table))
+                self.report.upstream_lineage[str(bq_table)] = current_lineage_map
             upstream_list.append(upstream_table_class)
 
         if upstream_list:
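
Instead of copying the entire lineage map into the report (the deletion above), the report now accumulates upstreams per destination table as each upstream is processed. A minimal sketch of that accumulation idiom; the names here are illustrative stand-ins, not the source's API:

from typing import Dict, Set

# Illustrative stand-in for report.upstream_lineage; not the actual report class.
upstream_lineage: Dict[str, Set[str]] = {}

def record_upstream(report: Dict[str, Set[str]], table: str, upstream: str) -> None:
    # Mirror the .get(..., set()) idiom from the diff: fetch-or-create the set,
    # add the upstream, then write it back under the destination table key.
    current: Set[str] = report.get(table, set())
    current.add(upstream)
    report[table] = current

record_upstream(upstream_lineage, "proj.dataset.table", "proj.dataset.upstream")
assert upstream_lineage == {"proj.dataset.table": {"proj.dataset.upstream"}}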

View File

@@ -260,9 +260,9 @@ class BigQueryTableRef:
             raise ValueError(f"invalid BigQuery table reference: {ref}")
         return cls(parts[1], parts[3], parts[5])
 
-    def is_temporary_table(self) -> bool:
-        # Temporary tables will have a dataset that begins with an underscore.
-        return self.dataset.startswith("_")
+    def is_temporary_table(self, prefix: str) -> bool:
+        # Temporary tables live in a dataset that begins with the configured prefix.
+        return self.dataset.startswith(prefix)
 
     def remove_extras(self) -> "BigQueryTableRef":
         # Handle partitioned and sharded tables.
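
With the prefix passed as an argument, temp-table detection reduces to a configurable startswith check. A self-contained sketch, assuming a simplified stand-in for BigQueryTableRef:

from dataclasses import dataclass

@dataclass
class TableRefSketch:  # simplified stand-in; the real BigQueryTableRef carries more parsing logic
    project: str
    dataset: str
    table: str

    def is_temporary_table(self, prefix: str) -> bool:
        # A dataset name starting with the configured prefix marks the table as temporary.
        return self.dataset.startswith(prefix)

ref = TableRefSketch("my-project", "_tmp_loads", "scratch")
assert ref.is_temporary_table("_")             # default prefix
assert ref.is_temporary_table("_tmp")          # custom prefix from config
assert not ref.is_temporary_table("staging_")  # non-matching prefix

Making the prefix configurable rather than hard-coding `_` lets pipelines that stage temp tables in ordinary, visible datasets still have those tables dropped from lineage and usage.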
@@ -971,7 +971,8 @@ class BigQueryUsageSource(Source):
     ) -> Iterable[Union[ReadEvent, QueryEvent, MetadataWorkUnit]]:
         self.report.num_read_events = 0
         self.report.num_query_events = 0
-        self.report.num_filtered_events = 0
+        self.report.num_filtered_read_events = 0
+        self.report.num_filtered_query_events = 0
 
         for entry in entries:
             event: Optional[Union[ReadEvent, QueryEvent]] = None
@@ -979,7 +980,7 @@ class BigQueryUsageSource(Source):
             if missing_read_entry is None:
                 event = ReadEvent.from_entry(entry)
                 if not self._is_table_allowed(event.resource):
-                    self.report.num_filtered_events += 1
+                    self.report.num_filtered_read_events += 1
                     continue
                 self.report.num_read_events += 1
 
@@ -987,7 +988,7 @@ class BigQueryUsageSource(Source):
             if event is None and missing_query_entry is None:
                 event = QueryEvent.from_entry(entry)
                 if not self._is_table_allowed(event.destinationTable):
-                    self.report.num_filtered_events += 1
+                    self.report.num_filtered_query_events += 1
                     continue
                 self.report.num_query_events += 1
                 wu = self._create_operation_aspect_work_unit(event)
@@ -999,7 +1000,7 @@ class BigQueryUsageSource(Source):
             if event is None and missing_query_entry_v2 is None:
                 event = QueryEvent.from_entry_v2(entry)
                 if not self._is_table_allowed(event.destinationTable):
-                    self.report.num_filtered_events += 1
+                    self.report.num_filtered_query_events += 1
                     continue
                 self.report.num_query_events += 1
                 wu = self._create_operation_aspect_work_unit(event)
@@ -1013,6 +1014,7 @@ class BigQueryUsageSource(Source):
                     f"Unable to parse {type(entry)}: missing read {missing_read_entry}, missing query {missing_query_entry}, missing v2 {missing_query_entry_v2} for {entry}",
                 )
             else:
+                logger.debug(f"Yielding {event} from log entries")
                 yield event
 
         logger.info(
@@ -1138,7 +1140,7 @@ class BigQueryUsageSource(Source):
                 logger.warning(f"Failed to process event {str(event.resource)}: {e}")
                 continue
 
-            if resource.is_temporary_table():
+            if resource.is_temporary_table(self.config.temp_table_dataset_prefix):
                 logger.debug(f"Dropping temporary table {resource}")
                 self.report.report_dropped(str(resource))
                 continue
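
The single num_filtered_events counter is gone; drops are now attributed to the event type that produced them. A rough sketch of the split, with plain dicts standing in for ReadEvent and QueryEvent:

from typing import Dict, List

# Plain-dict stand-ins for ReadEvent / QueryEvent; only what the sketch needs.
events: List[Dict[str, object]] = [
    {"kind": "read", "allowed": True},
    {"kind": "read", "allowed": False},
    {"kind": "query", "allowed": False},
]

num_filtered_read_events = 0
num_filtered_query_events = 0

for event in events:
    if not event["allowed"]:
        # Attribute the drop to the event type rather than one shared counter.
        if event["kind"] == "read":
            num_filtered_read_events += 1
        else:
            num_filtered_query_events += 1

assert (num_filtered_read_events, num_filtered_query_events) == (1, 1)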

View File

@@ -56,6 +56,10 @@ class BigQueryConfig(BaseTimeWindowConfig, SQLAlchemyConfig):
         description="Whether to read date sharded tables or time partitioned tables when extracting usage from exported audit logs.",
     )
     _credentials_path: Optional[str] = pydantic.PrivateAttr(None)
+    temp_table_dataset_prefix: str = pydantic.Field(
+        default="_",
+        description="If you create temp tables in a dataset with a particular prefix, use this config to set that dataset prefix. This supports workflows that predate BigQuery's native temp tables. Defaults to `_` because datasets that begin with an underscore are hidden by default: https://cloud.google.com/bigquery/docs/datasets#dataset-naming.",
+    )
     use_v2_audit_metadata: Optional[bool] = pydantic.Field(
         default=False, description="Whether to ingest logs using the v2 format."
     )
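
The new field is a plain pydantic declaration. A minimal sketch against a bare BaseModel (the real BigQueryConfig inherits from several base config classes not shown here):

import pydantic

class TempTablePrefixSketch(pydantic.BaseModel):  # stand-in; BigQueryConfig has many more fields
    temp_table_dataset_prefix: str = pydantic.Field(
        default="_",
        description="Dataset prefix that marks temp tables.",
    )

assert TempTablePrefixSketch().temp_table_dataset_prefix == "_"
assert TempTablePrefixSketch(temp_table_dataset_prefix="_tmp").temp_table_dataset_prefix == "_tmp"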

View File

@@ -116,6 +116,10 @@ class BigQueryUsageConfig(DatasetSourceConfigBase, BaseUsageConfig):
         description="BigQuery credential. Required if the GOOGLE_APPLICATION_CREDENTIALS environment variable is not set. See this example recipe for details.",
     )
     _credentials_path: Optional[str] = pydantic.PrivateAttr(None)
+    temp_table_dataset_prefix: str = pydantic.Field(
+        default="_",
+        description="If you create temp tables in a dataset with a particular prefix, use this config to set that dataset prefix. This supports workflows that predate BigQuery's native temp tables. Defaults to `_` because datasets that begin with an underscore are hidden by default: https://cloud.google.com/bigquery/docs/datasets#dataset-naming.",
+    )
 
     def __init__(self, **data: Any):
         super().__init__(**data)

View File

@@ -11,7 +11,8 @@ class BigQueryUsageSourceReport(SourceReport):
     dropped_table: Counter[str] = dataclasses.field(default_factory=collections.Counter)
     total_log_entries: Optional[int] = None
     num_read_events: Optional[int] = None
-    num_filtered_events: Optional[int] = None
+    num_filtered_read_events: Optional[int] = None
+    num_filtered_query_events: Optional[int] = None
     num_query_events: Optional[int] = None
     use_v2_audit_metadata: Optional[bool] = None
     log_page_size: Optional[int] = None
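
The report reshaping amounts to swapping one Optional[int] field for two. A sketch of the resulting shape, omitting the SourceReport machinery:

import collections
import dataclasses
from typing import Counter, Optional

@dataclasses.dataclass
class UsageReportSketch:  # stand-in for BigQueryUsageSourceReport minus SourceReport internals
    dropped_table: Counter[str] = dataclasses.field(default_factory=collections.Counter)
    total_log_entries: Optional[int] = None
    num_read_events: Optional[int] = None
    num_filtered_read_events: Optional[int] = None   # replaces num_filtered_events
    num_filtered_query_events: Optional[int] = None  # query-side drops now counted separately
    num_query_events: Optional[int] = None

report = UsageReportSketch()
report.num_filtered_read_events = 0
report.num_filtered_read_events += 1
assert report.num_filtered_read_events == 1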