Fix #1618: Fix usage query parsing (#1619)

This commit is contained in:
Sriharsha Chintalapani 2021-12-07 23:53:53 -08:00 committed by GitHub
parent 67433f559c
commit f33a08377c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 25 additions and 22 deletions

View File

@ -22,7 +22,8 @@
"bulk_sink": { "bulk_sink": {
"type": "metadata-usage", "type": "metadata-usage",
"config": { "config": {
"filename": "/tmp/sample_usage" "filename": "/tmp/sample_usage",
"service_name": "bigquery_gcp"
} }
}, },
"metadata_server": { "metadata_server": {

View File

@ -33,6 +33,7 @@ logger = logging.getLogger(__name__)
class MetadataUsageSinkConfig(ConfigModel): class MetadataUsageSinkConfig(ConfigModel):
filename: str filename: str
service_name: str
class MetadataUsageBulkSink(BulkSink): class MetadataUsageBulkSink(BulkSink):
@ -51,17 +52,9 @@ class MetadataUsageBulkSink(BulkSink):
self.file_handler = open(self.config.filename, "r") self.file_handler = open(self.config.filename, "r")
self.metadata = OpenMetadata(self.metadata_config) self.metadata = OpenMetadata(self.metadata_config)
self.status = BulkSinkStatus() self.status = BulkSinkStatus()
self.tables_dict = {}
self.table_join_dict = {} self.table_join_dict = {}
self.__map_tables()
self.today = datetime.today().strftime("%Y-%m-%d") self.today = datetime.today().strftime("%Y-%m-%d")
def __map_tables(self):
table_entities = self.metadata.list_entities(entity=Table)
for table in table_entities.entities:
if table.name.__root__ not in self.tables_dict.keys():
self.tables_dict[table.name.__root__] = table
@classmethod @classmethod
def create( def create(
cls, config_dict: dict, metadata_config_dict: dict, ctx: WorkflowContext cls, config_dict: dict, metadata_config_dict: dict, ctx: WorkflowContext
@ -82,8 +75,10 @@ class MetadataUsageBulkSink(BulkSink):
table_usage = TableUsageCount(**json.loads(record)) table_usage = TableUsageCount(**json.loads(record))
if "." in table_usage.table: if "." in table_usage.table:
table_usage.table = table_usage.table.split(".")[1] table_usage.table = table_usage.table.split(".")[1]
if table_usage.table in self.tables_dict: table_entity = self.__get_table_entity(
table_entity = self.tables_dict[table_usage.table] table_usage.database, table_usage.table
)
if table_entity is not None:
table_usage_request = TableUsageRequest( table_usage_request = TableUsageRequest(
date=table_usage.date, count=table_usage.count date=table_usage.date, count=table_usage.count
) )
@ -96,7 +91,6 @@ class MetadataUsageBulkSink(BulkSink):
table_usage.table, err table_usage.table, err
) )
) )
table_join_request = self.__get_table_joins(table_usage) table_join_request = self.__get_table_joins(table_usage)
logger.debug("table join request {}".format(table_join_request)) logger.debug("table join request {}".format(table_join_request))
try: try:
@ -130,20 +124,21 @@ class MetadataUsageBulkSink(BulkSink):
def __get_table_joins(self, table_usage): def __get_table_joins(self, table_usage):
table_joins: TableJoins = TableJoins(columnJoins=[], startDate=table_usage.date) table_joins: TableJoins = TableJoins(columnJoins=[], startDate=table_usage.date)
column_joins_dict = {} column_joins_dict = {}
joined_with = {}
for column_join in table_usage.joins: for column_join in table_usage.joins:
joined_with = {}
if column_join.table_column is None or len(column_join.joined_with) == 0: if column_join.table_column is None or len(column_join.joined_with) == 0:
continue continue
if column_join.table_column.column in column_joins_dict.keys(): if column_join.table_column.column in column_joins_dict.keys():
joined_with = column_joins_dict[column_join.table_column.column] joined_with = column_joins_dict[column_join.table_column.column]
else: else:
column_joins_dict[column_join.table_column.column] = {} column_joins_dict[column_join.table_column.column] = {}
main_column_fqdn = self.__get_column_fqdn(
main_column_fqdn = self.__get_column_fqdn(column_join.table_column) table_usage.database, column_join.table_column
)
for column in column_join.joined_with: for column in column_join.joined_with:
joined_column_fqdn = self.__get_column_fqdn(column) joined_column_fqdn = self.__get_column_fqdn(
table_usage.database, column
)
if joined_column_fqdn in joined_with.keys(): if joined_column_fqdn in joined_with.keys():
column_joined_with = joined_with[joined_column_fqdn] column_joined_with = joined_with[joined_column_fqdn]
column_joined_with.joinCount += 1 column_joined_with.joinCount += 1
@ -153,7 +148,9 @@ class MetadataUsageBulkSink(BulkSink):
fullyQualifiedName=joined_column_fqdn, joinCount=1 fullyQualifiedName=joined_column_fqdn, joinCount=1
) )
else: else:
logger.info("Skipping join columns for {}".format(column)) logger.info(
f"Skipping join columns for {column} {joined_column_fqdn}"
)
column_joins_dict[column_join.table_column.column] = joined_with column_joins_dict[column_join.table_column.column] = joined_with
for key, value in column_joins_dict.items(): for key, value in column_joins_dict.items():
@ -162,14 +159,19 @@ class MetadataUsageBulkSink(BulkSink):
) )
return table_joins return table_joins
def __get_column_fqdn(self, table_column: TableColumn): def __get_column_fqdn(self, database: str, table_column: TableColumn):
if table_column.table not in self.tables_dict: table_entity = self.__get_table_entity(database, table_column.table)
if table_entity is None:
return None return None
table_entity = self.tables_dict[table_column.table]
for tbl_column in table_entity.columns: for tbl_column in table_entity.columns:
if table_column.column.lower() == tbl_column.name.__root__.lower(): if table_column.column.lower() == tbl_column.name.__root__.lower():
return tbl_column.fullyQualifiedName.__root__ return tbl_column.fullyQualifiedName.__root__
def __get_table_entity(self, database_name: str, table_name: str) -> Table:
table_fqn = f"{self.config.service_name}.{database_name}.{table_name}"
table_entity = self.metadata.get_by_name(Table, fqdn=table_fqn)
return table_entity
def get_status(self): def get_status(self):
return self.status return self.status