feat(ingest): great-expectations - add more logs (#4832)

Authored by mayurinehate on 2022-05-05 21:55:58 +05:30; committed by GitHub.
parent 3775e79d47
commit b2c82dcaa3

@@ -131,7 +131,7 @@ class DataHubValidationAction(ValidationAction):
         datasets = self.get_dataset_partitions(batch_identifier, data_asset)
         if len(datasets) == 0 or datasets[0]["dataset_urn"] is None:
-            logger.info("Metadata not sent to datahub. No datasets found.")
+            warn("Metadata not sent to datahub. No datasets found.")
             return {"datahub_notification_result": "none required"}
@@ -143,7 +143,15 @@ class DataHubValidationAction(ValidationAction):
             datasets,
         )
         logger.info("Sending metadata to datahub ...")
+        logger.info("Dataset URN - {urn}".format(urn=datasets[0]["dataset_urn"]))
+        for assertion in assertions:
+            logger.info(
+                "Assertion URN - {urn}".format(urn=assertion["assertionUrn"])
+            )
         # Construct a MetadataChangeProposalWrapper object.
         assertion_info_mcp = MetadataChangeProposalWrapper(
             entityType="assertion",
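The hunk above cuts off mid-construction. For orientation, a rough, self-contained sketch of building and emitting an assertion-scoped MCP with the emitter API of this era; the URNs, server address, and aspect are placeholders, not the action's real `AssertionInfo` payload:

```python
# Rough sketch only: URNs, gms_server, and the aspect are placeholder values.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import ChangeTypeClass, DataPlatformInstanceClass

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")

mcp = MetadataChangeProposalWrapper(
    entityType="assertion",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn="urn:li:assertion:example-assertion-id",
    aspectName="dataPlatformInstance",
    aspect=DataPlatformInstanceClass(platform="urn:li:dataPlatform:great-expectations"),
)
emitter.emit_mcp(mcp)
```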
@@ -175,7 +183,7 @@ class DataHubValidationAction(ValidationAction):
                 # Emit Result! (timeseries aspect)
                 emitter.emit_mcp(dataset_assertionResult_mcp)
+            logger.info("Metadata sent to datahub.")
             result = "DataHub notification succeeded"
         except Exception as e:
             result = "DataHub notification failed"
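The new success log, like the URN logs above, only appears if the process running the checkpoint configures logging; Great Expectations does not typically enable INFO output for third-party actions on its own. As an illustration, a minimal setup:

```python
# Minimal logging setup so the INFO-level messages above actually appear.
import logging

logging.basicConfig(level=logging.INFO)
```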
@@ -245,6 +253,11 @@ class DataHubValidationAction(ValidationAction):
                     }
                 )
             )
+            logger.debug(
+                "GE expectation_suite_name - {name}, expectation_type - {type}, Assertion URN - {urn}".format(
+                    name=expectation_suite_name, type=expectation_type, urn=assertionUrn
+                )
+            )
             assertionInfo: AssertionInfo = self.get_assertion_info(
                 expectation_type,
                 kwargs,
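The `logger.debug` calls added in this commit stay silent under the usual INFO-level setup. Assuming the module logger is created with `logging.getLogger(__name__)` (the convention elsewhere in datahub, not shown in this diff), DEBUG can be scoped to just the datahub namespace:

```python
import logging

# Assumes the module logger is logging.getLogger(__name__); scoping DEBUG to
# the "datahub" namespace keeps other libraries quiet.
logging.basicConfig(level=logging.INFO)
logging.getLogger("datahub").setLevel(logging.DEBUG)
```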
@@ -541,6 +554,8 @@ class DataHubValidationAction(ValidationAction):
     def get_dataset_partitions(self, batch_identifier, data_asset):
         dataset_partitions = []
+        logger.debug("Finding datasets being validated")
+        # for now, we support only v3-api and sqlalchemy execution engine
         if isinstance(data_asset, Validator) and isinstance(
             data_asset.execution_engine, SqlAlchemyExecutionEngine
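The new comment documents the guard on the two lines that follow. A hypothetical standalone helper (not in the diff) spelling out the same check, with the GE imports this file would need:

```python
# Hypothetical helper expressing the same guard as the isinstance check above.
from great_expectations.execution_engine.sqlalchemy_execution_engine import (
    SqlAlchemyExecutionEngine,
)
from great_expectations.validator.validator import Validator


def is_supported_data_asset(data_asset) -> bool:
    # Only the v3-api Validator backed by a SQLAlchemy engine is handled;
    # everything else reaches the warn() branches further down.
    return isinstance(data_asset, Validator) and isinstance(
        data_asset.execution_engine, SqlAlchemyExecutionEngine
    )
```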
@@ -645,13 +660,16 @@ class DataHubValidationAction(ValidationAction):
                 )
             else:
                 warn(
-                    f"DataHubValidationAction does not recognize this GE batch spec type- {type(ge_batch_spec)}."
+                    "DataHubValidationAction does not recognize this GE batch spec type- {batch_spec_type}.".format(
+                        batch_spec_type=type(ge_batch_spec)
+                    )
                 )
         else:
             # TODO - v2-spec - SqlAlchemyDataset support
             warn(
-                f"DataHubValidationAction does not recognize this GE data asset type - {type(data_asset)}. \
-                This is either using v2-api or execution engine other than sqlalchemy."
+                "DataHubValidationAction does not recognize this GE data asset type - {asset_type}. This is either using v2-api or execution engine other than sqlalchemy.".format(
+                    asset_type=type(data_asset)
+                )
             )
         return dataset_partitions
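The motivation for these rewrites: a backslash continuation inside a single-quoted string literal keeps the next source line's leading indentation as part of the string, so the old f-strings emitted warnings with a run of embedded spaces. A small reproduction:

```python
# Reproduction of the whitespace bug the rewrite fixes: the backslash
# continuation keeps the next line's indentation inside the message.
asset_type = type([])
old_style = f"does not recognize this GE data asset type - {asset_type}. \
                This is either using v2-api."
new_style = "does not recognize this GE data asset type - {asset_type}. This is either using v2-api.".format(
    asset_type=asset_type
)
print(old_style)  # ... type - <class 'list'>.                 This is either using v2-api.
print(new_style)  # evenly spaced single-line message
```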
@@ -695,8 +713,9 @@ def make_dataset_urn_from_sqlalchemy_uri(
     elif data_platform in ["trino", "snowflake"]:
         if schema_name is None or url_instance.database is None:
             warn(
-                f"DataHubValidationAction failed to locate schema name and/or database name \
-                for {data_platform}."
+                "DataHubValidationAction failed to locate schema name and/or database name for {data_platform}.".format(
+                    data_platform=data_platform
+                )
             )
             return None
         # If data platform is snowflake, we artificially lowercase the Database name.
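For orientation, `url_instance` comes from parsing the SQLAlchemy connection URI, so `.host` and `.database` can legitimately be `None`. A quick illustration with a made-up URI:

```python
# Made-up URI, just to show where url_instance's fields come from.
from sqlalchemy.engine.url import make_url

url_instance = make_url("snowflake://user:password@my_account/my_db")
print(url_instance.host)      # my_account
print(url_instance.database)  # my_db
```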
@@ -711,8 +730,9 @@ def make_dataset_urn_from_sqlalchemy_uri(
     elif data_platform == "bigquery":
         if url_instance.host is None or url_instance.database is None:
             warn(
-                f"DataHubValidationAction failed to locate host and/or database name for \
-                {data_platform}. "
+                "DataHubValidationAction failed to locate host and/or database name for {data_platform}. ".format(
+                    data_platform=data_platform
+                )
             )
             return None
         schema_name = "{}.{}".format(url_instance.host, url_instance.database)
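Here `schema_name` folds the BigQuery project (carried in the URL host) and the database into one dotted prefix. An illustrative sketch of how such parts end up in a dataset URN, using datahub's public builder (placeholder values; the real function assembles its URN inline):

```python
# Placeholder values; the real function assembles its URN inline.
from datahub.emitter.mce_builder import make_dataset_urn

host, database, table = "my-project", "my_dataset", "my_table"
schema_name = "{}.{}".format(host, database)
print(make_dataset_urn(platform="bigquery", name=f"{schema_name}.{table}", env="PROD"))
# urn:li:dataset:(urn:li:dataPlatform:bigquery,my-project.my_dataset.my_table,PROD)
```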