fix(ingestion): Safeguard against empty values for profile ingestion (#3005)

This commit is contained in:
Dexter Lee 2021-08-02 10:42:40 -07:00 committed by GitHub
parent c982626867
commit 283f6376d1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 29 additions and 25 deletions

View File

@ -477,7 +477,7 @@ source:
profiling:
enabled: true
limit: 1000 # optional - max rows to profile
offset: 100 # optional - offset of first row to profile
offset: 0 # optional - offset of first row to profile
profile_pattern:
deny:
# Skip all tables ending with "_staging"

View File

@ -218,6 +218,7 @@ class DatahubGEProfiler:
column_profile.uniqueProportion = res["observed_value"]
elif exp == "expect_column_values_to_not_be_null":
column_profile.nullCount = res["unexpected_count"]
if "unexpected_percent" in res:
column_profile.nullProportion = res["unexpected_percent"] / 100
elif exp == "expect_column_values_to_not_match_regex":
# ignore; generally used for whitespace checks using regex r"^\s+|\s+$"
@ -233,6 +234,7 @@ class DatahubGEProfiler:
elif exp == "expect_column_stdev_to_be_between":
column_profile.stdev = str(res["observed_value"])
elif exp == "expect_column_quantile_values_to_be_between":
if "observed_value" in res:
column_profile.quantiles = [
QuantileClass(quantile=str(quantile), value=str(value))
for quantile, value in zip(
@ -245,6 +247,7 @@ class DatahubGEProfiler:
str(v) for v in res["partial_unexpected_list"]
]
elif exp == "expect_column_kl_divergence_to_be_less_than":
if "details" in res and "observed_partition" in res["details"]:
partition = res["details"]["observed_partition"]
column_profile.histogram = HistogramClass(
[str(v) for v in partition["bins"]],
@ -255,6 +258,7 @@ class DatahubGEProfiler:
],
)
elif exp == "expect_column_distinct_values_to_be_in_set":
if "details" in res and "value_counts" in res["details"]:
# This can be used to produce a bar chart since it includes values and frequencies.
# As such, it is handled differently from expect_column_values_to_be_in_set, which
# is nonexhaustive.