fix(ingestion): Safeguard against empty values for profile ingestion (#3005)

This commit is contained in:
Dexter Lee 2021-08-02 10:42:40 -07:00 committed by GitHub
parent c982626867
commit 283f6376d1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 29 additions and 25 deletions

View File

@@ -477,7 +477,7 @@ source:
profiling: profiling:
enabled: true enabled: true
limit: 1000 # optional - max rows to profile limit: 1000 # optional - max rows to profile
offset: 100 # optional - offset of first row to profile offset: 0 # optional - offset of first row to profile
profile_pattern: profile_pattern:
deny: deny:
# Skip all tables ending with "_staging" # Skip all tables ending with "_staging"

View File

@@ -218,7 +218,8 @@ class DatahubGEProfiler:
column_profile.uniqueProportion = res["observed_value"] column_profile.uniqueProportion = res["observed_value"]
elif exp == "expect_column_values_to_not_be_null": elif exp == "expect_column_values_to_not_be_null":
column_profile.nullCount = res["unexpected_count"] column_profile.nullCount = res["unexpected_count"]
column_profile.nullProportion = res["unexpected_percent"] / 100 if "unexpected_percent" in res:
column_profile.nullProportion = res["unexpected_percent"] / 100
elif exp == "expect_column_values_to_not_match_regex": elif exp == "expect_column_values_to_not_match_regex":
# ignore; generally used for whitespace checks using regex r"^\s+|\s+$" # ignore; generally used for whitespace checks using regex r"^\s+|\s+$"
pass pass
@@ -233,35 +234,38 @@ class DatahubGEProfiler:
elif exp == "expect_column_stdev_to_be_between": elif exp == "expect_column_stdev_to_be_between":
column_profile.stdev = str(res["observed_value"]) column_profile.stdev = str(res["observed_value"])
elif exp == "expect_column_quantile_values_to_be_between": elif exp == "expect_column_quantile_values_to_be_between":
column_profile.quantiles = [ if "observed_value" in res:
QuantileClass(quantile=str(quantile), value=str(value)) column_profile.quantiles = [
for quantile, value in zip( QuantileClass(quantile=str(quantile), value=str(value))
res["observed_value"]["quantiles"], for quantile, value in zip(
res["observed_value"]["values"], res["observed_value"]["quantiles"],
) res["observed_value"]["values"],
] )
]
elif exp == "expect_column_values_to_be_in_set": elif exp == "expect_column_values_to_be_in_set":
column_profile.sampleValues = [ column_profile.sampleValues = [
str(v) for v in res["partial_unexpected_list"] str(v) for v in res["partial_unexpected_list"]
] ]
elif exp == "expect_column_kl_divergence_to_be_less_than": elif exp == "expect_column_kl_divergence_to_be_less_than":
partition = res["details"]["observed_partition"] if "details" in res and "observed_partition" in res["details"]:
column_profile.histogram = HistogramClass( partition = res["details"]["observed_partition"]
[str(v) for v in partition["bins"]], column_profile.histogram = HistogramClass(
[ [str(v) for v in partition["bins"]],
partition["tail_weights"][0], [
*partition["weights"], partition["tail_weights"][0],
partition["tail_weights"][1], *partition["weights"],
], partition["tail_weights"][1],
) ],
)
elif exp == "expect_column_distinct_values_to_be_in_set": elif exp == "expect_column_distinct_values_to_be_in_set":
# This can be used to produce a bar chart since it includes values and frequencies. if "details" in res and "value_counts" in res["details"]:
# As such, it is handled differently from expect_column_values_to_be_in_set, which # This can be used to produce a bar chart since it includes values and frequencies.
# is nonexhaustive. # As such, it is handled differently from expect_column_values_to_be_in_set, which
column_profile.distinctValueFrequencies = [ # is nonexhaustive.
ValueFrequencyClass(value=str(value), frequency=count) column_profile.distinctValueFrequencies = [
for value, count in res["details"]["value_counts"].items() ValueFrequencyClass(value=str(value), frequency=count)
] for value, count in res["details"]["value_counts"].items()
]
elif exp == "expect_column_values_to_be_in_type_list": elif exp == "expect_column_values_to_be_in_type_list":
# ignore; we already know the types for each column via ingestion # ignore; we already know the types for each column via ingestion
pass pass