mirror of
https://github.com/datahub-project/datahub.git
synced 2025-09-07 16:16:51 +00:00
fix(ingestion): Safeguard against empty values for profile ingestion (#3005)
This commit is contained in:
parent
c982626867
commit
283f6376d1
@ -477,7 +477,7 @@ source:
|
||||
profiling:
|
||||
enabled: true
|
||||
limit: 1000 # optional - max rows to profile
|
||||
-offset: 100 # optional - offset of first row to profile
|
||||
+offset: 0 # optional - offset of first row to profile
|
||||
profile_pattern:
|
||||
deny:
|
||||
# Skip all tables ending with "_staging"
|
||||
|
@ -218,6 +218,7 @@ class DatahubGEProfiler:
|
||||
column_profile.uniqueProportion = res["observed_value"]
|
||||
elif exp == "expect_column_values_to_not_be_null":
|
||||
column_profile.nullCount = res["unexpected_count"]
|
||||
if "unexpected_percent" in res:
|
||||
column_profile.nullProportion = res["unexpected_percent"] / 100
|
||||
elif exp == "expect_column_values_to_not_match_regex":
|
||||
# ignore; generally used for whitespace checks using regex r"^\s+|\s+$"
|
||||
@ -233,6 +234,7 @@ class DatahubGEProfiler:
|
||||
elif exp == "expect_column_stdev_to_be_between":
|
||||
column_profile.stdev = str(res["observed_value"])
|
||||
elif exp == "expect_column_quantile_values_to_be_between":
|
||||
if "observed_value" in res:
|
||||
column_profile.quantiles = [
|
||||
QuantileClass(quantile=str(quantile), value=str(value))
|
||||
for quantile, value in zip(
|
||||
@ -245,6 +247,7 @@ class DatahubGEProfiler:
|
||||
str(v) for v in res["partial_unexpected_list"]
|
||||
]
|
||||
elif exp == "expect_column_kl_divergence_to_be_less_than":
|
||||
if "details" in res and "observed_partition" in res["details"]:
|
||||
partition = res["details"]["observed_partition"]
|
||||
column_profile.histogram = HistogramClass(
|
||||
[str(v) for v in partition["bins"]],
|
||||
@ -255,6 +258,7 @@ class DatahubGEProfiler:
|
||||
],
|
||||
)
|
||||
elif exp == "expect_column_distinct_values_to_be_in_set":
|
||||
if "details" in res and "value_counts" in res["details"]:
|
||||
# This can be used to produce a bar chart since it includes values and frequencies.
|
||||
# As such, it is handled differently from expect_column_values_to_be_in_set, which
|
||||
# is nonexhaustive.
|
||||
|
Loading…
x
Reference in New Issue
Block a user