mirror of
https://github.com/datahub-project/datahub.git
synced 2025-09-07 16:16:51 +00:00
fix(ingestion): Safeguard against empty values for profile ingestion (#3005)
This commit is contained in:
parent
c982626867
commit
283f6376d1
@ -477,7 +477,7 @@ source:
|
|||||||
profiling:
|
profiling:
|
||||||
enabled: true
|
enabled: true
|
||||||
limit: 1000 # optional - max rows to profile
|
limit: 1000 # optional - max rows to profile
|
||||||
offset: 100 # optional - offset of first row to profile
|
offset: 0 # optional - offset of first row to profile
|
||||||
profile_pattern:
|
profile_pattern:
|
||||||
deny:
|
deny:
|
||||||
# Skip all tables ending with "_staging"
|
# Skip all tables ending with "_staging"
|
||||||
|
@ -218,7 +218,8 @@ class DatahubGEProfiler:
|
|||||||
column_profile.uniqueProportion = res["observed_value"]
|
column_profile.uniqueProportion = res["observed_value"]
|
||||||
elif exp == "expect_column_values_to_not_be_null":
|
elif exp == "expect_column_values_to_not_be_null":
|
||||||
column_profile.nullCount = res["unexpected_count"]
|
column_profile.nullCount = res["unexpected_count"]
|
||||||
column_profile.nullProportion = res["unexpected_percent"] / 100
|
if "unexpected_percent" in res:
|
||||||
|
column_profile.nullProportion = res["unexpected_percent"] / 100
|
||||||
elif exp == "expect_column_values_to_not_match_regex":
|
elif exp == "expect_column_values_to_not_match_regex":
|
||||||
# ignore; generally used for whitespace checks using regex r"^\s+|\s+$"
|
# ignore; generally used for whitespace checks using regex r"^\s+|\s+$"
|
||||||
pass
|
pass
|
||||||
@ -233,35 +234,38 @@ class DatahubGEProfiler:
|
|||||||
elif exp == "expect_column_stdev_to_be_between":
|
elif exp == "expect_column_stdev_to_be_between":
|
||||||
column_profile.stdev = str(res["observed_value"])
|
column_profile.stdev = str(res["observed_value"])
|
||||||
elif exp == "expect_column_quantile_values_to_be_between":
|
elif exp == "expect_column_quantile_values_to_be_between":
|
||||||
column_profile.quantiles = [
|
if "observed_value" in res:
|
||||||
QuantileClass(quantile=str(quantile), value=str(value))
|
column_profile.quantiles = [
|
||||||
for quantile, value in zip(
|
QuantileClass(quantile=str(quantile), value=str(value))
|
||||||
res["observed_value"]["quantiles"],
|
for quantile, value in zip(
|
||||||
res["observed_value"]["values"],
|
res["observed_value"]["quantiles"],
|
||||||
)
|
res["observed_value"]["values"],
|
||||||
]
|
)
|
||||||
|
]
|
||||||
elif exp == "expect_column_values_to_be_in_set":
|
elif exp == "expect_column_values_to_be_in_set":
|
||||||
column_profile.sampleValues = [
|
column_profile.sampleValues = [
|
||||||
str(v) for v in res["partial_unexpected_list"]
|
str(v) for v in res["partial_unexpected_list"]
|
||||||
]
|
]
|
||||||
elif exp == "expect_column_kl_divergence_to_be_less_than":
|
elif exp == "expect_column_kl_divergence_to_be_less_than":
|
||||||
partition = res["details"]["observed_partition"]
|
if "details" in res and "observed_partition" in res["details"]:
|
||||||
column_profile.histogram = HistogramClass(
|
partition = res["details"]["observed_partition"]
|
||||||
[str(v) for v in partition["bins"]],
|
column_profile.histogram = HistogramClass(
|
||||||
[
|
[str(v) for v in partition["bins"]],
|
||||||
partition["tail_weights"][0],
|
[
|
||||||
*partition["weights"],
|
partition["tail_weights"][0],
|
||||||
partition["tail_weights"][1],
|
*partition["weights"],
|
||||||
],
|
partition["tail_weights"][1],
|
||||||
)
|
],
|
||||||
|
)
|
||||||
elif exp == "expect_column_distinct_values_to_be_in_set":
|
elif exp == "expect_column_distinct_values_to_be_in_set":
|
||||||
# This can be used to produce a bar chart since it includes values and frequencies.
|
if "details" in res and "value_counts" in res["details"]:
|
||||||
# As such, it is handled differently from expect_column_values_to_be_in_set, which
|
# This can be used to produce a bar chart since it includes values and frequencies.
|
||||||
# is nonexhaustive.
|
# As such, it is handled differently from expect_column_values_to_be_in_set, which
|
||||||
column_profile.distinctValueFrequencies = [
|
# is nonexhaustive.
|
||||||
ValueFrequencyClass(value=str(value), frequency=count)
|
column_profile.distinctValueFrequencies = [
|
||||||
for value, count in res["details"]["value_counts"].items()
|
ValueFrequencyClass(value=str(value), frequency=count)
|
||||||
]
|
for value, count in res["details"]["value_counts"].items()
|
||||||
|
]
|
||||||
elif exp == "expect_column_values_to_be_in_type_list":
|
elif exp == "expect_column_values_to_be_in_type_list":
|
||||||
# ignore; we already know the types for each column via ingestion
|
# ignore; we already know the types for each column via ingestion
|
||||||
pass
|
pass
|
||||||
|
Loading…
x
Reference in New Issue
Block a user