fix(ingestion): Safeguard against empty values for profile ingestion (#3005)

2025-12-18 05:26:02 +00:00 · 2021-08-02 10:42:40 -07:00 · 2021-08-02 10:42:40 -07:00 · 283f6376d1
commit 283f6376d1
parent c982626867
2 changed files with 29 additions and 25 deletions
--- a/metadata-ingestion/README.md
+++ b/metadata-ingestion/README.md
@@ -477,7 +477,7 @@ source:
    profiling:
      enabled: true
      limit: 1000 # optional - max rows to profile
-      offset: 100 # optional - offset of first row to profile
+      offset: 0 # optional - offset of first row to profile
    profile_pattern:
      deny:
        # Skip all tables ending with "_staging"
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -218,7 +218,8 @@ class DatahubGEProfiler:
                column_profile.uniqueProportion = res["observed_value"]
            elif exp == "expect_column_values_to_not_be_null":
                column_profile.nullCount = res["unexpected_count"]
-                column_profile.nullProportion = res["unexpected_percent"] / 100
+                if "unexpected_percent" in res:
                    column_profile.nullProportion = res["unexpected_percent"] / 100
            elif exp == "expect_column_values_to_not_match_regex":
                # ignore; generally used for whitespace checks using regex r"^\s+|\s+$"
                pass
@@ -233,35 +234,38 @@ class DatahubGEProfiler:
            elif exp == "expect_column_stdev_to_be_between":
                column_profile.stdev = str(res["observed_value"])
            elif exp == "expect_column_quantile_values_to_be_between":
-                column_profile.quantiles = [
+                if "observed_value" in res:
-                    QuantileClass(quantile=str(quantile), value=str(value))
+                    column_profile.quantiles = [
-                    for quantile, value in zip(
+                        QuantileClass(quantile=str(quantile), value=str(value))
-                        res["observed_value"]["quantiles"],
+                        for quantile, value in zip(
-                        res["observed_value"]["values"],
+                            res["observed_value"]["quantiles"],
-                    )
+                            res["observed_value"]["values"],
-                ]
+                        )
                    ]
            elif exp == "expect_column_values_to_be_in_set":
                column_profile.sampleValues = [
                    str(v) for v in res["partial_unexpected_list"]
                ]
            elif exp == "expect_column_kl_divergence_to_be_less_than":
-                partition = res["details"]["observed_partition"]
+                if "details" in res and "observed_partition" in res["details"]:
-                column_profile.histogram = HistogramClass(
+                    partition = res["details"]["observed_partition"]
-                    [str(v) for v in partition["bins"]],
+                    column_profile.histogram = HistogramClass(
-                    [
+                        [str(v) for v in partition["bins"]],
-                        partition["tail_weights"][0],
+                        [
-                        *partition["weights"],
+                            partition["tail_weights"][0],
-                        partition["tail_weights"][1],
+                            *partition["weights"],
-                    ],
+                            partition["tail_weights"][1],
-                )
+                        ],
                    )
            elif exp == "expect_column_distinct_values_to_be_in_set":
-                # This can be used to produce a bar chart since it includes values and frequencies.
+                if "details" in res and "value_counts" in res["details"]:
-                # As such, it is handled differently from expect_column_values_to_be_in_set, which
+                    # This can be used to produce a bar chart since it includes values and frequencies.
-                # is nonexhaustive.
+                    # As such, it is handled differently from expect_column_values_to_be_in_set, which
-                column_profile.distinctValueFrequencies = [
+                    # is nonexhaustive.
-                    ValueFrequencyClass(value=str(value), frequency=count)
+                    column_profile.distinctValueFrequencies = [
-                    for value, count in res["details"]["value_counts"].items()
+                        ValueFrequencyClass(value=str(value), frequency=count)
-                ]
+                        for value, count in res["details"]["value_counts"].items()
                    ]
            elif exp == "expect_column_values_to_be_in_type_list":
                # ignore; we already know the types for each column via ingestion
                pass