From fc9b64b52ad0441828fa90166555fb65fcf76eb0 Mon Sep 17 00:00:00 2001 From: Teddy Date: Tue, 28 Mar 2023 12:59:45 +0200 Subject: [PATCH] Fix histogram labels (#10797) * fix: added logic to format bin labels for histogram metric * fix: histogram labels * fix: added types for arguments --- .../profiler/metrics/hybrid/histogram.py | 33 ++++++++-- ingestion/src/metadata/utils/helpers.py | 16 +++++ ingestion/tests/unit/profiler/test_utils.py | 60 +++++++++++++++++++ .../DataDistributionHistogram.component.tsx | 1 - 4 files changed, 105 insertions(+), 5 deletions(-) create mode 100644 ingestion/tests/unit/profiler/test_utils.py diff --git a/ingestion/src/metadata/profiler/metrics/hybrid/histogram.py b/ingestion/src/metadata/profiler/metrics/hybrid/histogram.py index 8eebbc12112..922318cccb7 100644 --- a/ingestion/src/metadata/profiler/metrics/hybrid/histogram.py +++ b/ingestion/src/metadata/profiler/metrics/hybrid/histogram.py @@ -24,6 +24,7 @@ from metadata.profiler.metrics.static.count import Count from metadata.profiler.metrics.static.max import Max from metadata.profiler.metrics.static.min import Min from metadata.profiler.orm.registry import is_quantifiable +from metadata.utils.helpers import format_large_string_numbers from metadata.utils.logger import profiler_logger logger = profiler_logger() @@ -74,6 +75,27 @@ class Histogram(HybridMetric): float(res_max), ) # Decimal to float + @staticmethod + def _format_bin_labels( + lower_bin: Union[float, int], upper_bin: Optional[Union[float, int]] = None + ) -> str: + """format bin labels + + Args: + lower_bin: lower bin + upper_bin: upper bin. Defaults to None. + + Returns: + str: formatted bin labels + """ + if lower_bin is None: + formatted_lower_bin = "null" + else: + formatted_lower_bin = format_large_string_numbers(lower_bin) + if upper_bin is None: + return f"{formatted_lower_bin} and up" + return f"{formatted_lower_bin} to {format_large_string_numbers(upper_bin)}" + def fn( self, sample: Optional[DeclarativeMeta], @@ -120,14 +142,17 @@ class Histogram(HybridMetric): condition = and_(col >= starting_bin_bound) case_stmts.append( func.count(case([(condition, col)])).label( - f"{starting_bin_bound:.2f} and up" + self._format_bin_labels(starting_bin_bound) ) ) continue case_stmts.append( func.count(case([(condition, col)])).label( - f"{starting_bin_bound:.2f} to {ending_bin_bound:.2f}" + self._format_bin_labels( + starting_bin_bound, + ending_bin_bound, + ) ) ) starting_bin_bound = ending_bin_bound @@ -176,9 +201,9 @@ class Histogram(HybridMetric): bins = list(np.arange(num_bins) * bind_width + res_min) bins_label = [ - f"{bins[i]:.2f} to {bins[i+1]:.2f}" + self._format_bin_labels(bins[i], bins[i + 1]) if i < len(bins) - 1 - else f"{bins[i]:.2f} and up" + else self._format_bin_labels(bins[i]) for i in range(len(bins)) ] diff --git a/ingestion/src/metadata/utils/helpers.py b/ingestion/src/metadata/utils/helpers.py index b52b9ba71f7..15ade5f1fe5 100644 --- a/ingestion/src/metadata/utils/helpers.py +++ b/ingestion/src/metadata/utils/helpers.py @@ -18,6 +18,7 @@ from __future__ import annotations import re from datetime import datetime, timedelta from functools import wraps +from math import floor, log from time import perf_counter from typing import Any, Dict, Iterable, List, Optional, Tuple, Union @@ -327,3 +328,18 @@ def get_entity_tier_from_tags(tags: list[TagLabel]) -> Optional[str]: ), None, ) + + +def format_large_string_numbers(number: Union[float, int]) -> str: + """Format large string number to a human readable format. + (e.g. 1,000,000 -> 1M, 1,000,000,000 -> 1B, etc) + + Args: + number: number + """ + if number == 0: + return "0" + units = ["", "K", "M", "B", "T"] + constant_k = 1000.0 + magnitude = int(floor(log(abs(number), constant_k))) + return f"{number / constant_k**magnitude:.2f}{units[magnitude]}" diff --git a/ingestion/tests/unit/profiler/test_utils.py b/ingestion/tests/unit/profiler/test_utils.py new file mode 100644 index 00000000000..03a541d0b15 --- /dev/null +++ b/ingestion/tests/unit/profiler/test_utils.py @@ -0,0 +1,60 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Tests utils function for the profiler +""" + +from unittest import TestCase + +from metadata.profiler.metrics.hybrid.histogram import Histogram + + +class TestHistogramUtils(TestCase): + @classmethod + def setUpClass(cls): + cls.histogram = Histogram() + + def test_histogram_label_formatter_positive(self): + """test label formatter for histogram""" + formatted_label = self.histogram._format_bin_labels(18927, 23456) + assert formatted_label == "18.93K to 23.46K" + + formatted_label = self.histogram._format_bin_labels(18927) + assert formatted_label == "18.93K and up" + + def test_histogram_label_formatter_negative(self): + """test label formatter for histogram for negative numbers""" + formatted_label = self.histogram._format_bin_labels(-18927, -23456) + assert formatted_label == "-18.93K to -23.46K" + + formatted_label = self.histogram._format_bin_labels(-18927) + assert formatted_label == "-18.93K and up" + + def test_histogram_label_formatter_none(self): + """test label formatter for histogram for None""" + formatted_label = self.histogram._format_bin_labels(None) + assert formatted_label == "null and up" + + def test_histogram_label_formatter_zero(self): + """test label formatter for histogram with zero""" + formatted_label = self.histogram._format_bin_labels(0) + assert formatted_label == "0 and up" + + def test_histogram_label_formatter_nines(self): + """test label formatter for histogram for nines""" + formatted_label = self.histogram._format_bin_labels(99999999) + assert formatted_label == "100.00M and up" + + def test_histogram_label_formatter_floats(self): + """test label formatter for histogram for floats""" + formatted_label = self.histogram._format_bin_labels(167893.98542, 194993.98542) + assert formatted_label == "167.89K to 194.99K" diff --git a/openmetadata-ui/src/main/resources/ui/src/components/Chart/DataDistributionHistogram.component.tsx b/openmetadata-ui/src/main/resources/ui/src/components/Chart/DataDistributionHistogram.component.tsx index b05f514b56d..03655e9bac8 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/Chart/DataDistributionHistogram.component.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/components/Chart/DataDistributionHistogram.component.tsx @@ -103,7 +103,6 @@ const DataDistributionHistogram = ({