Fix histogram labels (#10797)

* fix: added logic to format bin labels for histogram metric

* fix: histogram labels

* fix: added types for arguments
This commit is contained in:
Teddy 2023-03-28 12:59:45 +02:00 committed by GitHub
parent 52c9a0f0df
commit fc9b64b52a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 105 additions and 5 deletions

View File

@ -24,6 +24,7 @@ from metadata.profiler.metrics.static.count import Count
from metadata.profiler.metrics.static.max import Max from metadata.profiler.metrics.static.max import Max
from metadata.profiler.metrics.static.min import Min from metadata.profiler.metrics.static.min import Min
from metadata.profiler.orm.registry import is_quantifiable from metadata.profiler.orm.registry import is_quantifiable
from metadata.utils.helpers import format_large_string_numbers
from metadata.utils.logger import profiler_logger from metadata.utils.logger import profiler_logger
logger = profiler_logger() logger = profiler_logger()
@ -74,6 +75,27 @@ class Histogram(HybridMetric):
float(res_max), float(res_max),
) # Decimal to float ) # Decimal to float
@staticmethod
def _format_bin_labels(
    lower_bin: Union[float, int], upper_bin: Optional[Union[float, int]] = None
) -> str:
    """Build the display label for one histogram bin.

    Args:
        lower_bin: lower bound of the bin. A `None` value is rendered as "null".
        upper_bin: upper bound of the bin. When left as `None` the label
            describes an open-ended bin.

    Returns:
        str: "<lower> to <upper>" when an upper bound is given,
            "<lower> and up" otherwise
    """
    lower_label = (
        "null" if lower_bin is None else format_large_string_numbers(lower_bin)
    )

    # A bounded bin spells out both ends; anything else is open-ended.
    if upper_bin is not None:
        return f"{lower_label} to {format_large_string_numbers(upper_bin)}"
    return f"{lower_label} and up"
def fn( def fn(
self, self,
sample: Optional[DeclarativeMeta], sample: Optional[DeclarativeMeta],
@ -120,14 +142,17 @@ class Histogram(HybridMetric):
condition = and_(col >= starting_bin_bound) condition = and_(col >= starting_bin_bound)
case_stmts.append( case_stmts.append(
func.count(case([(condition, col)])).label( func.count(case([(condition, col)])).label(
f"{starting_bin_bound:.2f} and up" self._format_bin_labels(starting_bin_bound)
) )
) )
continue continue
case_stmts.append( case_stmts.append(
func.count(case([(condition, col)])).label( func.count(case([(condition, col)])).label(
f"{starting_bin_bound:.2f} to {ending_bin_bound:.2f}" self._format_bin_labels(
starting_bin_bound,
ending_bin_bound,
)
) )
) )
starting_bin_bound = ending_bin_bound starting_bin_bound = ending_bin_bound
@ -176,9 +201,9 @@ class Histogram(HybridMetric):
bins = list(np.arange(num_bins) * bind_width + res_min) bins = list(np.arange(num_bins) * bind_width + res_min)
bins_label = [ bins_label = [
f"{bins[i]:.2f} to {bins[i+1]:.2f}" self._format_bin_labels(bins[i], bins[i + 1])
if i < len(bins) - 1 if i < len(bins) - 1
else f"{bins[i]:.2f} and up" else self._format_bin_labels(bins[i])
for i in range(len(bins)) for i in range(len(bins))
] ]

View File

@ -18,6 +18,7 @@ from __future__ import annotations
import re import re
from datetime import datetime, timedelta from datetime import datetime, timedelta
from functools import wraps from functools import wraps
from math import floor, log
from time import perf_counter from time import perf_counter
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
@ -327,3 +328,18 @@ def get_entity_tier_from_tags(tags: list[TagLabel]) -> Optional[str]:
), ),
None, None,
) )
def format_large_string_numbers(number: Union[float, int]) -> str:
    """Format a number into a short human readable string.

    The value is scaled by powers of 1000 and suffixed with the matching unit
    (e.g. 1,000,000 -> 1.00M, 1,000,000,000 -> 1.00B). Values whose absolute
    value is below 1000 carry no suffix, and values beyond the trillions keep
    the "T" suffix.

    Args:
        number: number to format

    Returns:
        str: "0" for zero, otherwise the scaled value with two decimals
            followed by its unit suffix
    """
    if number == 0:
        return "0"

    units = ["", "K", "M", "B", "T"]
    constant_k = 1000.0
    largest_magnitude = len(units) - 1

    magnitude = int(floor(log(abs(number), constant_k)))
    # |number| < 1 yields a negative magnitude (which would index `units`
    # from the end and scale the value up), and |number| >= 1000^5 runs past
    # the last unit, so pin the magnitude to the units that exist.
    magnitude = max(0, min(magnitude, largest_magnitude))
    # log() is a floating point ratio and may land just below an exact power
    # of 1000; move up one unit when the scaled value still reaches 1000.
    if (
        magnitude < largest_magnitude
        and abs(number) / constant_k**magnitude >= constant_k
    ):
        magnitude += 1

    return f"{number / constant_k**magnitude:.2f}{units[magnitude]}"

View File

@ -0,0 +1,60 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tests utils function for the profiler
"""
from unittest import TestCase
from metadata.profiler.metrics.hybrid.histogram import Histogram
class TestHistogramUtils(TestCase):
    """Checks for the bin-label formatting helper exposed by `Histogram`."""

    @classmethod
    def setUpClass(cls):
        # One shared instance is enough: the tests only call the label helper.
        cls.histogram = Histogram()

    def test_histogram_label_formatter_positive(self):
        """positive bounds, for a bounded and an open-ended bin"""
        assert self.histogram._format_bin_labels(18927, 23456) == "18.93K to 23.46K"
        assert self.histogram._format_bin_labels(18927) == "18.93K and up"

    def test_histogram_label_formatter_negative(self):
        """negative bounds keep their sign in the label"""
        assert (
            self.histogram._format_bin_labels(-18927, -23456) == "-18.93K to -23.46K"
        )
        assert self.histogram._format_bin_labels(-18927) == "-18.93K and up"

    def test_histogram_label_formatter_none(self):
        """a None lower bound is rendered as null"""
        assert self.histogram._format_bin_labels(None) == "null and up"

    def test_histogram_label_formatter_zero(self):
        """a zero lower bound is rendered without decimals or unit"""
        assert self.histogram._format_bin_labels(0) == "0 and up"

    def test_histogram_label_formatter_nines(self):
        """a value just under a round number is rounded up in the label"""
        assert self.histogram._format_bin_labels(99999999) == "100.00M and up"

    def test_histogram_label_formatter_floats(self):
        """float bounds are scaled and rounded to two decimals"""
        assert (
            self.histogram._format_bin_labels(167893.98542, 194993.98542)
            == "167.89K to 194.99K"
        )

View File

@ -103,7 +103,6 @@ const DataDistributionHistogram = ({
<CartesianGrid stroke={GRAPH_BACKGROUND_COLOR} /> <CartesianGrid stroke={GRAPH_BACKGROUND_COLOR} />
<XAxis <XAxis
dataKey="name" dataKey="name"
interval={0}
padding={{ left: 16, right: 16 }} padding={{ left: 16, right: 16 }}
tick={{ fontSize: 12 }} tick={{ fontSize: 12 }}
/> />