Fix histogram labels (#10797)

* fix: added logic to format bin labels for histogram metric

* fix: histogram labels

* fix: added types for arguments
This commit is contained in:
Teddy 2023-03-28 12:59:45 +02:00 committed by GitHub
parent 52c9a0f0df
commit fc9b64b52a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 105 additions and 5 deletions

View File

@ -24,6 +24,7 @@ from metadata.profiler.metrics.static.count import Count
from metadata.profiler.metrics.static.max import Max
from metadata.profiler.metrics.static.min import Min
from metadata.profiler.orm.registry import is_quantifiable
from metadata.utils.helpers import format_large_string_numbers
from metadata.utils.logger import profiler_logger
logger = profiler_logger()
@ -74,6 +75,27 @@ class Histogram(HybridMetric):
float(res_max),
) # Decimal to float
@staticmethod
def _format_bin_labels(
    lower_bin: Optional[Union[float, int]],
    upper_bin: Optional[Union[float, int]] = None,
) -> str:
    """Build the display label for a histogram bin.

    Args:
        lower_bin: lower bound of the bin. ``None`` is accepted and is
            rendered as the literal text "null".
        upper_bin: upper bound of the bin. When ``None`` (the default) the
            bin is labelled as open-ended.

    Returns:
        str: "<lower> to <upper>" when an upper bound is given, otherwise
            "<lower> and up". Bounds are rendered with
            ``format_large_string_numbers``.
    """
    formatted_lower_bin = (
        "null" if lower_bin is None else format_large_string_numbers(lower_bin)
    )

    # No upper bound means this is the last, open-ended bin.
    if upper_bin is None:
        return f"{formatted_lower_bin} and up"

    return f"{formatted_lower_bin} to {format_large_string_numbers(upper_bin)}"
def fn(
self,
sample: Optional[DeclarativeMeta],
@ -120,14 +142,17 @@ class Histogram(HybridMetric):
condition = and_(col >= starting_bin_bound)
case_stmts.append(
func.count(case([(condition, col)])).label(
f"{starting_bin_bound:.2f} and up"
self._format_bin_labels(starting_bin_bound)
)
)
continue
case_stmts.append(
func.count(case([(condition, col)])).label(
f"{starting_bin_bound:.2f} to {ending_bin_bound:.2f}"
self._format_bin_labels(
starting_bin_bound,
ending_bin_bound,
)
)
)
starting_bin_bound = ending_bin_bound
@ -176,9 +201,9 @@ class Histogram(HybridMetric):
bins = list(np.arange(num_bins) * bind_width + res_min)
bins_label = [
f"{bins[i]:.2f} to {bins[i+1]:.2f}"
self._format_bin_labels(bins[i], bins[i + 1])
if i < len(bins) - 1
else f"{bins[i]:.2f} and up"
else self._format_bin_labels(bins[i])
for i in range(len(bins))
]

View File

@ -18,6 +18,7 @@ from __future__ import annotations
import re
from datetime import datetime, timedelta
from functools import wraps
from math import floor, log
from time import perf_counter
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
@ -327,3 +328,18 @@ def get_entity_tier_from_tags(tags: list[TagLabel]) -> Optional[str]:
),
None,
)
def format_large_string_numbers(number: Union[float, int]) -> str:
    """Format a number in a compact, human readable form.

    The value is scaled by the largest power of 1000 it reaches and suffixed
    with the matching unit (e.g. 1,000,000 -> 1.00M, 1,000,000,000 -> 1.00B).
    Values whose absolute value is below 1000 get no suffix, and values past
    the trillions keep the "T" suffix.

    Args:
        number: number to format. Negative values keep their sign.

    Returns:
        str: "0" for zero, otherwise the scaled value with two decimals
            followed by its unit suffix.
    """
    if number == 0:
        return "0"

    units = ["", "K", "M", "B", "T"]
    constant_k = 1000.0
    largest_magnitude = len(units) - 1

    magnitude = int(floor(log(abs(number), constant_k)))
    # |number| < 1 yields a negative magnitude (which would index `units`
    # from the end) and very large values run past the last unit: clamp both.
    magnitude = min(max(magnitude, 0), largest_magnitude)
    # The floating point log can land just below an exact power of 1000;
    # bump the magnitude when the value has in fact reached the next unit.
    if magnitude < largest_magnitude and abs(number) >= constant_k ** (magnitude + 1):
        magnitude += 1

    return f"{number / constant_k**magnitude:.2f}{units[magnitude]}"

View File

@ -0,0 +1,60 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tests for the histogram utility functions of the profiler
"""
from unittest import TestCase
from metadata.profiler.metrics.hybrid.histogram import Histogram
class TestHistogramUtils(TestCase):
    """Checks the bin labels produced by ``Histogram._format_bin_labels``."""

    @classmethod
    def setUpClass(cls):
        # One Histogram instance is shared by every test in the class.
        cls.histogram = Histogram()

    def test_histogram_label_formatter_positive(self):
        """Positive bounds, both as a closed range and as an open-ended bin."""
        closed_label = self.histogram._format_bin_labels(18927, 23456)
        open_label = self.histogram._format_bin_labels(18927)

        assert closed_label == "18.93K to 23.46K"
        assert open_label == "18.93K and up"

    def test_histogram_label_formatter_negative(self):
        """Negative bounds keep their sign in the label."""
        closed_label = self.histogram._format_bin_labels(-18927, -23456)
        open_label = self.histogram._format_bin_labels(-18927)

        assert closed_label == "-18.93K to -23.46K"
        assert open_label == "-18.93K and up"

    def test_histogram_label_formatter_none(self):
        """A ``None`` lower bound is rendered as "null"."""
        assert self.histogram._format_bin_labels(None) == "null and up"

    def test_histogram_label_formatter_zero(self):
        """A zero lower bound is rendered as a bare "0"."""
        assert self.histogram._format_bin_labels(0) == "0 and up"

    def test_histogram_label_formatter_nines(self):
        """A run of nines rounds up within its unit."""
        assert self.histogram._format_bin_labels(99999999) == "100.00M and up"

    def test_histogram_label_formatter_floats(self):
        """Float bounds are scaled and rounded to two decimals."""
        label = self.histogram._format_bin_labels(167893.98542, 194993.98542)

        assert label == "167.89K to 194.99K"

View File

@ -103,7 +103,6 @@ const DataDistributionHistogram = ({
<CartesianGrid stroke={GRAPH_BACKGROUND_COLOR} />
<XAxis
dataKey="name"
interval={0}
padding={{ left: 16, right: 16 }}
tick={{ fontSize: 12 }}
/>