2022-09-01 14:47:28 -07:00
|
|
|
import random
|
|
|
|
import re
|
|
|
|
import time
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("length, sampling", [(10, False), (100, True)])
def test_lossylist_sampling(length, sampling):
    """Exercise LossyList with ``length`` appended strings.

    Asserts that ``len()`` reports every appended element, that the
    ``sampled`` flag equals ``sampling``, that the string form mentions
    sampling only when expected, and that iteration yields the retained
    elements in ascending order of their numeric prefix.
    """
    lossy_list: LossyList[str] = LossyList()
    for index in range(length):
        lossy_list.append(f"{index} Hello World")

    # The reported length counts every append, whether or not it was retained.
    assert len(lossy_list) == length
    assert lossy_list.sampled is sampling

    rendered = str(lossy_list)
    if not sampling:
        assert "sampled" not in rendered
    else:
        assert f"... sampled of {length} total elements" in rendered

    # Recover the numeric prefix of each element that iteration yields.
    list_version = []
    for item in lossy_list:
        prefix = item.split(" ")[0]
        list_version.append(int(prefix))
    print(list_version)
    assert list_version == sorted(list_version)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("length, sampling", [(10, False), (100, True)])
def test_lossyset_sampling(length, sampling):
    """Exercise LossySet with ``length`` distinct strings added.

    The test expects at most 10 elements to be retained, the ``sampled``
    flag to equal ``sampling``, the string form to mention sampling only
    when expected, and iteration to yield no duplicates.
    """
    lossy_set: LossySet[str] = LossySet()
    for index in range(length):
        lossy_set.add(f"{index} Hello World")

    expected_size = min(10, length)
    assert len(lossy_set) == expected_size
    assert lossy_set.sampled is sampling

    rendered = str(lossy_set)
    if not sampling:
        assert "sampled" not in rendered
    else:
        expected_note = f"... sampled with at most {length - 10} elements missing"
        assert expected_note in rendered

    # Recover the numeric prefix of each element that iteration yields.
    list_version = []
    for item in lossy_set:
        prefix = item.split(" ")[0]
        list_version.append(int(prefix))
    set_version = set(list_version)

    # No duplicates, and exactly as many items as the set reports holding.
    assert len(list_version) == len(set_version)
    assert len(list_version) == min(10, length)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "length, sampling, sub_length", [(4, False, 4), (10, False, 14), (100, True, 1000)]
)
def test_lossydict_sampling(length, sampling, sub_length):
    """Exercise LossyDict whose values are LossyLists of random length.

    For each key in ``range(length)`` a random number of strings (between 1
    and ``sub_length - 1``) is appended to that key's list, one at a time,
    re-storing the list in the dict after every append. The test tracks the
    expected list length per key and how many times a key had to be
    (re)created, then checks the dict size, the ``sampled`` flag, the
    string form, and the length of every surviving list.
    """
    lossy_dict: LossyDict[int, LossyList[str]] = LossyDict()
    elements_added = 0
    element_length_map = {}

    for key in range(length):
        list_length = random.choice(range(1, sub_length))
        element_length_map[key] = 0
        for _ in range(list_length):
            if lossy_dict.get(key):
                # Key is present: resync the expected length with the stored list.
                element_length_map[key] = len(lossy_dict[key])
            else:
                elements_added += 1
                # reset to 0 until we get it back
                element_length_map[key] = 0

            current_list = lossy_dict.get(key, LossyList())
            current_list.append(f"{key}:{round(time.time(), 2)} Hello World")
            lossy_dict[key] = current_list
            element_length_map[key] += 1

    expected_size = min(lossy_dict.max_elements, length)
    assert len(lossy_dict) == expected_size
    assert lossy_dict.sampled is sampling

    sampled_pattern = "sampled of at most .* entries."
    if not sampling:
        # cheap way to determine that the dict isn't reporting sampled keys
        assert not re.search(sampled_pattern, str(lossy_dict))
    else:
        assert re.search(sampled_pattern, str(lossy_dict))
        assert (
            f"{lossy_dict.max_elements} sampled of at most {elements_added} entries."
            in str(lossy_dict)
        )

    # Every surviving list must have the length tracked for its key.
    for key, stored_list in lossy_dict.items():
        assert len(stored_list) == element_length_map[key]
|