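"""Tests for serialized_lru_cache, a thread-safe LRU cache decorator.

These tests cover cache hits and misses, LRU eviction at maxsize, parallel
computation of distinct keys, and serialization of concurrent calls that
share a key.
"""
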
import threading
import time

from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.serialized_lru_cache import serialized_lru_cache


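# For orientation, a minimal sketch of how a decorator like
# serialized_lru_cache can be built. This is an illustrative assumption, not
# DataHub's actual implementation, and the name _sketch_serialized_lru_cache
# is hypothetical; the tests below exercise the real decorator imported
# above. The idea: functools.lru_cache supplies the LRU bookkeeping and
# cache_info(), while one lock per argument key makes concurrent callers of
# the same key wait for a single computation, which is exactly the behavior
# the tests assert (one miss, nine hits; distinct keys still run in
# parallel).
import functools
from collections import defaultdict
from typing import Any, Callable


def _sketch_serialized_lru_cache(
    maxsize: int,
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        cached = functools.lru_cache(maxsize=maxsize)(func)
        key_locks: Any = defaultdict(threading.Lock)  # one lock per argument key
        locks_guard = threading.Lock()  # protects key_locks itself

        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            key = (args, tuple(sorted(kwargs.items())))
            with locks_guard:
                key_lock = key_locks[key]
            # Serialize computation per key; distinct keys run concurrently.
            with key_lock:
                return cached(*args, **kwargs)

        wrapper.cache_info = cached.cache_info  # type: ignore[attr-defined]
        return wrapper

    return decorator

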
def test_cache_hit() -> None:
    """A repeated call with the same argument is served from the cache."""

    @serialized_lru_cache(maxsize=2)
    def fetch_data(x):
        return x * 2

    assert fetch_data(1) == 2  # Cache miss
    assert fetch_data(1) == 2  # Cache hit
    assert fetch_data.cache_info().hits == 1  # type: ignore
    assert fetch_data.cache_info().misses == 1  # type: ignore


def test_cache_eviction() -> None:
    """The least recently used entry is evicted once maxsize is exceeded."""

    @serialized_lru_cache(maxsize=2)
    def compute(x):
        return x * 2

    compute(1)
    compute(2)
    compute(3)  # Should evict the first entry (1)
    assert compute.cache_info().currsize == 2  # type: ignore
    assert compute.cache_info().misses == 3  # type: ignore
    assert compute(1) == 2  # Cache miss, since it was evicted
    assert compute.cache_info().misses == 4  # type: ignore


def test_thread_safety() -> None:
    """Calls with distinct keys run concurrently instead of being serialized."""

    @serialized_lru_cache(maxsize=5)
    def compute(x):
        time.sleep(0.2)  # Simulate some delay
        return x * 2

    threads = []
    results = [None] * 10

    def thread_func(index, arg):
        results[index] = compute(arg)

    with PerfTimer() as timer:
        for i in range(10):
            thread = threading.Thread(target=thread_func, args=(i, i % 5))
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()

    assert len(set(results)) == 5  # Only 5 unique results should be there
    assert compute.cache_info().currsize <= 5  # type: ignore
    # Only 5 unique calls should miss the cache
    assert compute.cache_info().misses == 5  # type: ignore

    # Should take less than 1 second. If not, it means all calls were run serially.
    assert timer.elapsed_seconds() < 1


def test_concurrent_access_to_same_key() -> None:
    """Ten concurrent calls sharing a key trigger only one computation."""

    @serialized_lru_cache(maxsize=3)
    def compute(x: int) -> int:
        time.sleep(0.2)  # Simulate some delay
        return x * 2

    threads = []
    results = []

    def thread_func():
        results.append(compute(1))

    with PerfTimer() as timer:
        for _ in range(10):
            thread = threading.Thread(target=thread_func)
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()

    assert all(result == 2 for result in results)  # All should compute the same result

    # 9 hits, as the first call is the only miss
    assert compute.cache_info().hits == 9  # type: ignore
    # Only the first call is a miss
    assert compute.cache_info().misses == 1  # type: ignore

    # Should take less than 1 second. If not, it means all calls were run serially.
    assert timer.elapsed_seconds() < 1