Mirror of https://github.com/open-metadata/OpenMetadata.git

[TASK-6241] Cache ES ingestion when ingesting usage (#6276)

* Cache ingestion
* Uncomment secrets manager
* Fix experiment stuff
* Fix style
* Add LRU cache
* Add tests
* Fix code smell

parent 4d4a2fc2cf
commit 756dae5605
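Taken together, the change threads a small bounded cache of already-seen queries through the usage ingestion path, so duplicate queries are only pushed through lineage computation and the ES-backed API once. A minimal sketch of the pattern, assuming the LRUCache module added in this PR; process_query is a hypothetical stand-in for the real ingestion call:

from metadata.utils.lru_cache import LRUCache

LRU_CACHE_SIZE = 4096  # same bound the PR uses in each module


def ingest_once(queries, process_query):
    """Process each distinct query at most once, with bounded memory."""
    seen_queries = LRUCache(LRU_CACHE_SIZE)
    for query in queries:
        if query in seen_queries:
            continue  # duplicate: skip the expensive ingestion call
        process_query(query)
        seen_queries.put(query, None)  # only membership matters, the value is unused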
@@ -36,6 +36,7 @@ from metadata.ingestion.ometa.client import APIError
 from metadata.ingestion.ometa.ometa_api import OpenMetadata
 from metadata.utils import fqn
 from metadata.utils.logger import ingestion_logger
+from metadata.utils.lru_cache import LRUCache
 from metadata.utils.sql_lineage import (
     get_column_fqn,
     get_lineage_by_query,
@@ -44,6 +45,8 @@ from metadata.utils.sql_lineage import (
 
 logger = ingestion_logger()
 
+LRU_CACHE_SIZE = 4096
+
 
 class MetadataUsageSinkConfig(ConfigModel):
     filename: str
@@ -85,8 +88,11 @@ class MetadataUsageBulkSink(BulkSink):
             for query in queries
             if "create" in query.query.lower() or "insert" in query.query.lower()
         ]
+        seen_queries = LRUCache(LRU_CACHE_SIZE)
 
         for query in create_or_insert_queries:
+            if query in seen_queries:
+                continue
             lineages = get_lineage_by_query(
                 self.metadata,
                 query=query,
@@ -97,6 +103,7 @@ class MetadataUsageBulkSink(BulkSink):
             for lineage in lineages or []:
                 created_lineage = self.metadata.add_lineage(lineage)
                 logger.info(f"Successfully added Lineage {created_lineage}")
+            seen_queries.put(query, None)  # None because it really doesn't matter.
 
     def __populate_table_usage_map(
         self, table_entity: Table, table_usage: TableUsageCount
@@ -259,12 +266,12 @@ class MetadataUsageBulkSink(BulkSink):
     def __get_table_joins(
         self, table_entity: Table, table_usage: TableUsageCount
     ) -> TableJoins:
-        table_joins: TableJoins = TableJoins(
-            columnJoins=[], directTableJoins=[], startDate=table_usage.date
-        )
         """
         Method to get Table Joins
        """
+        table_joins: TableJoins = TableJoins(
+            columnJoins=[], directTableJoins=[], startDate=table_usage.date
+        )
         column_joins_dict = {}
         for column_join in table_usage.joins:
             joined_with = {}
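As far as the hunks show, the bulk sink keys its cache on the query value itself, so only exact repeats are skipped; the same statement with different casing or whitespace would still be re-processed. A quick illustration of that assumption with string keys:

from metadata.utils.lru_cache import LRUCache

seen_queries = LRUCache(4096)
seen_queries.put("INSERT INTO t VALUES (1)", None)

print("INSERT INTO t VALUES (1)" in seen_queries)  # True: an exact repeat is skipped
print("insert into t values (1)" in seen_queries)  # False: different casing is treated as new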
@@ -31,9 +31,12 @@ from metadata.generated.schema.entity.data.table import (
 from metadata.generated.schema.type.usageRequest import UsageRequest
 from metadata.ingestion.ometa.client import REST
 from metadata.ingestion.ometa.utils import ometa_logger
+from metadata.utils.lru_cache import LRUCache
 
 logger = ometa_logger()
 
+LRU_CACHE_SIZE = 4096
+
 
 class OMetaTableMixin:
     """
@@ -128,11 +131,14 @@ class OMetaTableMixin:
         :param table: Table Entity to update
         :param table_queries: SqlQuery to add
         """
+        seen_queries = LRUCache(LRU_CACHE_SIZE)
         for query in table_queries:
-            self.client.put(
-                f"{self.get_suffix(Table)}/{table.id.__root__}/tableQuery",
-                data=query.json(),
-            )
+            if query.query not in seen_queries:
+                self.client.put(
+                    f"{self.get_suffix(Table)}/{table.id.__root__}/tableQuery",
+                    data=query.json(),
+                )
+                seen_queries.put(query.query, None)
 
     def publish_table_usage(
         self, table: Table, table_usage_request: UsageRequest
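In the table mixin the cache is created per call to the ingestion method and keyed on `query.query` (the SQL text of each SqlQuery), so duplicate texts within one batch result in a single PUT. A rough sketch of the same guard with a stand-in query type (FakeQuery and put_unique_queries are illustrative, not part of the PR; `client` is any object with a put(path, data=...) method):

from dataclasses import dataclass

from metadata.utils.lru_cache import LRUCache

LRU_CACHE_SIZE = 4096


@dataclass
class FakeQuery:
    """Illustrative stand-in for SqlQuery: just the SQL text."""

    query: str

    def json(self) -> str:
        return f'{{"query": "{self.query}"}}'


def put_unique_queries(client, path: str, table_queries) -> None:
    """Send each distinct query text to the API once (sketch of the mixin loop)."""
    seen_queries = LRUCache(LRU_CACHE_SIZE)
    for query in table_queries:
        if query.query not in seen_queries:
            client.put(path, data=query.json())
            seen_queries.put(query.query, None)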
@@ -92,7 +92,7 @@ class UsageSource(Source[TableQuery], ABC):
                     datetime.utcnow()
                     if not query_dict.get("start_time")
                     else datetime.strptime(
-                        query_dict.get("start_time"), "%Y-%m-%d %H:%M:%S"
+                        query_dict.get("start_time"), "%Y-%m-%d %H:%M:%S.%f"
                     )
                 )
                 query_list.append(
@@ -101,7 +101,7 @@ class UsageSource(Source[TableQuery], ABC):
                     userName=query_dict.get("user_name", ""),
                     startTime=query_dict.get("start_time", ""),
                     endTime=query_dict.get("end_time", ""),
-                    analysisDate=analysis_date.date(),
+                    analysisDate=analysis_date,
                     aborted=self.get_aborted_status(query_dict),
                     databaseName=self.get_database_name(query_dict),
                     serviceName=self.config.serviceName,
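Two behavioral fixes ride along in the usage source: start times are now parsed with a microsecond component ("%H:%M:%S.%f"), and analysisDate is passed as the full datetime instead of being truncated to a date. A small standard-library check of the format strings (the timestamp value is only an example):

from datetime import datetime

raw = "2022-07-20 10:15:30.123456"

# The old format string rejects timestamps that carry microseconds:
try:
    datetime.strptime(raw, "%Y-%m-%d %H:%M:%S")
except ValueError as err:
    print(f"old format fails: {err}")

# The new format string accepts them:
analysis_date = datetime.strptime(raw, "%Y-%m-%d %H:%M:%S.%f")
print(analysis_date)         # full datetime, what analysisDate receives after this PR
print(analysis_date.date())  # what was passed before (date only)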
ingestion/src/metadata/utils/lru_cache.py (new file, 43 lines)
@@ -0,0 +1,43 @@
"""
LRU cache
"""

from collections import OrderedDict


class LRUCache:
    """Least Recently Used cache"""

    def __init__(self, capacity: int) -> None:
        self._cache = OrderedDict()
        self.capacity = capacity

    def get(self, key):
        """
        Returns the value associated to `key` if it exists,
        updating the cache usage.
        Raises `KeyError` if `key` doesn't exist in the cache.
        """
        self._cache.move_to_end(key)
        return self._cache[key]

    def put(self, key, value) -> None:
        """
        Assigns `value` to `key`, overwriting `key` if it already exists
        in the cache and updating the cache usage.
        If the size of the cache grows above capacity, pops the least used
        element.
        """
        self._cache[key] = value
        self._cache.move_to_end(key)
        if len(self._cache) > self.capacity:
            self._cache.popitem(last=False)

    def __contains__(self, key) -> bool:
        if key not in self._cache:
            return False
        self._cache.move_to_end(key)
        return True

    def __len__(self) -> int:
        return len(self._cache)
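The cache is an OrderedDict wrapper: put, get, and membership checks move the touched key to the end, and anything past capacity falls off the front. A brief usage example of the class above:

from metadata.utils.lru_cache import LRUCache

cache = LRUCache(capacity=2)
cache.put("a", 1)
cache.put("b", 2)

print(cache.get("a"))  # 1, and "a" becomes the most recently used key
cache.put("c", 3)      # over capacity: evicts "b", the least recently used

print("b" in cache)    # False
print(len(cache))      # 2
try:
    cache.get("b")
except KeyError:
    print("evicted keys raise KeyError")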
@@ -28,10 +28,13 @@ from metadata.ingestion.ometa.ometa_api import OpenMetadata
 from metadata.utils import fqn
 from metadata.utils.helpers import get_formatted_entity_name
 from metadata.utils.logger import utils_logger
+from metadata.utils.lru_cache import LRUCache
 
 logger = utils_logger()
 column_lineage_map = {}
 
+LRU_CACHE_SIZE = 4096
+
 
 def split_raw_table_name(database: str, raw_name: str) -> dict:
     database_schema = None
@@ -53,6 +56,9 @@ def get_column_fqn(table_entity: Table, column: str) -> Optional[str]:
             return tbl_column.fullyQualifiedName.__root__
 
 
+search_cache = LRUCache(LRU_CACHE_SIZE)
+
+
 def search_table_entities(
     metadata: OpenMetadata,
     service_name: str,
@@ -65,6 +71,10 @@ def search_table_entities(
     It uses ES to build the FQN if we miss some info and will run
     a request against the API to find the Entity.
     """
+    search_tuple = (service_name, database, database_schema, table)
+    if search_tuple in search_cache:
+        return search_cache.get(search_tuple)
+    else:
         try:
             table_fqns = fqn.build(
                 metadata,
@@ -80,6 +90,7 @@ def search_table_entities(
                 table_entity: Table = metadata.get_by_name(Table, fqn=table_fqn)
                 if table_entity:
                     table_entities.append(table_entity)
+            search_cache.put(search_tuple, table_entities)
             return table_entities
         except Exception as err:
             logger.debug(traceback.format_exc())
@@ -176,7 +187,6 @@ def _create_lineage_by_table_name(
     """
 
     try:
-
         from_table_entities = get_table_entities_from_query(
             metadata=metadata,
             service_name=service_name,
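In sql_lineage, the module-level search_cache memoizes search_table_entities on the tuple (service_name, database, database_schema, table), so repeated lineage parsing for the same table skips both the FQN build against Elasticsearch and the follow-up API lookups. A sketch of the same memoization shape, with expensive_lookup as a hypothetical placeholder for the ES/API calls:

from typing import List, Optional

from metadata.utils.lru_cache import LRUCache

LRU_CACHE_SIZE = 4096
search_cache = LRUCache(LRU_CACHE_SIZE)


def search_table_entities_cached(
    service_name: str,
    database: Optional[str],
    database_schema: Optional[str],
    table: str,
    expensive_lookup,
) -> Optional[List]:
    """Memoize lookups on the full identifier tuple (sketch only)."""
    search_tuple = (service_name, database, database_schema, table)
    if search_tuple in search_cache:
        return search_cache.get(search_tuple)
    table_entities = expensive_lookup(service_name, database, database_schema, table)
    search_cache.put(search_tuple, table_entities)
    return table_entities

As the hunk above shows, the result is cached even when the lookup comes back empty, so a repeated miss is also served from the cache until the entry is evicted.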
ingestion/tests/unit/metadata/utils/test_lru_cache.py (new file, 50 lines)
@@ -0,0 +1,50 @@
"""Tests for the LRU cache class"""

import pytest

from metadata.utils.lru_cache import LRUCache


class TestLRUCache:
    def test_create_cache(self) -> None:
        cache = LRUCache(2)
        cache.put(1, 1)

    def test_get_fails_if_key_doesnt_exist(self) -> None:
        cache = LRUCache(2)
        with pytest.raises(KeyError):
            cache.get(1)

    def test_putting_an_element_increases_cache_size(self) -> None:
        cache = LRUCache(2)
        assert len(cache) == 0
        cache.put(1, None)
        cache.put(2, None)
        assert len(cache) == 2

    def test_contains_determines_if_an_element_exists(self) -> None:
        cache = LRUCache(2)
        cache.put(1, 1)
        assert 1 in cache
        assert 2 not in cache

    def test_putting_over_capacity_rotates_cache(self) -> None:
        cache = LRUCache(2)
        cache.put(1, None)
        cache.put(2, None)
        cache.put(3, None)
        assert 1 not in cache

    def test_interacting_with_a_key_makes_it_used(self) -> None:
        cache = LRUCache(2)
        cache.put(1, None)
        cache.put(2, None)
        1 in cache  # touching key 1 via __contains__ marks it as most recently used
        cache.put(3, None)
        assert 1 in cache
        assert 2 not in cache

    def test_getting_an_existing_key_returns_the_associated_element(self) -> None:
        cache = LRUCache(2)
        cache.put(1, 2)
        assert cache.get(1) == 2
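The recency behavior these tests pin down is the one that matters for ingestion: any membership check refreshes a key, so frequently repeated queries stay cached while one-off queries age out. A condensed restatement of test_interacting_with_a_key_makes_it_used with illustrative string keys:

from metadata.utils.lru_cache import LRUCache

cache = LRUCache(2)
cache.put("frequent query", None)
cache.put("rare query", None)

# The membership check below refreshes "frequent query" ...
assert "frequent query" in cache

# ... so the next insertion over capacity evicts "rare query" instead.
cache.put("another query", None)
assert "frequent query" in cache
assert "rare query" not in cache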