mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-11-13 17:32:53 +00:00
MINOR: Improve threading for lineage (#20668)
This commit is contained in:
parent
f7c4cc54f4
commit
b7d43e7ee2
@ -447,8 +447,14 @@ class LineageParser:
|
|||||||
if lr_sqlfluff:
|
if lr_sqlfluff:
|
||||||
return lr_sqlfluff
|
return lr_sqlfluff
|
||||||
|
|
||||||
lr_sqlparser = LineageRunner(query)
|
@timeout(seconds=timeout_seconds)
|
||||||
|
def get_sqlparser_lineage_runner(qry: str) -> LineageRunner:
|
||||||
|
lr_sqlparser = LineageRunner(qry)
|
||||||
|
lr_sqlparser.get_column_lineage()
|
||||||
|
return lr_sqlparser
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
lr_sqlparser = get_sqlparser_lineage_runner(query)
|
||||||
_ = len(lr_sqlparser.get_column_lineage()) + len(
|
_ = len(lr_sqlparser.get_column_lineage()) + len(
|
||||||
set(lr_sqlparser.source_tables).union(
|
set(lr_sqlparser.source_tables).union(
|
||||||
set(lr_sqlparser.target_tables).union(
|
set(lr_sqlparser.target_tables).union(
|
||||||
@ -456,6 +462,13 @@ class LineageParser:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
except TimeoutError:
|
||||||
|
self.query_parsing_success = False
|
||||||
|
self.query_parsing_failure_reason = (
|
||||||
|
f"Lineage with SqlParser failed for the [{dialect.value}]. "
|
||||||
|
f"Parser has been running for more than {timeout_seconds} seconds."
|
||||||
|
)
|
||||||
|
return None
|
||||||
except Exception:
|
except Exception:
|
||||||
# if both runner have failed we return the usual one
|
# if both runner have failed we return the usual one
|
||||||
logger.debug(f"Failed to parse query with sqlparse & sqlfluff: {query}")
|
logger.debug(f"Failed to parse query with sqlparse & sqlfluff: {query}")
|
||||||
|
|||||||
@ -14,6 +14,7 @@ Mixin class containing Lineage specific methods
|
|||||||
To be used by OpenMetadata class
|
To be used by OpenMetadata class
|
||||||
"""
|
"""
|
||||||
import functools
|
import functools
|
||||||
|
import json
|
||||||
import traceback
|
import traceback
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from typing import Any, Dict, Generic, List, Optional, Type, TypeVar, Union
|
from typing import Any, Dict, Generic, List, Optional, Type, TypeVar, Union
|
||||||
@ -407,22 +408,26 @@ class OMetaLineageMixin(Generic[T]):
|
|||||||
f"Error while adding lineage: {lineage_request.left.error}"
|
f"Error while adding lineage: {lineage_request.left.error}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@functools.lru_cache(maxsize=LRU_CACHE_SIZE)
|
||||||
def patch_lineage_processed_flag(
|
def patch_lineage_processed_flag(
|
||||||
self,
|
self,
|
||||||
entity: Type[T],
|
entity: Type[T],
|
||||||
fqn: str,
|
fqn: str,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""
|
||||||
|
Patch the processed lineage flag for an entity
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
original_entity = self.get_by_name(entity=entity, fqn=fqn)
|
patch = [
|
||||||
if not original_entity:
|
{
|
||||||
return
|
"op": "add",
|
||||||
|
"path": "/processedLineage",
|
||||||
updated_entity = original_entity.model_copy(deep=True)
|
"value": True,
|
||||||
updated_entity.processedLineage = True
|
}
|
||||||
|
]
|
||||||
self.patch(
|
self.client.patch(
|
||||||
entity=entity, source=original_entity, destination=updated_entity
|
path=f"{self.get_suffix(entity)}/name/{fqn}",
|
||||||
|
data=json.dumps(patch),
|
||||||
)
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.debug(f"Error while patching lineage processed flag: {exc}")
|
logger.debug(f"Error while patching lineage processed flag: {exc}")
|
||||||
|
|||||||
@ -13,10 +13,10 @@ Lineage Source Module
|
|||||||
"""
|
"""
|
||||||
import csv
|
import csv
|
||||||
import os
|
import os
|
||||||
|
import threading
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from abc import ABC
|
from abc import ABC
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from typing import Any, Callable, Iterable, Iterator, List, Optional, Union
|
from typing import Any, Callable, Iterable, Iterator, List, Optional, Union
|
||||||
|
|
||||||
@ -54,9 +54,9 @@ from metadata.utils.logger import ingestion_logger
|
|||||||
logger = ingestion_logger()
|
logger = ingestion_logger()
|
||||||
|
|
||||||
|
|
||||||
CHUNK_SIZE = 200
|
CHUNK_SIZE = 100
|
||||||
|
|
||||||
THREAD_TIMEOUT = 600
|
THREAD_TIMEOUT = 3 * 60 * 10 # 30 minutes in seconds
|
||||||
|
|
||||||
|
|
||||||
class LineageSource(QueryParserSource, ABC):
|
class LineageSource(QueryParserSource, ABC):
|
||||||
@ -119,61 +119,95 @@ class LineageSource(QueryParserSource, ABC):
|
|||||||
)
|
)
|
||||||
yield from self.yield_table_query()
|
yield from self.yield_table_query()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def generate_lineage_in_thread(
|
def generate_lineage_in_thread(
|
||||||
self,
|
|
||||||
producer_fn: Callable[[], Iterable[Any]],
|
producer_fn: Callable[[], Iterable[Any]],
|
||||||
processor_fn: Callable[[Any], Iterable[Any]],
|
processor_fn: Callable[[Any, Queue], None],
|
||||||
chunk_size: int = CHUNK_SIZE,
|
chunk_size: int = CHUNK_SIZE,
|
||||||
|
thread_timeout: int = THREAD_TIMEOUT,
|
||||||
|
max_threads: int = 10, # Default maximum number of concurrent threads
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Optimized multithreaded lineage generation with improved error handling and performance.
|
Process data in separate daemon threads with timeout control.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
producer_fn: Function that yields input items
|
producer_fn: Function that yields data chunks
|
||||||
processor_fn: Function to process each input item
|
processor_fn: Function that processes data and adds results to the queue
|
||||||
chunk_size: Optional batching to reduce thread creation overhead
|
chunk_size: Size of chunks to process
|
||||||
|
thread_timeout: Maximum time in seconds to wait for a processor thread
|
||||||
|
max_threads: Maximum number of concurrent threads to run
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def chunk_generator():
|
|
||||||
temp_chunk = []
|
|
||||||
for chunk in producer_fn():
|
|
||||||
temp_chunk.append(chunk)
|
|
||||||
if len(temp_chunk) >= chunk_size:
|
|
||||||
yield temp_chunk
|
|
||||||
temp_chunk = []
|
|
||||||
|
|
||||||
if temp_chunk:
|
|
||||||
yield temp_chunk
|
|
||||||
|
|
||||||
thread_pool = ThreadPoolExecutor(max_workers=self.source_config.threads)
|
|
||||||
queue = Queue()
|
queue = Queue()
|
||||||
|
active_threads = []
|
||||||
|
|
||||||
|
def process_chunk(chunk):
|
||||||
|
"""Process a chunk of data in a thread."""
|
||||||
|
try:
|
||||||
|
processor_fn(chunk, queue)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error processing chunk: {e}")
|
||||||
|
logger.debug(traceback.format_exc())
|
||||||
|
|
||||||
|
# Create an iterator for the chunks but don't consume it all at once
|
||||||
|
chunk_iterator = iter(chunk_generator(producer_fn, chunk_size))
|
||||||
|
|
||||||
|
# Process results from the queue and check for timed-out threads
|
||||||
|
chunk_processed = False # Flag to track if all chunks have been processed
|
||||||
|
ignored_threads = 0
|
||||||
|
|
||||||
futures = [
|
|
||||||
thread_pool.submit(
|
|
||||||
processor_fn,
|
|
||||||
chunk,
|
|
||||||
queue,
|
|
||||||
)
|
|
||||||
for chunk in chunk_generator()
|
|
||||||
]
|
|
||||||
while True:
|
while True:
|
||||||
|
# Start new threads until we reach the max_threads limit
|
||||||
|
while (
|
||||||
|
len(active_threads) + ignored_threads
|
||||||
|
) < max_threads and not chunk_processed:
|
||||||
|
try:
|
||||||
|
# Only fetch a new chunk when we're ready to create a thread
|
||||||
|
chunk = next(chunk_iterator)
|
||||||
|
thread = threading.Thread(target=process_chunk, args=(chunk,))
|
||||||
|
thread.start_time = time.time() # Track when the thread started
|
||||||
|
thread.daemon = True
|
||||||
|
active_threads.append(thread)
|
||||||
|
thread.start()
|
||||||
|
except StopIteration:
|
||||||
|
# No more chunks to process
|
||||||
|
chunk_processed = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if ignored_threads == max_threads:
|
||||||
|
logger.warning(f"Max threads reached, skipping remaining threads")
|
||||||
|
break
|
||||||
|
|
||||||
|
# Process any available results
|
||||||
if queue.has_tasks():
|
if queue.has_tasks():
|
||||||
yield from queue.process()
|
yield from queue.process()
|
||||||
|
|
||||||
|
# Check for completed or timed-out threads
|
||||||
|
still_active = []
|
||||||
|
for thread in active_threads:
|
||||||
|
if thread.is_alive():
|
||||||
|
# Check if the thread has timed out
|
||||||
|
if time.time() - thread.start_time > thread_timeout:
|
||||||
|
logger.warning(
|
||||||
|
f"Thread {thread.name} timed out after {thread_timeout}s"
|
||||||
|
)
|
||||||
|
ignored_threads += 1
|
||||||
else:
|
else:
|
||||||
if not futures:
|
still_active.append(thread)
|
||||||
|
# If thread is not alive, it has completed normally
|
||||||
|
|
||||||
|
active_threads = still_active
|
||||||
|
|
||||||
|
# Exit conditions: no more active threads and no more chunks to process
|
||||||
|
if not active_threads and chunk_processed:
|
||||||
break
|
break
|
||||||
|
|
||||||
for i, future in enumerate(futures):
|
# Small pause to prevent CPU spinning
|
||||||
if future.done():
|
if active_threads:
|
||||||
try:
|
time.sleep(0.1)
|
||||||
future.result(timeout=THREAD_TIMEOUT)
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug(f"Error in future: {e}")
|
|
||||||
logger.debug(traceback.format_exc())
|
|
||||||
futures.pop(i)
|
|
||||||
|
|
||||||
time.sleep(0.01)
|
# Final check for any remaining results
|
||||||
|
while queue.has_tasks():
|
||||||
|
yield from queue.process()
|
||||||
|
|
||||||
def yield_table_query(self) -> Iterator[TableQuery]:
|
def yield_table_query(self) -> Iterator[TableQuery]:
|
||||||
"""
|
"""
|
||||||
@ -269,7 +303,9 @@ class LineageSource(QueryParserSource, ABC):
|
|||||||
producer_fn = self.get_table_query
|
producer_fn = self.get_table_query
|
||||||
processor_fn = self.query_lineage_generator
|
processor_fn = self.query_lineage_generator
|
||||||
yield from self.generate_lineage_in_thread(
|
yield from self.generate_lineage_in_thread(
|
||||||
producer_fn, processor_fn, CHUNK_SIZE
|
producer_fn,
|
||||||
|
processor_fn,
|
||||||
|
max_threads=self.source_config.threads,
|
||||||
)
|
)
|
||||||
|
|
||||||
def view_lineage_generator(
|
def view_lineage_generator(
|
||||||
@ -318,7 +354,9 @@ class LineageSource(QueryParserSource, ABC):
|
|||||||
self.source_config.incrementalLineageProcessing,
|
self.source_config.incrementalLineageProcessing,
|
||||||
)
|
)
|
||||||
processor_fn = self.view_lineage_generator
|
processor_fn = self.view_lineage_generator
|
||||||
yield from self.generate_lineage_in_thread(producer_fn, processor_fn)
|
yield from self.generate_lineage_in_thread(
|
||||||
|
producer_fn, processor_fn, max_threads=self.source_config.threads
|
||||||
|
)
|
||||||
|
|
||||||
def yield_procedure_lineage(
|
def yield_procedure_lineage(
|
||||||
self,
|
self,
|
||||||
@ -412,3 +450,18 @@ class LineageSource(QueryParserSource, ABC):
|
|||||||
and self.source_config.crossDatabaseServiceNames
|
and self.source_config.crossDatabaseServiceNames
|
||||||
):
|
):
|
||||||
yield from self.yield_cross_database_lineage() or []
|
yield from self.yield_cross_database_lineage() or []
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_generator(
    producer_fn: Callable[[], Iterable[Any]], chunk_size: int
) -> Iterator[List[Any]]:
    """
    Group items from producer into chunks of specified size.
    This is a separate function to allow for better lazy evaluation.

    Args:
        producer_fn: Zero-argument callable returning the iterable of items
            to group. It is called once, when iteration starts.
        chunk_size: Number of items after which a chunk is emitted. A chunk
            is yielded as soon as its length reaches this value, so a value
            below 1 results in one-item chunks.

    Yields:
        Lists of consecutive items in producer order. Every list has
        ``chunk_size`` items except possibly the last one, which carries the
        remainder. Nothing is yielded when the producer is empty.
    """
    temp_chunk = []
    for item in producer_fn():
        temp_chunk.append(item)
        if len(temp_chunk) >= chunk_size:
            yield temp_chunk
            # Start a fresh list so the chunk already handed out is not mutated
            temp_chunk = []
    # Flush the trailing partial chunk, if any
    if temp_chunk:
        yield temp_chunk
|
||||||
|
|||||||
@ -304,4 +304,6 @@ class StoredProcedureLineageMixin(ABC):
|
|||||||
logger.info("Processing Lineage for Stored Procedures")
|
logger.info("Processing Lineage for Stored Procedures")
|
||||||
producer_fn = self.procedure_lineage_generator
|
producer_fn = self.procedure_lineage_generator
|
||||||
processor_fn = self.procedure_lineage_processor
|
processor_fn = self.procedure_lineage_processor
|
||||||
yield from self.generate_lineage_in_thread(producer_fn, processor_fn)
|
yield from self.generate_lineage_in_thread(
|
||||||
|
producer_fn, processor_fn, max_threads=self.source_config.threads
|
||||||
|
)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user