MINOR: Improve threading for lineage (#20668)

2025-11-13 17:32:53 +00:00 · 2025-04-07 18:31:52 +05:30 · 2025-04-07 18:31:52 +05:30 · b7d43e7ee2
commit b7d43e7ee2
parent f7c4cc54f4
4 changed files with 129 additions and 56 deletions
--- a/ingestion/src/metadata/ingestion/lineage/parser.py
+++ b/ingestion/src/metadata/ingestion/lineage/parser.py
@ -447,8 +447,14 @@ class LineageParser:
        if lr_sqlfluff:
            return lr_sqlfluff
-        lr_sqlparser = LineageRunner(query)
+        @timeout(seconds=timeout_seconds)
        def get_sqlparser_lineage_runner(qry: str) -> LineageRunner:
            lr_sqlparser = LineageRunner(qry)
            lr_sqlparser.get_column_lineage()
            return lr_sqlparser
        try:
            lr_sqlparser = get_sqlparser_lineage_runner(query)
            _ = len(lr_sqlparser.get_column_lineage()) + len(
                set(lr_sqlparser.source_tables).union(
                    set(lr_sqlparser.target_tables).union(
@ -456,6 +462,13 @@ class LineageParser:
                    )
                )
            )
        except TimeoutError:
            self.query_parsing_success = False
            self.query_parsing_failure_reason = (
                f"Lineage with SqlParser failed for the [{dialect.value}]. "
                f"Parser has been running for more than {timeout_seconds} seconds."
            )
            return None
        except Exception:
            # if both runner have failed we return the usual one
            logger.debug(f"Failed to parse query with sqlparse & sqlfluff: {query}")
--- a/ingestion/src/metadata/ingestion/ometa/mixins/lineage_mixin.py
+++ b/ingestion/src/metadata/ingestion/ometa/mixins/lineage_mixin.py
@ -14,6 +14,7 @@ Mixin class containing Lineage specific methods
 To be used by OpenMetadata class
 """
 import functools
 import json
 import traceback
 from copy import deepcopy
 from typing import Any, Dict, Generic, List, Optional, Type, TypeVar, Union
@ -407,22 +408,26 @@ class OMetaLineageMixin(Generic[T]):
                        f"Error while adding lineage: {lineage_request.left.error}"
                    )
    @functools.lru_cache(maxsize=LRU_CACHE_SIZE)
    def patch_lineage_processed_flag(
        self,
        entity: Type[T],
        fqn: str,
    ) -> None:
-
+        """
        Patch the processed lineage flag for an entity
        """
        try:
-            original_entity = self.get_by_name(entity=entity, fqn=fqn)
+            patch = [
-            if not original_entity:
+                {
-                return
+                    "op": "add",
-
+                    "path": "/processedLineage",
-            updated_entity = original_entity.model_copy(deep=True)
+                    "value": True,
-            updated_entity.processedLineage = True
+                }
-
+            ]
-            self.patch(
+            self.client.patch(
-                entity=entity, source=original_entity, destination=updated_entity
+                path=f"{self.get_suffix(entity)}/name/{fqn}",
                data=json.dumps(patch),
            )
        except Exception as exc:
            logger.debug(f"Error while patching lineage processed flag: {exc}")
--- a/ingestion/src/metadata/ingestion/source/database/lineage_source.py
+++ b/ingestion/src/metadata/ingestion/source/database/lineage_source.py
@ -13,10 +13,10 @@ Lineage Source Module
 """
 import csv
 import os
 import threading
 import time
 import traceback
 from abc import ABC
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from typing import Any, Callable, Iterable, Iterator, List, Optional, Union
@ -54,9 +54,9 @@ from metadata.utils.logger import ingestion_logger
 logger = ingestion_logger()
-CHUNK_SIZE = 200
+CHUNK_SIZE = 100
-THREAD_TIMEOUT = 600
+THREAD_TIMEOUT = 3 * 60 * 10  # 30 minutes in seconds
 class LineageSource(QueryParserSource, ABC):
@ -119,61 +119,95 @@ class LineageSource(QueryParserSource, ABC):
            )
            yield from self.yield_table_query()
    @staticmethod
    def generate_lineage_in_thread(
        self,
        producer_fn: Callable[[], Iterable[Any]],
-        processor_fn: Callable[[Any], Iterable[Any]],
+        processor_fn: Callable[[Any, Queue], None],
        chunk_size: int = CHUNK_SIZE,
        thread_timeout: int = THREAD_TIMEOUT,
        max_threads: int = 10,  # Default maximum number of concurrent threads
    ):
        """
-        Optimized multithreaded lineage generation with improved error handling and performance.
+        Process data in separate daemon threads with timeout control.
        Args:
-            producer_fn: Function that yields input items
+            producer_fn: Function that yields data chunks
-            processor_fn: Function to process each input item
+            processor_fn: Function that processes data and adds results to the queue
-            chunk_size: Optional batching to reduce thread creation overhead
+            chunk_size: Size of chunks to process
            thread_timeout: Maximum time in seconds to wait for a processor thread
            max_threads: Maximum number of concurrent threads to run
        """
        def chunk_generator():
            temp_chunk = []
            for chunk in producer_fn():
                temp_chunk.append(chunk)
                if len(temp_chunk) >= chunk_size:
                    yield temp_chunk
                    temp_chunk = []
            if temp_chunk:
                yield temp_chunk
        thread_pool = ThreadPoolExecutor(max_workers=self.source_config.threads)
        queue = Queue()
        active_threads = []
        def process_chunk(chunk):
            """Process a chunk of data in a thread."""
            try:
                processor_fn(chunk, queue)
            except Exception as e:
                logger.error(f"Error processing chunk: {e}")
                logger.debug(traceback.format_exc())
        # Create an iterator for the chunks but don't consume it all at once
        chunk_iterator = iter(chunk_generator(producer_fn, chunk_size))
        # Process results from the queue and check for timed-out threads
        chunk_processed = False  # Flag to track if all chunks have been processed
        ignored_threads = 0
        futures = [
            thread_pool.submit(
                processor_fn,
                chunk,
                queue,
            )
            for chunk in chunk_generator()
        ]
        while True:
            # Start new threads until we reach the max_threads limit
            while (
                len(active_threads) + ignored_threads
            ) < max_threads and not chunk_processed:
                try:
                    # Only fetch a new chunk when we're ready to create a thread
                    chunk = next(chunk_iterator)
                    thread = threading.Thread(target=process_chunk, args=(chunk,))
                    thread.start_time = time.time()  # Track when the thread started
                    thread.daemon = True
                    active_threads.append(thread)
                    thread.start()
                except StopIteration:
                    # No more chunks to process
                    chunk_processed = True
                    break
            if ignored_threads == max_threads:
                logger.warning(f"Max threads reached, skipping remaining threads")
                break
            # Process any available results
            if queue.has_tasks():
                yield from queue.process()
            # Check for completed or timed-out threads
            still_active = []
            for thread in active_threads:
                if thread.is_alive():
                    # Check if the thread has timed out
                    if time.time() - thread.start_time > thread_timeout:
                        logger.warning(
                            f"Thread {thread.name} timed out after {thread_timeout}s"
                        )
                        ignored_threads += 1
                    else:
-                if not futures:
+                        still_active.append(thread)
                # If thread is not alive, it has completed normally
            active_threads = still_active
            # Exit conditions: no more active threads and no more chunks to process
            if not active_threads and chunk_processed:
                break
-                for i, future in enumerate(futures):
+            # Small pause to prevent CPU spinning
-                    if future.done():
+            if active_threads:
-                        try:
+                time.sleep(0.1)
                            future.result(timeout=THREAD_TIMEOUT)
                        except Exception as e:
                            logger.debug(f"Error in future: {e}")
                            logger.debug(traceback.format_exc())
                        futures.pop(i)
-            time.sleep(0.01)
+        # Final check for any remaining results
        while queue.has_tasks():
            yield from queue.process()
    def yield_table_query(self) -> Iterator[TableQuery]:
        """
@ -269,7 +303,9 @@ class LineageSource(QueryParserSource, ABC):
        producer_fn = self.get_table_query
        processor_fn = self.query_lineage_generator
        yield from self.generate_lineage_in_thread(
-            producer_fn, processor_fn, CHUNK_SIZE
+            producer_fn,
            processor_fn,
            max_threads=self.source_config.threads,
        )
    def view_lineage_generator(
@ -318,7 +354,9 @@ class LineageSource(QueryParserSource, ABC):
            self.source_config.incrementalLineageProcessing,
        )
        processor_fn = self.view_lineage_generator
-        yield from self.generate_lineage_in_thread(producer_fn, processor_fn)
+        yield from self.generate_lineage_in_thread(
            producer_fn, processor_fn, max_threads=self.source_config.threads
        )
    def yield_procedure_lineage(
        self,
@ -412,3 +450,18 @@ class LineageSource(QueryParserSource, ABC):
            and self.source_config.crossDatabaseServiceNames
        ):
            yield from self.yield_cross_database_lineage() or []
 def chunk_generator(producer_fn, chunk_size):
    """
    Group items from producer into chunks of specified size.
    This is a separate function to allow for better lazy evaluation.
    """
    temp_chunk = []
    for item in producer_fn():
        temp_chunk.append(item)
        if len(temp_chunk) >= chunk_size:
            yield temp_chunk
            temp_chunk = []
    if temp_chunk:
        yield temp_chunk
--- a/ingestion/src/metadata/ingestion/source/database/stored_procedures_mixin.py
+++ b/ingestion/src/metadata/ingestion/source/database/stored_procedures_mixin.py
@ -304,4 +304,6 @@ class StoredProcedureLineageMixin(ABC):
        logger.info("Processing Lineage for Stored Procedures")
        producer_fn = self.procedure_lineage_generator
        processor_fn = self.procedure_lineage_processor
-        yield from self.generate_lineage_in_thread(producer_fn, processor_fn)
+        yield from self.generate_lineage_in_thread(
            producer_fn, processor_fn, max_threads=self.source_config.threads
        )