fix(ingest): run sqllineage in process by default (#11650)

2025-12-12 10:35:51 +00:00 · 2024-10-16 20:47:48 -07:00 · 2024-10-16 20:47:48 -07:00 · 8b42ac8cde
commit 8b42ac8cde
parent b8144699fd
5 changed files with 13 additions and 2883 deletions
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py
@@ -124,9 +124,6 @@ class LookMLSourceConfig(
        description="List of regex patterns for LookML views to include in the extraction.",
    )
    parse_table_names_from_sql: bool = Field(True, description="See note below.")
-    sql_parser: str = Field(
-        "datahub.utilities.sql_parser.DefaultSQLParser", description="See note below."
-    )
    api: Optional[LookerAPIConfig]
    project_name: Optional[str] = Field(
        None,
--- a/metadata-ingestion/src/datahub/ingestion/source/redash.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redash.py
@@ -2,7 +2,6 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from multiprocessing.pool import ThreadPool
 from typing import Dict, Iterable, List, Optional, Set, Type

 import dateutil.parser as dp
@@ -43,6 +42,7 @@ from datahub.metadata.schema_classes import (
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.sql_parser import SQLParser
+from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -646,11 +646,11 @@ class RedashSource(Source):
        self.report.total_dashboards = total_dashboards
        self.report.max_page_dashboards = max_page

-        dash_exec_pool = ThreadPool(self.config.parallelism)
-        for response in dash_exec_pool.imap_unordered(
-            self._process_dashboard_response, range(1, max_page + 1)
-        ):
-            yield from response
+        yield from ThreadedIteratorExecutor.process(
+            self._process_dashboard_response,
+            [(page,) for page in range(1, max_page + 1)],
+            max_workers=self.config.parallelism,
+        )

    def _get_chart_type_from_viz_data(self, viz_data: Dict) -> str:
        """
@@ -769,11 +769,12 @@ class RedashSource(Source):
        logger.info(f"/api/queries total count {total_queries} and max page {max_page}")
        self.report.total_queries = total_queries
        self.report.max_page_queries = max_page
-        chart_exec_pool = ThreadPool(self.config.parallelism)
-        for response in chart_exec_pool.imap_unordered(
-            self._process_query_response, range(1, max_page + 1)
-        ):
-            yield from response
+
+        yield from ThreadedIteratorExecutor.process(
+            self._process_query_response,
+            [(page,) for page in range(1, max_page + 1)],
+            max_workers=self.config.parallelism,
+        )

    def add_config_to_report(self) -> None:
        self.report.api_page_limit = self.config.api_page_limit
--- a/metadata-ingestion/src/datahub/utilities/sql_parser.py
+++ b/metadata-ingestion/src/datahub/utilities/sql_parser.py
@@ -46,7 +46,7 @@ class SqlLineageSQLParser(SQLParser):
    def __init__(
        self,
        sql_query: str,
-        use_external_process: bool = True,
+        use_external_process: bool = False,
        use_raw_names: bool = False,
    ) -> None:
        super().__init__(sql_query, use_external_process)
--- a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json
+++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json
--- a/metadata-ingestion/tests/integration/lookml/test_lookml.py
+++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py
@@ -10,7 +10,6 @@ from deepdiff import DeepDiff
 from freezegun import freeze_time
 from looker_sdk.sdk.api40.models import DBConnection

-from datahub.configuration.common import PipelineExecutionError
 from datahub.ingestion.run.pipeline import Pipeline
 from datahub.ingestion.source.file import read_metadata_file
 from datahub.ingestion.source.looker.looker_template_language import (
@@ -518,53 +517,6 @@ def ingestion_test(
        )


-@freeze_time(FROZEN_TIME)
-def test_lookml_bad_sql_parser(pytestconfig, tmp_path, mock_time):
-    """Incorrect specification of sql parser should not fail ingestion"""
-    test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml"
-    mce_out = "lookml_mces_badsql_parser.json"
-    pipeline = Pipeline.create(
-        {
-            "run_id": "lookml-test",
-            "source": {
-                "type": "lookml",
-                "config": {
-                    "base_folder": str(test_resources_dir / "lkml_samples"),
-                    "connection_to_platform_map": {
-                        "my_connection": {
-                            "platform": "snowflake",
-                            "default_db": "default_db",
-                            "default_schema": "default_schema",
-                        }
-                    },
-                    "parse_table_names_from_sql": True,
-                    "project_name": "lkml_samples",
-                    "sql_parser": "bad.sql.Parser",
-                    "emit_reachable_views_only": False,
-                    "process_refinements": False,
-                },
-            },
-            "sink": {
-                "type": "file",
-                "config": {
-                    "filename": f"{tmp_path}/{mce_out}",
-                },
-            },
-        }
-    )
-    pipeline.run()
-    pipeline.pretty_print_summary()
-    pipeline.raise_from_status(raise_warnings=False)
-    with pytest.raises(PipelineExecutionError):  # we expect the source to have warnings
-        pipeline.raise_from_status(raise_warnings=True)
-
-    mce_helpers.check_golden_file(
-        pytestconfig,
-        output_path=tmp_path / mce_out,
-        golden_path=test_resources_dir / mce_out,
-    )
-
-
 @freeze_time(FROZEN_TIME)
 def test_lookml_git_info(pytestconfig, tmp_path, mock_time):
    """Add github info to config"""