Mirror of https://github.com/datahub-project/datahub.git (synced 2025-10-31 10:49:00 +00:00)

feat(ingestion): Enhanced column lineage extraction for Looker/LookML (#14826)

parent 50c5841b50
commit c18b125a05
@@ -307,6 +307,12 @@ class ViewFieldType(Enum):
     UNKNOWN = "Unknown"


+class ViewFieldDimensionGroupType(Enum):
+    # Ref: https://cloud.google.com/looker/docs/reference/param-field-dimension-group
+    TIME = "time"
+    DURATION = "duration"
+
+
 class ViewFieldValue(Enum):
     NOT_AVAILABLE = "NotAvailable"
@@ -11,3 +11,7 @@ prod = "prod"
 dev = "dev"
 NAME = "name"
 DERIVED_DOT_SQL = "derived.sql"
+
+VIEW_FIELD_TYPE_ATTRIBUTE = "type"
+VIEW_FIELD_INTERVALS_ATTRIBUTE = "intervals"
+VIEW_FIELD_TIMEFRAMES_ATTRIBUTE = "timeframes"
@@ -2,6 +2,7 @@
 import json
 import logging
 import os
+from enum import Enum
 from functools import lru_cache
 from typing import Dict, List, MutableMapping, Optional, Sequence, Set, Union, cast

@@ -31,6 +32,14 @@ from datahub.configuration.common import ConfigurationError
 logger = logging.getLogger(__name__)


+class LookerQueryResponseFormat(Enum):
+    # result_format - Ref: https://cloud.google.com/looker/docs/reference/looker-api/latest/methods/Query/run_inline_query
+    JSON = "json"
+    SQL = (
+        "sql"  # Note: this does not execute the query; it only generates the SQL.
+    )
+
+
 class TransportOptionsConfig(ConfigModel):
     timeout: int
     headers: MutableMapping[str, str]
@@ -69,6 +78,7 @@ class LookerAPIStats(BaseModel):
     search_looks_calls: int = 0
     search_dashboards_calls: int = 0
     all_user_calls: int = 0
+    generate_sql_query_calls: int = 0


 class LookerAPI:
@@ -170,17 +180,40 @@ class LookerAPI:
         logger.debug(f"Executing query {write_query}")
         self.client_stats.query_calls += 1

-        response_json = self.client.run_inline_query(
-            result_format="json",
+        response = self.client.run_inline_query(
+            result_format=LookerQueryResponseFormat.JSON.value,
             body=write_query,
             transport_options=self.transport_options,
         )

+        data = json.loads(response)
+
         logger.debug("=================Response=================")
-        data = json.loads(response_json)
         logger.debug("Length of response: %d", len(data))
         return data

+    def generate_sql_query(
+        self, write_query: WriteQuery, use_cache: bool = False
+    ) -> str:
+        """
+        Generates a SQL query string for a given WriteQuery.
+
+        Note: This does not execute the query; it only generates the SQL.
+        """
+        logger.debug(f"Generating SQL query for {write_query}")
+        self.client_stats.generate_sql_query_calls += 1
+
+        response = self.client.run_inline_query(
+            result_format=LookerQueryResponseFormat.SQL.value,
+            body=write_query,
+            transport_options=self.transport_options,
+            cache=use_cache,
+        )
+
+        logger.debug("=================Response=================")
+        logger.debug("Length of SQL response: %d", len(response))
+        return str(response)
+
     def dashboard(self, dashboard_id: str, fields: Union[str, List[str]]) -> Dashboard:
         self.client_stats.dashboard_calls += 1
         return self.client.dashboard(
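For orientation, a minimal usage sketch of the new `generate_sql_query` helper. This is not part of the commit: the credentials are placeholders, and it assumes `LookerAPIConfig` accepts the fields shown.

from looker_sdk.sdk.api40.models import WriteQuery

from datahub.ingestion.source.looker.looker_lib_wrapper import (
    LookerAPI,
    LookerAPIConfig,
)

# Placeholder credentials (hypothetical values).
looker_api = LookerAPI(
    LookerAPIConfig(
        base_url="https://company.looker.com",
        client_id="...",
        client_secret="...",
    )
)

# result_format="sql" under the hood: Looker compiles the query to SQL
# but never runs it, so this is cheap relative to fetching data.
sql = looker_api.generate_sql_query(
    write_query=WriteQuery(
        model="test_model",
        view="purchases",  # explore name, not the view name
        fields=["purchases.pk", "users.email"],
        limit="1",
    ),
    use_cache=False,
)
print(sql)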
@@ -3,11 +3,11 @@ from typing import Dict, List, Optional

 from datahub.ingestion.source.looker.looker_common import LookerViewId, ViewFieldValue
 from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
+from datahub.ingestion.source.looker.looker_constant import NAME
 from datahub.ingestion.source.looker.looker_dataclasses import LookerModel
 from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader
 from datahub.ingestion.source.looker.lookml_config import (
     BASE_PROJECT_NAME,
-    NAME,
     LookMLSourceReport,
 )
@@ -12,12 +12,12 @@ from datahub.ingestion.source.looker.looker_constant import (
     DIMENSION_GROUPS,
     DIMENSIONS,
     MEASURES,
+    NAME,
 )
 from datahub.ingestion.source.looker.looker_dataclasses import LookerViewFile
 from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader
 from datahub.ingestion.source.looker.lookml_config import (
     DERIVED_VIEW_SUFFIX,
-    NAME,
     LookMLSourceReport,
 )
 from datahub.ingestion.source.looker.lookml_refinement import LookerRefinementResolver
@@ -28,11 +28,10 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
 from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.stats_collections import TopKDict, float_top_k_dict

 logger = logging.getLogger(__name__)

-NAME: str = "name"
-
 BASE_PROJECT_NAME = "__BASE"

 EXPLORE_FILE_EXTENSION = ".explore.lkml"
@@ -47,6 +46,9 @@ DERIVED_VIEW_PATTERN: str = r"\$\{([^}]*)\}"
 @dataclass
 class LookMLSourceReport(StaleEntityRemovalSourceReport):
     git_clone_latency: Optional[timedelta] = None
+    looker_query_api_latency_seconds: TopKDict[str, float] = dataclass_field(
+        default_factory=float_top_k_dict
+    )
     models_discovered: int = 0
     models_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
     views_discovered: int = 0
@@ -81,6 +83,11 @@ class LookMLSourceReport(StaleEntityRemovalSourceReport):
             self.api_stats = self._looker_api.compute_stats()
         return super().compute_stats()

+    def report_looker_query_api_latency(
+        self, view_urn: str, latency: timedelta
+    ) -> None:
+        self.looker_query_api_latency_seconds[view_urn] = latency.total_seconds()
+

 class LookMLSourceConfig(
     LookerCommonConfig, StatefulIngestionConfigBase, EnvConfigMixin
@@ -122,6 +129,16 @@ class LookMLSourceConfig(
         description="List of regex patterns for LookML views to include in the extraction.",
     )
     parse_table_names_from_sql: bool = Field(True, description="See note below.")
+    use_api_for_view_lineage: bool = Field(
+        False,
+        description="When enabled, uses the Looker API to get the SQL representation of views for lineage parsing instead of parsing LookML files directly. Requires 'api' configuration to be provided. "
+        "Coverage of regex-based lineage extraction is limited: it only supports the ${TABLE}.column_name syntax. See https://cloud.google.com/looker/docs/reference/param-field-sql#sql_for_dimensions to "
+        "understand the other substitutions and cross-references allowed in LookML.",
+    )
+    use_api_cache_for_view_lineage: bool = Field(
+        False,
+        description="When enabled, uses Looker API server-side caching for query execution. Requires 'api' configuration to be provided.",
+    )
     api: Optional[LookerAPIConfig] = None
     project_name: Optional[str] = Field(
         None,
@@ -239,6 +256,17 @@ class LookMLSourceConfig(
             )
         return values

+    @root_validator(skip_on_failure=True)
+    def check_api_provided_for_view_lineage(cls, values):
+        """Validate that api credentials are present when the Looker API is used for view column lineage."""
+        if not values.get("api") and values.get("use_api_for_view_lineage"):
+            raise ValueError(
+                "API credentials were not found. The LookML source requires api credentials "
+                "to use Looker APIs for view column lineage extraction. "
+                "Set `use_api_for_view_lineage` to False to skip using Looker APIs."
+            )
+        return values
+
     @validator("base_folder", always=True)
     def check_base_folder_if_not_provided(
         cls, v: Optional[pydantic.DirectoryPath], values: Dict[str, Any]
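As a quick illustration of the fail-fast behavior above, a minimal standalone sketch (the model below is a hypothetical stand-in for `LookMLSourceConfig`, assuming pydantic v1-style validators as used in the diff):

from typing import Optional

import pydantic
from pydantic import root_validator


class SketchConfig(pydantic.BaseModel):
    # Stand-ins for the real fields; LookerAPIConfig is reduced to a dict here.
    api: Optional[dict] = None
    use_api_for_view_lineage: bool = False

    @root_validator(skip_on_failure=True)
    def check_api_provided_for_view_lineage(cls, values):
        if not values.get("api") and values.get("use_api_for_view_lineage"):
            raise ValueError(
                "api credentials are required when use_api_for_view_lineage is enabled"
            )
        return values


# Rejected at config-parse time: API-based lineage requested without credentials.
try:
    SketchConfig(use_api_for_view_lineage=True)
except pydantic.ValidationError as e:
    print(e)

# Accepted: credentials supplied.
SketchConfig(api={"base_url": "https://company.looker.com"}, use_api_for_view_lineage=True)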
@@ -4,10 +4,10 @@ import logging
 from typing import ClassVar, Dict, List, Set

 from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
+from datahub.ingestion.source.looker.looker_constant import NAME
 from datahub.ingestion.source.looker.looker_dataclasses import LookerModel
 from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader
 from datahub.ingestion.source.looker.lookml_config import (
-    NAME,
     LookMLSourceConfig,
     LookMLSourceReport,
 )
@@ -142,6 +142,8 @@ class LookerView:
         ctx: PipelineContext,
         extract_col_level_lineage: bool = False,
         populate_sql_logic_in_descriptions: bool = False,
+        looker_client: Optional[LookerAPI] = None,
+        view_to_explore_map: Optional[Dict[str, str]] = None,
     ) -> Optional["LookerView"]:
         view_name = view_context.name()

@@ -160,6 +162,8 @@ class LookerView:
             config=config,
             ctx=ctx,
             reporter=reporter,
+            looker_client=looker_client,
+            view_to_explore_map=view_to_explore_map,
         )

         field_type_vs_raw_fields = OrderedDict(
@@ -705,6 +709,11 @@ class LookMLSource(StatefulIngestionSourceBase):
         # Value: Tuple(model file name, connection name)
         view_connection_map: Dict[str, Tuple[str, str]] = {}

+        # Map of view name to explore name for API-based view lineage.
+        # A view can be referenced by multiple explores; we only need one of the explores to use the Looker Query API.
+        # Key: view_name, Value: explore_name
+        view_to_explore_map: Dict[str, str] = {}
+
         # The ** means "this directory and all subdirectories", and hence should
         # include all the files we want.
         model_files = sorted(
@@ -759,37 +768,37 @@ class LookMLSource(StatefulIngestionSourceBase):
                 )
             )

-            if self.source_config.emit_reachable_views_only:
-                model_explores_map = {d["name"]: d for d in model.explores}
-                for explore_dict in model.explores:
-                    try:
-                        if LookerRefinementResolver.is_refinement(explore_dict["name"]):
-                            continue
-
-                        explore_dict = (
-                            looker_refinement_resolver.apply_explore_refinement(
-                                explore_dict
-                            )
-                        )
-                        explore: LookerExplore = LookerExplore.from_dict(
-                            model_name,
-                            explore_dict,
-                            model.resolved_includes,
-                            viewfile_loader,
-                            self.reporter,
-                            model_explores_map,
-                        )
-                        if explore.upstream_views:
-                            for view_name in explore.upstream_views:
-                                explore_reachable_views.add(view_name.include)
-                    except Exception as e:
-                        self.reporter.report_warning(
-                            title="Failed to process explores",
-                            message="Failed to process explore dictionary.",
-                            context=f"Explore Details: {explore_dict}",
-                            exc=e,
-                        )
-                        logger.debug("Failed to process explore", exc_info=e)
+            model_explores_map = {d["name"]: d for d in model.explores}
+            for explore_dict in model.explores:
+                try:
+                    if LookerRefinementResolver.is_refinement(explore_dict["name"]):
+                        continue
+
+                    explore_dict = looker_refinement_resolver.apply_explore_refinement(
+                        explore_dict
+                    )
+                    explore: LookerExplore = LookerExplore.from_dict(
+                        model_name,
+                        explore_dict,
+                        model.resolved_includes,
+                        viewfile_loader,
+                        self.reporter,
+                        model_explores_map,
+                    )
+                    if explore.upstream_views:
+                        for view_name in explore.upstream_views:
+                            if self.source_config.emit_reachable_views_only:
+                                explore_reachable_views.add(view_name.include)
+                            # Build view-to-explore mapping for API-based view lineage
+                            view_to_explore_map[view_name.include] = explore.name
+                except Exception as e:
+                    self.reporter.report_warning(
+                        title="Failed to process explores",
+                        message="Failed to process explore dictionary.",
+                        context=f"Explore Details: {explore_dict}",
+                        exc=e,
+                    )
+                    logger.debug("Failed to process explore", exc_info=e)

             processed_view_files = processed_view_map.setdefault(
                 model.connection, set()
@@ -878,6 +887,10 @@ class LookMLSource(StatefulIngestionSourceBase):
                                 populate_sql_logic_in_descriptions=self.source_config.populate_sql_logic_for_missing_descriptions,
                                 config=self.source_config,
                                 ctx=self.ctx,
+                                looker_client=self.looker_client,
+                                view_to_explore_map=view_to_explore_map
+                                if view_to_explore_map
+                                else None,
                             )
                         except Exception as e:
                             self.reporter.report_warning(
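To make the mapping concrete: for the sample long-tail-companions model added later in this diff, the map could plausibly end up as below. These are hypothetical contents; which explore is kept per view depends on iteration order, since later explores overwrite earlier entries, and any reachable explore works because the Query API only needs one per view.

# Hypothetical view_to_explore_map for the sample model in this diff.
view_to_explore_map = {
    "purchases": "customer_analysis",     # also reachable via the purchases explore
    "users": "customer_analysis",         # joined in purchases, user_metrics, customer_analysis
    "user_metrics": "customer_analysis",  # joined in purchases, user_metrics, customer_analysis
}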
@@ -1,18 +1,33 @@
 import logging
 import re
 from abc import ABC, abstractmethod
+from datetime import datetime
 from functools import lru_cache
 from typing import Dict, List, Optional

+from looker_sdk.sdk.api40.models import (
+    WriteQuery,
+)
+
 from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.looker.looker_common import (
     LookerExplore,
     LookerViewId,
     ViewField,
+    ViewFieldDimensionGroupType,
     ViewFieldType,
 )
 from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
+from datahub.ingestion.source.looker.looker_constant import (
+    NAME,
+    VIEW_FIELD_INTERVALS_ATTRIBUTE,
+    VIEW_FIELD_TIMEFRAMES_ATTRIBUTE,
+    VIEW_FIELD_TYPE_ATTRIBUTE,
+)
+from datahub.ingestion.source.looker.looker_lib_wrapper import (
+    LookerAPI,
+)
 from datahub.ingestion.source.looker.looker_view_id_cache import LookerViewIdCache
 from datahub.ingestion.source.looker.lookml_concept_context import (
     LookerFieldContext,
@@ -20,7 +35,6 @@ from datahub.ingestion.source.looker.lookml_concept_context import (
 )
 from datahub.ingestion.source.looker.lookml_config import (
     DERIVED_VIEW_SUFFIX,
-    NAME,
     LookMLSourceConfig,
     LookMLSourceReport,
 )
@@ -280,6 +294,447 @@ class AbstractViewUpstream(ABC):
         return upstream_column_refs


+class LookerQueryAPIBasedViewUpstream(AbstractViewUpstream):
+    """
+    Implements Looker view upstream lineage extraction using the Looker Query API.
+
+    This class leverages the Looker API to generate the fully resolved SQL for a Looker view by constructing a WriteQuery
+    that includes all dimensions, dimension groups, and measures. The SQL is then parsed to extract column-level lineage.
+    The Looker client is required for this class, as it is used to execute the WriteQuery and retrieve the SQL.
+
+    Other view upstream implementations use string parsing to extract lineage information from the SQL, which does not cover all the edge cases.
+    Limitations of string-based lineage extraction: Ref: https://cloud.google.com/looker/docs/reference/param-field-sql#sql_for_dimensions
+
+    Key features:
+    - Requires a Looker client (`looker_client`) to execute queries and retrieve SQL for the view.
+    - Requires a `view_to_explore_map` to map view names to their corresponding explore names.
+    - Field name translation is handled: Looker API field names are constructed as `<view_name>.<field_name>`, and helper
+      methods are provided to convert between Looker API field names and raw field names.
+    - SQL parsing is cached for efficiency, and the class is designed to gracefully fall back if the Looker Query API fails.
+    - All lineage extraction is based on the SQL returned by the Looker API, ensuring accurate and up-to-date lineage.
+
+    Why view_to_explore_map is required:
+    The Looker Query API expects the explore name (not the view name) as the "view" parameter in the WriteQuery.
+    In Looker, a view can be referenced by multiple explores; the API only needs one of those
+    explores to access the view's fields.
+
+    Example WriteQuery request (see `_execute_query` for details):
+        {
+            "model": "test_model",
+            "view": "users_explore",  # This is the explore name, not the view name
+            "fields": [
+                "users.email", "users.lifetime_purchase_count"
+            ],
+            "limit": "1",
+            "cache": true
+        }
+    The SQL response is then parsed to extract upstream tables and column-level lineage.
+
+    For further details, see the method-level docstrings, especially:
+      - `__get_spr`: SQL parsing and lineage extraction workflow
+      - `_get_sql_write_query`: WriteQuery construction and field enumeration
+      - `_execute_query`: Looker API invocation and SQL retrieval - this only generates the SQL query, it does not execute it
+      - Field name translation: `_get_looker_api_field_name` and `_get_field_name_from_looker_api_field_name`
+
+    Note: This class raises exceptions when SQL parsing or API calls fail, so that callers can fall back to
+    other implementations - custom regex-based parsing - if necessary.
+    """
+
+    def __init__(
+        self,
+        view_context: LookerViewContext,
+        looker_view_id_cache: LookerViewIdCache,
+        config: LookMLSourceConfig,
+        reporter: LookMLSourceReport,
+        ctx: PipelineContext,
+        looker_client: LookerAPI,
+        view_to_explore_map: Dict[str, str],
+    ):
+        super().__init__(view_context, looker_view_id_cache, config, reporter, ctx)
+        self.looker_client = looker_client
+        self.view_to_explore_map = view_to_explore_map
+        # Cache the SQL parsing results.
+        # We use maxsize=1 because a new class instance is created for each view. Ref: view_upstream.create_view_upstream
+        self._get_spr = lru_cache(maxsize=1)(self.__get_spr)
+        self._get_upstream_dataset_urn = lru_cache(maxsize=1)(
+            self.__get_upstream_dataset_urn
+        )
+
+        # Initialize the cache eagerly so that callers can fall back to other
+        # implementations if the Looker Query API fails.
+        self._get_spr()
+
+    def __get_spr(self) -> SqlParsingResult:
+        """
+        Retrieves the SQL parsing result for the current Looker view by:
+        1. Building a WriteQuery for the view.
+        2. Executing the query via the Looker API to get the SQL.
+        3. Parsing the SQL to extract lineage information.
+
+        Returns:
+            SqlParsingResult: The parsed lineage information.
+        Raises:
+            ValueError: If no SQL is found in the response.
+            ValueError: If no fields are found for the view.
+            ValueError: If the explore name is not found for the view.
+            ValueError: On errors parsing the SQL for upstream tables.
+            ValueError: On errors parsing the SQL for column lineage.
+        """
+        # Build the WriteQuery for the current view.
+        sql_query: WriteQuery = self._get_sql_write_query()
+
+        # Execute the query to get the SQL representation from Looker.
+        sql_response = self._execute_query(sql_query)
+
+        # Parse the SQL to extract lineage information.
+        spr = create_lineage_sql_parsed_result(
+            query=sql_response,
+            default_schema=self.view_context.view_connection.default_schema,
+            default_db=self.view_context.view_connection.default_db,
+            platform=self.view_context.view_connection.platform,
+            platform_instance=self.view_context.view_connection.platform_instance,
+            env=self.view_context.view_connection.platform_env or self.config.env,
+            graph=self.ctx.graph,
+        )
+
+        # Check for errors encountered during table extraction.
+        table_error = spr.debug_info.table_error
+        if table_error is not None:
+            self.reporter.report_warning(
+                title="Table Level Lineage Extraction Failed",
+                message="Error in parsing derived sql",
+                context=f"View-name: {self.view_context.name()}",
+                exc=table_error,
+            )
+            raise ValueError(
+                f"Error in parsing SQL for upstream tables: {table_error}"
+            )
+
+        # Check for errors encountered during column-lineage extraction.
+        column_error = spr.debug_info.column_error
+        if column_error is not None:
+            self.reporter.report_warning(
+                title="Column Level Lineage Extraction Failed",
+                message="Error in parsing derived sql",
+                context=f"View-name: {self.view_context.name()}",
+                exc=column_error,
+            )
+            raise ValueError(
+                f"Error in parsing SQL for column lineage: {column_error}"
+            )
+
+        return spr
+
+    def _get_time_dim_group_field_name(self, dim_group: dict) -> str:
+        """
+        Time dimension groups must be referenced by an individual timeframe suffix.
+        Example:
+            dimension_group: created {
+                type: time
+                timeframes: [date, week, month]
+                sql: ${TABLE}.created_at ;;
+            }
+        Used as: ${view_name.created_date}
+
+        created -> created_date, created_week, created_month
+        # Ref: https://cloud.google.com/looker/docs/reference/param-field-dimension-group#dimension_groups_must_be_referenced_by_their_individual_dimensions
+        """
+        dim_group_name = dim_group.get(NAME)
+        timeframes = dim_group.get(VIEW_FIELD_TIMEFRAMES_ATTRIBUTE)
+
+        # If timeframes is not included (a rare case), the dimension group includes all possible timeframes.
+        # We pick "raw".
+        suffix = timeframes[0] if timeframes else "raw"
+        return f"{dim_group_name}_{suffix}"
+
+    def _get_duration_dim_group_field_name(self, dim_group: dict) -> str:
+        """
+        Duration dimension groups must be referenced with the pluralized interval value as a prefix.
+        Example:
+            dimension_group: since_event {
+                type: duration
+                intervals: [hour, day, week, month, quarter, year]
+                sql_start: ${faa_event_date_raw} ;;
+                sql_end: CURRENT_TIMESTAMP();;
+            }
+        Used as: ${view_name.hours_since_event}
+
+        since_event -> hours_since_event, days_since_event, weeks_since_event, months_since_event, quarters_since_event, years_since_event
+        # Ref: https://cloud.google.com/looker/docs/reference/param-field-dimension-group#referencing_intervals_from_another_lookml_field
+        """
+        dim_group_name = dim_group.get(NAME)
+        intervals = dim_group.get(VIEW_FIELD_INTERVALS_ATTRIBUTE)
+
+        # If intervals is not included (a rare case), the dimension group includes all possible intervals.
+        # We pick "day" -> "days".
+        prefix = f"{intervals[0]}s" if intervals else "days"
+        return f"{prefix}_{dim_group_name}"
+
+    def _get_sql_write_query(self) -> WriteQuery:
+        """
+        Constructs a WriteQuery object to obtain the SQL representation of the current Looker view.
+
+        We need to list all the fields of the view to get its SQL representation - the fully resolved SQL for the view's dimensions and measures.
+
+        The method uses the view_to_explore_map to determine the correct explore name to use in the WriteQuery.
+        This is crucial because the Looker Query API expects the explore name (not the view name) as the "view" parameter.
+
+        Ref: https://cloud.google.com/looker/docs/reference/param-field-sql#sql_for_dimensions
+
+        Returns:
+            WriteQuery: The WriteQuery object for the view.
+
+        Raises:
+            ValueError: If the explore name is not found in the view_to_explore_map for the current view.
+            ValueError: If no fields are found for the view.
+        """
+
+        # Collect all dimension and measure fields for the view.
+        view_fields: List[str] = []
+        # Add fields in the format: <view_name>.<dimension_name> or <view_name>.<measure_name>
+        for field in self.view_context.dimensions() + self.view_context.measures():
+            field_name = field.get(NAME)
+            assert field_name  # Happy linter
+            view_fields.append(self._get_looker_api_field_name(field_name))
+
+        for dim_group in self.view_context.dimension_groups():
+            dim_group_type: ViewFieldDimensionGroupType = ViewFieldDimensionGroupType(
+                dim_group.get(VIEW_FIELD_TYPE_ATTRIBUTE)
+            )
+
+            if dim_group_type == ViewFieldDimensionGroupType.TIME:
+                view_fields.append(
+                    self._get_looker_api_field_name(
+                        self._get_time_dim_group_field_name(dim_group)
+                    )
+                )
+            elif dim_group_type == ViewFieldDimensionGroupType.DURATION:
+                view_fields.append(
+                    self._get_looker_api_field_name(
+                        self._get_duration_dim_group_field_name(dim_group)
+                    )
+                )
+
+        # Use the explore name from view_to_explore_map.
+        # explore_name is always present in view_to_explore_map because of the check in view_upstream.create_view_upstream.
+        explore_name = self.view_to_explore_map.get(self.view_context.name())
+        assert explore_name  # Happy linter
+
+        if not view_fields:
+            raise ValueError(
+                f"No fields found for view '{self.view_context.name()}'. Cannot proceed with Looker API for view lineage."
+            )
+
+        # Construct and return the WriteQuery object.
+        # The 'limit' is set to "1" because the query is only used to obtain SQL, not to fetch data.
+        return WriteQuery(
+            model=self.looker_view_id_cache.model_name,
+            view=explore_name,
+            fields=view_fields,
+            filters={},
+            limit="1",
+        )
+
+    def _execute_query(self, query: WriteQuery) -> str:
+        """
+        Executes a Looker SQL query using the Looker API and returns the SQL string.
+
+        Ref: https://cloud.google.com/looker/docs/reference/looker-api/latest/methods/Query/run_inline_query
+
+        Example Request:
+            WriteQuery:
+                {
+                    "model": "test_model",
+                    "view": "users",
+                    "fields": [
+                        "users.email", "users.lifetime_purchase_count"
+                    ],
+                    "limit": "1",
+                    "cache": true
+                }
+
+            Response:
+                "
+                SELECT
+                    users."EMAIL"  AS "users.email",
+                    COUNT(DISTINCT ( purchases."PK"  ) ) AS "users.lifetime_purchase_count"
+                FROM "ECOMMERCE"."USERS"  AS users
+                LEFT JOIN "ECOMMERCE"."PURCHASES"  AS purchases ON (users."PK") = (purchases."USER_FK")
+                GROUP BY
+                    1
+                ORDER BY
+                    2 DESC
+                FETCH NEXT 1 ROWS ONLY
+                "
+        Args:
+            query (WriteQuery): The Looker WriteQuery object to execute.
+
+        Returns:
+            str: The SQL string returned by the Looker API.
+
+        Raises:
+            ValueError: If the response is empty.
+        """
+
+        # Record the start time for latency measurement.
+        start_time = datetime.now()
+
+        # Execute the query using the Looker client.
+        sql_response = self.looker_client.generate_sql_query(
+            write_query=query, use_cache=self.config.use_api_cache_for_view_lineage
+        )
+
+        # Record the end time after query execution.
+        end_time = datetime.now()
+
+        # Attempt to get the LookerViewId for reporting.
+        looker_view_id: Optional[LookerViewId] = (
+            self.looker_view_id_cache.get_looker_view_id(
+                view_name=self.view_context.name(),
+                base_folder_path=self.view_context.base_folder_path,
+            )
+        )
+
+        # Report the query API latency if the view ID is available.
+        if looker_view_id is not None:
+            self.reporter.report_looker_query_api_latency(
+                looker_view_id.get_urn(self.config),
+                end_time - start_time,
+            )
+
+        # Validate the response.
+        if not sql_response:
+            raise ValueError(
+                f"No SQL found in response for view '{self.view_context.name()}'. Response: {sql_response}"
+            )
+
+        # Return the SQL string from the response.
+        return sql_response
+
+    def __get_upstream_dataset_urn(self) -> List[Urn]:
+        """
+        Extract upstream dataset URNs by parsing the SQL for the current view.
+
+        Returns:
+            List[Urn]: List of upstream dataset URNs.
+        """
+        # Get the SQL parsing result for the current view.
+        spr: SqlParsingResult = self._get_spr()
+
+        # Remove any 'hive.' prefix from upstream table URNs.
+        upstream_dataset_urns: List[str] = [
+            _drop_hive_dot(urn) for urn in spr.in_tables
+        ]
+
+        # Fix any derived view references present in the URNs.
+        upstream_dataset_urns = fix_derived_view_urn(
+            urns=upstream_dataset_urns,
+            looker_view_id_cache=self.looker_view_id_cache,
+            base_folder_path=self.view_context.base_folder_path,
+            config=self.config,
+        )
+
+        return upstream_dataset_urns
+
+    def _get_looker_api_field_name(self, field_name: str) -> str:
+        """
+        Translate a raw field name to the Looker API field name.
+
+        Example:
+            pk -> purchases.pk
+        """
+        return f"{self.view_context.name()}.{field_name}"
+
+    def _get_field_name_from_looker_api_field_name(
+        self, looker_api_field_name: str
+    ) -> str:
+        """
+        Translate a Looker API field name back to the raw field name.
+
+        Example:
+            purchases.pk -> pk
+        """
+        # Strip the leading "<view_name>." from the API field name, but only if it matches the current view name.
+        prefix = f"{self.view_context.name()}."
+        if looker_api_field_name.startswith(prefix):
+            return looker_api_field_name[len(prefix) :]
+        else:
+            # Don't raise an error; just return the original field name.
+            return looker_api_field_name
+
+    def get_upstream_dataset_urn(self) -> List[Urn]:
+        """Get upstream dataset URNs."""
+        return self._get_upstream_dataset_urn()
+
+    def get_upstream_column_ref(
+        self, field_context: LookerFieldContext
+    ) -> List[ColumnRef]:
+        """Return upstream column references for a given field."""
+        spr: SqlParsingResult = self._get_spr()
+        if not spr.column_lineage:
+            return []
+
+        field_type: Optional[ViewFieldDimensionGroupType] = None
+        field_name = field_context.name()
+        try:
+            # Check whether the field is a dimension group.
+            field_type = ViewFieldDimensionGroupType(
+                field_context.raw_field.get(VIEW_FIELD_TYPE_ATTRIBUTE)
+            )
+
+            if field_type == ViewFieldDimensionGroupType.TIME:
+                field_name = self._get_time_dim_group_field_name(
+                    field_context.raw_field
+                )
+            elif field_type == ViewFieldDimensionGroupType.DURATION:
+                field_name = self._get_duration_dim_group_field_name(
+                    field_context.raw_field
+                )
+
+        except Exception:
+            # Not a dimension group; no modification needed.
+            logger.debug(
+                f"view-name={self.view_context.name()}, field-name={field_name}, field-type={field_context.raw_field.get(VIEW_FIELD_TYPE_ATTRIBUTE)}"
+            )
+
+        field_api_name = self._get_looker_api_field_name(field_name).lower()
+
+        upstream_refs: List[ColumnRef] = []
+
+        for lineage in spr.column_lineage:
+            if lineage.downstream.column.lower() == field_api_name:
+                for upstream in lineage.upstreams:
+                    upstream_refs.append(
+                        ColumnRef(table=upstream.table, column=upstream.column)
+                    )
+
+        return _drop_hive_dot_from_upstream(upstream_refs)
+
+    def create_fields(self) -> List[ViewField]:
+        """Create ViewField objects from the SQL parsing result."""
+        spr: SqlParsingResult = self._get_spr()
+
+        if not spr.column_lineage:
+            return []
+
+        fields: List[ViewField] = []
+
+        for lineage in spr.column_lineage:
+            fields.append(
+                ViewField(
+                    name=self._get_field_name_from_looker_api_field_name(
+                        lineage.downstream.column
+                    ),
+                    label="",
+                    type=lineage.downstream.native_column_type or "unknown",
+                    description="",
+                    field_type=ViewFieldType.UNKNOWN,
+                    upstream_fields=_drop_hive_dot_from_upstream(lineage.upstreams),
+                )
+            )
+        return fields
+
+
 class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
     """
     Handle the case where upstream dataset is defined in derived_table.sql
@@ -674,7 +1129,45 @@ def create_view_upstream(
     config: LookMLSourceConfig,
     ctx: PipelineContext,
     reporter: LookMLSourceReport,
+    looker_client: Optional["LookerAPI"] = None,
+    view_to_explore_map: Optional[Dict[str, str]] = None,
 ) -> AbstractViewUpstream:
+    # The Looker client is required for LookerQueryAPIBasedViewUpstream; its presence is also enforced by config.use_api_for_view_lineage.
+    # view_to_explore_map is required to build the Looker Query API arguments.
+    # Only proceed if the view exists in view_to_explore_map, because views that are not reachable from an explore cannot be queried.
+    if (
+        config.use_api_for_view_lineage
+        and looker_client
+        and view_to_explore_map
+        and view_context.name() in view_to_explore_map
+    ):
+        try:
+            return LookerQueryAPIBasedViewUpstream(
+                view_context=view_context,
+                config=config,
+                reporter=reporter,
+                ctx=ctx,
+                looker_view_id_cache=looker_view_id_cache,
+                looker_client=looker_client,
+                view_to_explore_map=view_to_explore_map,
+            )
+        except Exception as e:
+            # Fall back to custom regex-based parsing - a best-effort approach.
+            reporter.report_warning(
+                title="Looker Query API based View Upstream Failed",
+                message="Error in getting upstream lineage for view using Looker Query API",
+                context=f"View-name: {view_context.name()}",
+                exc=e,
+            )
+    else:
+        logger.debug(
+            f"Skipping Looker Query API for view: {view_context.name()} because one or more conditions are not met: "
+            f"use_api_for_view_lineage={config.use_api_for_view_lineage}, "
+            f"looker_client={'set' if looker_client else 'not set'}, "
+            f"view_to_explore_map={'set' if view_to_explore_map else 'not set'}, "
+            f"view_in_view_to_explore_map={view_context.name() in view_to_explore_map if view_to_explore_map else False}"
+        )
+
     if view_context.is_regular_case():
         return RegularViewUpstream(
             view_context=view_context,
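The timeframe/interval naming rules above are easy to get wrong, so here is a minimal self-contained sketch of the translation the two dimension-group helpers perform. The standalone functions and sample dicts are illustrative, mirroring `_get_time_dim_group_field_name` and `_get_duration_dim_group_field_name`:

def time_dim_group_field_name(dim_group: dict) -> str:
    # Time dimension groups are referenced by a timeframe suffix;
    # fall back to "raw" when no timeframes are declared.
    timeframes = dim_group.get("timeframes")
    suffix = timeframes[0] if timeframes else "raw"
    return f"{dim_group['name']}_{suffix}"


def duration_dim_group_field_name(dim_group: dict) -> str:
    # Duration dimension groups are referenced by a pluralized interval prefix;
    # fall back to "days" when no intervals are declared.
    intervals = dim_group.get("intervals")
    prefix = f"{intervals[0]}s" if intervals else "days"
    return f"{prefix}_{dim_group['name']}"


# created + timeframes [date, week, month] -> created_date
assert time_dim_group_field_name(
    {"name": "created", "timeframes": ["date", "week", "month"]}
) == "created_date"

# since_event + intervals [hour, day, ...] -> hours_since_event
assert duration_dim_group_field_name(
    {"name": "since_event", "intervals": ["hour", "day"]}
) == "hours_since_event"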
@@ -590,7 +590,10 @@ def setup_mock_all_user(mocked_client):


 def side_effect_query_inline(
-    result_format: str, body: WriteQuery, transport_options: Optional[TransportOptions]
+    result_format: str,
+    body: WriteQuery,
+    transport_options: Optional[TransportOptions],
+    cache: Optional[bool] = None,
 ) -> str:
     query_type: looker_usage.QueryId
     if result_format == "sql":
										
											
(File diff suppressed because it is too large.)
							| @ -0,0 +1,83 @@ | |||||||
|  | # Define the database connection to be used for this model. | ||||||
|  | connection: "long-tail-companions-snowflake" | ||||||
|  | 
 | ||||||
|  | # include all the views | ||||||
|  | include: "/views/**/*.view.lkml" | ||||||
|  | 
 | ||||||
|  | # Datagroups define a caching policy for an Explore. To learn more, | ||||||
|  | # use the Quick Help panel on the right to see documentation. | ||||||
|  | 
 | ||||||
|  | datagroup: dev_project_default_datagroup { | ||||||
|  |   # sql_trigger: SELECT MAX(id) FROM etl_log;; | ||||||
|  |   max_cache_age: "1 hour" | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | persist_with: dev_project_default_datagroup | ||||||
|  | 
 | ||||||
|  | explore: purchases { | ||||||
|  |   join: users { | ||||||
|  |     type: left_outer | ||||||
|  |     sql_on: ${purchases.user_fk} = ${users.pk} ;; | ||||||
|  |     relationship: many_to_one | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   join: user_metrics{ | ||||||
|  |     type: left_outer | ||||||
|  |     sql_on:${user_metrics.user_id} = ${users.pk} ;; | ||||||
|  |     relationship: many_to_one | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | # explore: users{ | ||||||
|  | #   join: purchases { | ||||||
|  | #     type: left_outer | ||||||
|  | #     sql_on: ${users.pk} = ${purchases.user_fk} ;; | ||||||
|  | #     relationship: one_to_many | ||||||
|  | #   } | ||||||
|  | 
 | ||||||
|  | #   join: user_metrics{ | ||||||
|  | #     type: left_outer | ||||||
|  | #     sql_on:${user_metrics.user_id} = ${users.pk} ;; | ||||||
|  | #     relationship: many_to_one | ||||||
|  | #   } | ||||||
|  | # } | ||||||
|  | 
 | ||||||
|  | explore: user_metrics { | ||||||
|  |   description: "Analyze customer segments, lifetime value, and purchasing patterns" | ||||||
|  | 
 | ||||||
|  |   join: users { | ||||||
|  |     type: inner | ||||||
|  |     sql_on: ${user_metrics.user_id} = ${users.pk} ;; | ||||||
|  |     relationship: many_to_one | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   join: purchases{ | ||||||
|  |     type: left_outer | ||||||
|  |     sql_on: ${user_metrics.user_id} = ${purchases.user_fk} ;; | ||||||
|  |     relationship: one_to_many | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | explore: customer_analysis{ | ||||||
|  |   from:  users | ||||||
|  |   description: "Customer analysis and demographics" | ||||||
|  | 
 | ||||||
|  |   join: purchases { | ||||||
|  |     type: left_outer | ||||||
|  |     sql_on: ${customer_analysis.pk} = ${purchases.user_fk} ;; | ||||||
|  |     relationship: one_to_many | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   join: user_metrics { | ||||||
|  |     type: left_outer | ||||||
|  |     sql_on: ${user_metrics.user_id} = ${customer_analysis.pk} ;; | ||||||
|  |     relationship: one_to_one | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   join: users { | ||||||
|  |     type: inner | ||||||
|  |     sql_on: ${customer_analysis.pk} = ${users.pk} ;; | ||||||
|  |     relationship: one_to_one | ||||||
|  |   } | ||||||
|  | } | ||||||
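|  |  | ||||||
|  | # A minimal sketch (assumptions: the model name "dev_project" and SDK | ||||||
|  | # credentials in looker.ini or env vars) of how column lineage can be derived | ||||||
|  | # from the explores above: ask Looker to *compile* a query to SQL with | ||||||
|  | # result_format="sql", which generates the SQL without executing it. | ||||||
|  | import looker_sdk | ||||||
|  | from looker_sdk.sdk.api40.models import WriteQuery | ||||||
|  |  | ||||||
|  | sdk = looker_sdk.init40() | ||||||
|  | compiled_sql = sdk.run_inline_query( | ||||||
|  |     result_format="sql", | ||||||
|  |     body=WriteQuery( | ||||||
|  |         model="dev_project",  # assumed model name for this file | ||||||
|  |         view="purchases",     # the explore name, not the underlying view | ||||||
|  |         fields=["purchases.pk", "users.email"], | ||||||
|  |         limit="1", | ||||||
|  |     ), | ||||||
|  | ) | ||||||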
| @ -0,0 +1,93 @@ | |||||||
|  | # The name of this view in Looker is "Purchases" | ||||||
|  | view: purchases { | ||||||
|  |   # The sql_table_name parameter indicates the underlying database table | ||||||
|  |   # to be used for all fields in this view. | ||||||
|  |   sql_table_name: "ECOMMERCE"."PURCHASES" ;; | ||||||
|  | 
 | ||||||
|  |   # The pk dimension below is marked primary_key: yes, which allows this view | ||||||
|  |   # to be joined in an Explore. | ||||||
|  | 
 | ||||||
|  |   # Dates and timestamps can be represented in Looker using a dimension group of type: time. | ||||||
|  |   # Looker converts dates and timestamps to the specified timeframes within the dimension group. | ||||||
|  | 
 | ||||||
|  |   dimension_group: created { | ||||||
|  |     type: time | ||||||
|  |     timeframes: [raw, time, date, week, month, quarter, year] | ||||||
|  |     sql: ${TABLE}."CREATED_AT" ;; | ||||||
|  |   } | ||||||
|  |   # Here's what a typical dimension looks like in LookML. | ||||||
|  |   # A dimension is a groupable field that can be used to filter query results. | ||||||
|  |   # This dimension will be called "Pk" in Explore. | ||||||
|  | 
 | ||||||
|  |   dimension: pk { | ||||||
|  |     primary_key: yes | ||||||
|  |     type: number | ||||||
|  |     sql: ${TABLE}."PK" ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   dimension: purchase_amount { | ||||||
|  |     type: number | ||||||
|  |     sql: ${TABLE}."PURCHASE_AMOUNT" ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   dimension: status { | ||||||
|  |     type: string | ||||||
|  |     sql: ${TABLE}."STATUS" ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   dimension: tax_amount { | ||||||
|  |     type: number | ||||||
|  |     sql: ${TABLE}."TAX_AMOUNT" ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   dimension: total_amount { | ||||||
|  |     type: number | ||||||
|  |     sql: ${TABLE}."TOTAL_AMOUNT" ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   dimension_group: updated { | ||||||
|  |     type: time | ||||||
|  |     timeframes: [raw, time, date, week, month, quarter, year] | ||||||
|  |     sql: ${TABLE}."UPDATED_AT" ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   dimension: user_fk { | ||||||
|  |     type: number | ||||||
|  |     sql: ${TABLE}."USER_FK" ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   # Intra-view dimension reference (uses ${total_amount} from this view) | ||||||
|  |   dimension: is_expensive_purchase { | ||||||
|  |     type: yesno | ||||||
|  |     sql: ${total_amount} > 100 ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   # Intra-view nested dimension reference (drills on is_expensive_purchase) | ||||||
|  |   measure: num_of_expensive_purchases { | ||||||
|  |     type: count | ||||||
|  |     drill_fields: [is_expensive_purchase] | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   # Inter-view dimension reference (uses ${users.email} from the users view) | ||||||
|  |   dimension: user_email { | ||||||
|  |     type: string | ||||||
|  |     sql: ${users.email} ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   measure: average_purchase_value { | ||||||
|  |     type: average | ||||||
|  |     sql: ${total_amount} ;; | ||||||
|  |     value_format_name: usd | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   measure: count { | ||||||
|  |     type: count | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   dimension_group: purchase_age { | ||||||
|  |     type: duration | ||||||
|  |     sql_start: ${TABLE}."CREATED_AT" ;; | ||||||
|  |     sql_end: CURRENT_TIMESTAMP ;; | ||||||
|  |     intervals: [day, week, month, quarter, year] | ||||||
|  |   } | ||||||
|  | } | ||||||
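|  |  | ||||||
|  | # A minimal sketch (an assumption about naming, not this patch's code) of how | ||||||
|  | # lineage extraction can enumerate the API field names a view like the one | ||||||
|  | # above exposes: plain dimensions and measures become "<view>.<name>". | ||||||
|  | def api_field_names(view_name, dimensions, measures): | ||||||
|  |     return [f"{view_name}.{f['name']}" for f in [*dimensions, *measures]] | ||||||
|  |  | ||||||
|  | print(api_field_names("purchases", [{"name": "pk"}, {"name": "status"}], [{"name": "count"}])) | ||||||
|  | # -> ['purchases.pk', 'purchases.status', 'purchases.count'] | ||||||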
| @ -0,0 +1,43 @@ | |||||||
|  | view: user_metrics { | ||||||
|  |   derived_table: { | ||||||
|  |     sql: SELECT | ||||||
|  |            user_fk as user_id, | ||||||
|  |            COUNT(DISTINCT pk) as purchase_count, | ||||||
|  |            SUM(total_amount) as total_spent | ||||||
|  |          FROM ${purchases.SQL_TABLE_NAME} | ||||||
|  |          GROUP BY user_id ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   dimension: user_id { | ||||||
|  |     type: number | ||||||
|  |     sql: ${TABLE}.user_id ;; | ||||||
|  |     primary_key: yes | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   dimension: purchase_count { | ||||||
|  |     type: number | ||||||
|  |     sql: ${TABLE}.purchase_count ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   dimension: total_spent { | ||||||
|  |     type: number | ||||||
|  |     sql: ${TABLE}.total_spent ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   # Same-view dimension with conditional logic (buckets ${total_spent}) | ||||||
|  |   dimension: customer_segment { | ||||||
|  |     type: string | ||||||
|  |     sql: CASE | ||||||
|  |            WHEN ${total_spent} > 1000 THEN 'High Value' | ||||||
|  |            WHEN ${total_spent} > 500 THEN 'Medium Value' | ||||||
|  |            ELSE 'Low Value' | ||||||
|  |          END ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   # Cross-view measure with filtering (references ${users.pk}) | ||||||
|  |   measure: high_value_customer_count { | ||||||
|  |     type: count_distinct | ||||||
|  |     sql: CASE WHEN ${total_spent} > 1000 THEN ${users.pk} END ;; | ||||||
|  |     description: "Count of customers who spent over $1000" | ||||||
|  |   } | ||||||
|  | } | ||||||
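|  |  | ||||||
|  | # A minimal sketch (an assumption, not this patch's code) of the substitution | ||||||
|  | # Looker performs for ${purchases.SQL_TABLE_NAME} above: the referenced view's | ||||||
|  | # sql_table_name is spliced into the derived table's SQL, which is why the | ||||||
|  | # mocked generated SQL later in this patch contains a CTE over | ||||||
|  | # "ECOMMERCE"."PURCHASES". | ||||||
|  | import re | ||||||
|  |  | ||||||
|  | def resolve_sql_table_name(sql, table_names): | ||||||
|  |     return re.sub( | ||||||
|  |         r"\$\{(\w+)\.SQL_TABLE_NAME\}", | ||||||
|  |         lambda m: table_names[m.group(1)], | ||||||
|  |         sql, | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  | print(resolve_sql_table_name( | ||||||
|  |     "SELECT user_fk FROM ${purchases.SQL_TABLE_NAME}", | ||||||
|  |     {"purchases": '"ECOMMERCE"."PURCHASES"'}, | ||||||
|  | )) | ||||||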
| @ -0,0 +1,73 @@ | |||||||
|  | # The name of this view in Looker is "Users" | ||||||
|  | view: users { | ||||||
|  |   # The sql_table_name parameter indicates the underlying database table | ||||||
|  |   # to be used for all fields in this view. | ||||||
|  |   sql_table_name: "ECOMMERCE"."USERS" ;; | ||||||
|  | 
 | ||||||
|  |   # The pk dimension below is marked primary_key: yes, which allows this view | ||||||
|  |   # to be joined in an Explore. | ||||||
|  | 
 | ||||||
|  |   # Dates and timestamps can be represented in Looker using a dimension group of type: time. | ||||||
|  |   # Looker converts dates and timestamps to the specified timeframes within the dimension group. | ||||||
|  | 
 | ||||||
|  |   dimension_group: created { | ||||||
|  |     type: time | ||||||
|  |     timeframes: [raw, time, date, week, month, quarter, year] | ||||||
|  |     sql: ${TABLE}."CREATED_AT" ;; | ||||||
|  |   } | ||||||
|  |   # Here's what a typical dimension looks like in LookML. | ||||||
|  |   # A dimension is a groupable field that can be used to filter query results. | ||||||
|  |   # This dimension will be called "Email" in Explore. | ||||||
|  | 
 | ||||||
|  |   dimension: email { | ||||||
|  |     type: string | ||||||
|  |     sql: ${TABLE}."EMAIL" ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   dimension: pk { | ||||||
|  |     primary_key: yes | ||||||
|  |     type: number | ||||||
|  |     sql: ${TABLE}."PK" ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   dimension_group: updated { | ||||||
|  |     type: time | ||||||
|  |     timeframes: [raw, time, date, week, month, quarter, year] | ||||||
|  |     sql: ${TABLE}."UPDATED_AT" ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   measure: lifetime_purchase_count { | ||||||
|  |     type: count_distinct | ||||||
|  |     sql: ${purchases.pk} ;; | ||||||
|  |     description: "Total lifetime purchases count by user" | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   measure: lifetime_total_purchase_amount { | ||||||
|  |     type: sum | ||||||
|  |     sql: ${purchases.total_amount} ;; | ||||||
|  |     value_format_name: usd | ||||||
|  |     description: "Total lifetime revenue from purchases by user" | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   dimension: user_purchase_status { | ||||||
|  |     type: string | ||||||
|  |     sql: | ||||||
|  |       CASE | ||||||
|  |         WHEN ${user_metrics.purchase_count} <= 1 THEN 'First Purchase' | ||||||
|  |         WHEN ${user_metrics.purchase_count} <= 3 THEN 'Early Customer' | ||||||
|  |         WHEN ${user_metrics.purchase_count} <= 10 THEN 'Regular Customer' | ||||||
|  |         ELSE 'Loyal Customer' | ||||||
|  |       END ;; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   dimension_group: user_age { | ||||||
|  |     type: duration | ||||||
|  |     sql_start: ${TABLE}."CREATED_AT" ;; | ||||||
|  |     sql_end: CURRENT_TIMESTAMP ;; | ||||||
|  |     intervals: [day, week, month, quarter, year] | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   measure: count { | ||||||
|  |     type: count | ||||||
|  |   } | ||||||
|  | } | ||||||
| @ -1285,3 +1285,181 @@ def test_unreachable_views(pytestconfig): | |||||||
|         "The Looker view file was skipped because it may not be referenced by any models." |         "The Looker view file was skipped because it may not be referenced by any models." | ||||||
|         in [failure.message for failure in source.get_report().warnings] |         in [failure.message for failure in source.get_report().warnings] | ||||||
|     ) |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @freeze_time(FROZEN_TIME) | ||||||
|  | def test_col_lineage_looker_api_based(pytestconfig, tmp_path): | ||||||
|  |     test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" | ||||||
|  |     golden_path = test_resources_dir / "lkml_col_lineage_looker_api_based_golden.json" | ||||||
|  |     mce_out_file = "lkml_col_lineage_looker_api_based.json" | ||||||
|  |     recipe = { | ||||||
|  |         "run_id": "lookml-test", | ||||||
|  |         "source": { | ||||||
|  |             "type": "lookml", | ||||||
|  |             "config": { | ||||||
|  |                 "base_folder": f"{test_resources_dir}/lkml_col_lineage_sample", | ||||||
|  |                 "connection_to_platform_map": {"my_connection": "postgres"}, | ||||||
|  |                 "parse_table_names_from_sql": True, | ||||||
|  |                 "tag_measures_and_dimensions": False, | ||||||
|  |                 "project_name": "lkml_col_lineage_sample", | ||||||
|  |                 "use_api_for_view_lineage": True, | ||||||
|  |                 "api": { | ||||||
|  |                     "client_id": "fake_client_id", | ||||||
|  |                     "client_secret": "fake_secret", | ||||||
|  |                     "base_url": "fake_account.looker.com", | ||||||
|  |                 }, | ||||||
|  |             }, | ||||||
|  |         }, | ||||||
|  |         "sink": { | ||||||
|  |             "type": "file", | ||||||
|  |             "config": { | ||||||
|  |                 "filename": f"{tmp_path / mce_out_file}", | ||||||
|  |             }, | ||||||
|  |         }, | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     # Mock SQL responses based on the dump file | ||||||
|  |     mock_sql_responses = { | ||||||
|  |         # For user_metrics view (fields starting with user_metrics.) | ||||||
|  |         "user_metrics": """WITH user_metrics AS (SELECT | ||||||
|  |            user_fk as user_id, | ||||||
|  |            COUNT(DISTINCT pk) as purchase_count, | ||||||
|  |            SUM(total_amount) as total_spent | ||||||
|  |          FROM "ECOMMERCE"."PURCHASES" | ||||||
|  |          GROUP BY user_id ) | ||||||
|  | SELECT | ||||||
|  |     user_metrics.user_id  AS "user_metrics.user_id", | ||||||
|  |     user_metrics.purchase_count  AS "user_metrics.purchase_count", | ||||||
|  |     user_metrics.total_spent  AS "user_metrics.total_spent", | ||||||
|  |     CASE | ||||||
|  |            WHEN user_metrics.total_spent > 1000 THEN 'High Value' | ||||||
|  |            WHEN user_metrics.total_spent > 500 THEN 'Medium Value' | ||||||
|  |            ELSE 'Low Value' | ||||||
|  |          END  AS "user_metrics.customer_segment", | ||||||
|  |     COUNT(DISTINCT CASE WHEN  user_metrics.total_spent   > 1000 THEN ( users."PK"  ) END ) AS "user_metrics.high_value_customer_count" | ||||||
|  | FROM "ECOMMERCE"."USERS"  AS customer_analysis | ||||||
|  | LEFT JOIN user_metrics ON user_metrics.user_id = (customer_analysis."PK") | ||||||
|  | INNER JOIN "ECOMMERCE"."USERS"  AS users ON (customer_analysis."PK") = (users."PK") | ||||||
|  | GROUP BY | ||||||
|  |     1, | ||||||
|  |     2, | ||||||
|  |     3, | ||||||
|  |     4 | ||||||
|  | ORDER BY | ||||||
|  |     5 DESC | ||||||
|  | FETCH NEXT 1 ROWS ONLY""", | ||||||
|  |         # For users view (fields starting with users.) | ||||||
|  |         "users": """WITH user_metrics AS (SELECT | ||||||
|  |            user_fk as user_id, | ||||||
|  |            COUNT(DISTINCT pk) as purchase_count, | ||||||
|  |            SUM(total_amount) as total_spent | ||||||
|  |          FROM "ECOMMERCE"."PURCHASES" | ||||||
|  |          GROUP BY user_id ) | ||||||
|  | SELECT | ||||||
|  |     users."EMAIL"  AS "users.email", | ||||||
|  |     users."PK"  AS "users.pk", | ||||||
|  |     CASE | ||||||
|  |         WHEN user_metrics.purchase_count <= 1 THEN 'First Purchase' | ||||||
|  |         WHEN user_metrics.purchase_count <= 3 THEN 'Early Customer' | ||||||
|  |         WHEN user_metrics.purchase_count <= 10 THEN 'Regular Customer' | ||||||
|  |         ELSE 'Loyal Customer' | ||||||
|  |       END  AS "users.user_purchase_status", | ||||||
|  |     users."CREATED_AT"  AS "users.created_raw", | ||||||
|  |     users."UPDATED_AT"  AS "users.updated_raw", | ||||||
|  |     (TIMESTAMPDIFF(DAY, CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(users."CREATED_AT"  AS TIMESTAMP_NTZ)), CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(CURRENT_TIMESTAMP  AS TIMESTAMP_NTZ))) + CASE WHEN TIMESTAMPDIFF(SECOND, TO_DATE(CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(CURRENT_TIMESTAMP  AS TIMESTAMP_NTZ))), CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(CURRENT_TIMESTAMP  AS TIMESTAMP_NTZ))) = TIMESTAMPDIFF(SECOND, TO_DATE(CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(users."CREATED_AT"  AS TIMESTAMP_NTZ))), CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(users."CREATED_AT"  AS TIMESTAMP_NTZ))) THEN 0 WHEN TIMESTAMPDIFF(SECOND, TO_DATE(CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(CURRENT_TIMESTAMP  AS TIMESTAMP_NTZ))), CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(CURRENT_TIMESTAMP  AS TIMESTAMP_NTZ))) < TIMESTAMPDIFF(SECOND, TO_DATE(CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(users."CREATED_AT"  AS TIMESTAMP_NTZ))), CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(users."CREATED_AT"  AS TIMESTAMP_NTZ))) THEN CASE WHEN CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(users."CREATED_AT"  AS TIMESTAMP_NTZ)) < CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(CURRENT_TIMESTAMP  AS TIMESTAMP_NTZ)) THEN -1 ELSE 0 END ELSE CASE WHEN CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(users."CREATED_AT"  AS TIMESTAMP_NTZ)) > CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(CURRENT_TIMESTAMP  AS TIMESTAMP_NTZ)) THEN 1 ELSE 0 END END) AS "users.days_user_age", | ||||||
|  |     COUNT(DISTINCT ( purchases."PK"  ) ) AS "users.lifetime_purchase_count", | ||||||
|  |     COALESCE(CAST( ( SUM(DISTINCT (CAST(FLOOR(COALESCE( ( purchases."TOTAL_AMOUNT" ) ,0)*(1000000*1.0)) AS DECIMAL(38,0))) + (TO_NUMBER(MD5( users."PK"  ), 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX') % 1.0e27)::NUMERIC(38, 0) ) - SUM(DISTINCT (TO_NUMBER(MD5( users."PK"  ), 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX') % 1.0e27)::NUMERIC(38, 0)) )  AS DOUBLE PRECISION) / CAST((1000000*1.0) AS DOUBLE PRECISION), 0) AS "users.lifetime_total_purchase_amount", | ||||||
|  |     COUNT(DISTINCT users."PK" ) AS "users.count" | ||||||
|  | FROM "ECOMMERCE"."USERS"  AS customer_analysis | ||||||
|  | LEFT JOIN "ECOMMERCE"."PURCHASES"  AS purchases ON (customer_analysis."PK") = (purchases."USER_FK") | ||||||
|  | LEFT JOIN user_metrics ON user_metrics.user_id = (customer_analysis."PK") | ||||||
|  | INNER JOIN "ECOMMERCE"."USERS"  AS users ON (customer_analysis."PK") = (users."PK") | ||||||
|  | GROUP BY | ||||||
|  |     1, | ||||||
|  |     2, | ||||||
|  |     3, | ||||||
|  |     4, | ||||||
|  |     5, | ||||||
|  |     6 | ||||||
|  | ORDER BY | ||||||
|  |     7 DESC | ||||||
|  | FETCH NEXT 1 ROWS ONLY""", | ||||||
|  |         # For purchases view (fields starting with purchases.) | ||||||
|  |         "purchases": """SELECT | ||||||
|  |     purchases."PK"  AS "purchases.pk", | ||||||
|  |     purchases."PURCHASE_AMOUNT"  AS "purchases.purchase_amount", | ||||||
|  |     purchases."STATUS"  AS "purchases.status", | ||||||
|  |     purchases."TAX_AMOUNT"  AS "purchases.tax_amount", | ||||||
|  |     purchases."TOTAL_AMOUNT"  AS "purchases.total_amount", | ||||||
|  |     purchases."USER_FK"  AS "purchases.user_fk", | ||||||
|  |         (CASE WHEN (purchases."TOTAL_AMOUNT") > 100  THEN 'Yes' ELSE 'No' END) AS "purchases.is_expensive_purchase", | ||||||
|  |     (users."EMAIL")  AS "purchases.user_email", | ||||||
|  |     purchases."CREATED_AT"  AS "purchases.created_raw", | ||||||
|  |     purchases."UPDATED_AT"  AS "purchases.updated_raw", | ||||||
|  |     (TIMESTAMPDIFF(DAY, CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(purchases."CREATED_AT"  AS TIMESTAMP_NTZ)), CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(CURRENT_TIMESTAMP  AS TIMESTAMP_NTZ))) + CASE WHEN TIMESTAMPDIFF(SECOND, TO_DATE(CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(CURRENT_TIMESTAMP  AS TIMESTAMP_NTZ))), CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(CURRENT_TIMESTAMP  AS TIMESTAMP_NTZ))) = TIMESTAMPDIFF(SECOND, TO_DATE(CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(purchases."CREATED_AT"  AS TIMESTAMP_NTZ))), CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(purchases."CREATED_AT"  AS TIMESTAMP_NTZ))) THEN 0 WHEN TIMESTAMPDIFF(SECOND, TO_DATE(CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(CURRENT_TIMESTAMP  AS TIMESTAMP_NTZ))), CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(CURRENT_TIMESTAMP  AS TIMESTAMP_NTZ))) < TIMESTAMPDIFF(SECOND, TO_DATE(CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(purchases."CREATED_AT"  AS TIMESTAMP_NTZ))), CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(purchases."CREATED_AT"  AS TIMESTAMP_NTZ))) THEN CASE WHEN CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(purchases."CREATED_AT"  AS TIMESTAMP_NTZ)) < CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(CURRENT_TIMESTAMP  AS TIMESTAMP_NTZ)) THEN -1 ELSE 0 END ELSE CASE WHEN CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(purchases."CREATED_AT"  AS TIMESTAMP_NTZ)) > CONVERT_TIMEZONE('UTC', 'America/Los_Angeles', CAST(CURRENT_TIMESTAMP  AS TIMESTAMP_NTZ)) THEN 1 ELSE 0 END END) AS "purchases.days_purchase_age", | ||||||
|  |     COUNT(purchases."PK" ) AS "purchases.num_of_expensive_purchases", | ||||||
|  |     AVG(( purchases."TOTAL_AMOUNT"  ) ) AS "purchases.average_purchase_value", | ||||||
|  |     COUNT(purchases."PK" ) AS "purchases.count" | ||||||
|  | FROM "ECOMMERCE"."USERS"  AS customer_analysis | ||||||
|  | LEFT JOIN "ECOMMERCE"."PURCHASES"  AS purchases ON (customer_analysis."PK") = (purchases."USER_FK") | ||||||
|  | INNER JOIN "ECOMMERCE"."USERS"  AS users ON (customer_analysis."PK") = (users."PK") | ||||||
|  | GROUP BY | ||||||
|  |     1, | ||||||
|  |     2, | ||||||
|  |     3, | ||||||
|  |     4, | ||||||
|  |     5, | ||||||
|  |     6, | ||||||
|  |     7, | ||||||
|  |     8, | ||||||
|  |     9, | ||||||
|  |     10, | ||||||
|  |     11 | ||||||
|  | ORDER BY | ||||||
|  |     12 DESC | ||||||
|  | FETCH NEXT 1 ROWS ONLY""", | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     def mock_run_inline_query( | ||||||
|  |         body, result_format=None, transport_options=None, cache=None | ||||||
|  |     ): | ||||||
|  |         # Determine which view is being queried based on the fields | ||||||
|  |         write_query = body | ||||||
|  |         if write_query.fields and any( | ||||||
|  |             field.startswith("user_metrics.") for field in write_query.fields | ||||||
|  |         ): | ||||||
|  |             return mock_sql_responses["user_metrics"] | ||||||
|  |         elif write_query.fields and any( | ||||||
|  |             field.startswith("users.") for field in write_query.fields | ||||||
|  |         ): | ||||||
|  |             return mock_sql_responses["users"] | ||||||
|  |         elif write_query.fields and any( | ||||||
|  |             field.startswith("purchases.") for field in write_query.fields | ||||||
|  |         ): | ||||||
|  |             return mock_sql_responses["purchases"] | ||||||
|  |         else: | ||||||
|  |             # Default fallback | ||||||
|  |             return mock_sql_responses["user_metrics"] | ||||||
|  | 
 | ||||||
|  |     mock_connection = DBConnection( | ||||||
|  |         dialect_name="postgres", | ||||||
|  |         database="my_database", | ||||||
|  |     ) | ||||||
|  |     mock_model = mock.MagicMock(project_name="lkml_col_lineage_sample") | ||||||
|  | 
 | ||||||
|  |     mocked_client = mock.MagicMock() | ||||||
|  |     mocked_client.run_inline_query.side_effect = mock_run_inline_query | ||||||
|  |     mocked_client.connection.return_value = mock_connection | ||||||
|  |     mocked_client.lookml_model.return_value = mock_model | ||||||
|  | 
 | ||||||
|  |     with mock.patch("looker_sdk.init40", return_value=mocked_client): | ||||||
|  |         pipeline = Pipeline.create(recipe) | ||||||
|  |         pipeline.run() | ||||||
|  |         pipeline.pretty_print_summary() | ||||||
|  |         pipeline.raise_from_status(raise_warnings=True) | ||||||
|  | 
 | ||||||
|  |     mce_helpers.check_golden_file( | ||||||
|  |         pytestconfig, | ||||||
|  |         output_path=tmp_path / mce_out_file, | ||||||
|  |         golden_path=golden_path, | ||||||
|  |     ) | ||||||
|  | |||||||
							
								
								
									
metadata-ingestion/tests/unit/lookml/__init__.py (new, empty file)
							| @ -0,0 +1,518 @@ | |||||||
|  | from unittest.mock import MagicMock, patch | ||||||
|  | 
 | ||||||
|  | import pytest | ||||||
|  | from looker_sdk.sdk.api40.models import WriteQuery | ||||||
|  | 
 | ||||||
|  | from datahub.ingestion.api.common import PipelineContext | ||||||
|  | from datahub.ingestion.source.looker.looker_common import ( | ||||||
|  |     LookerViewId, | ||||||
|  |     ViewField, | ||||||
|  |     ViewFieldType, | ||||||
|  | ) | ||||||
|  | from datahub.ingestion.source.looker.looker_constant import ( | ||||||
|  |     NAME, | ||||||
|  |     VIEW_FIELD_INTERVALS_ATTRIBUTE, | ||||||
|  |     VIEW_FIELD_TIMEFRAMES_ATTRIBUTE, | ||||||
|  |     VIEW_FIELD_TYPE_ATTRIBUTE, | ||||||
|  | ) | ||||||
|  | from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI | ||||||
|  | from datahub.ingestion.source.looker.looker_view_id_cache import LookerViewIdCache | ||||||
|  | from datahub.ingestion.source.looker.lookml_concept_context import ( | ||||||
|  |     LookerFieldContext, | ||||||
|  |     LookerViewContext, | ||||||
|  | ) | ||||||
|  | from datahub.ingestion.source.looker.lookml_config import ( | ||||||
|  |     LookMLSourceConfig, | ||||||
|  |     LookMLSourceReport, | ||||||
|  | ) | ||||||
|  | from datahub.ingestion.source.looker.view_upstream import ( | ||||||
|  |     LookerQueryAPIBasedViewUpstream, | ||||||
|  | ) | ||||||
|  | from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def create_mock_sql_parsing_result( | ||||||
|  |     table_error=None, column_error=None, in_tables=None, column_lineage=None | ||||||
|  | ): | ||||||
|  |     """Helper function to create a properly mocked SqlParsingResult.""" | ||||||
|  |     mock_spr = MagicMock(spec=SqlParsingResult) | ||||||
|  |     mock_debug_info = MagicMock() | ||||||
|  |     mock_debug_info.table_error = table_error | ||||||
|  |     mock_debug_info.column_error = column_error | ||||||
|  |     mock_spr.debug_info = mock_debug_info | ||||||
|  |     mock_spr.in_tables = in_tables or [] | ||||||
|  |     mock_spr.column_lineage = column_lineage or [] | ||||||
|  |     return mock_spr | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class TestLookMLAPIBasedViewUpstream: | ||||||
|  |     """Test suite for LookerQueryAPIBasedViewUpstream functionality.""" | ||||||
|  | 
 | ||||||
|  |     @pytest.fixture | ||||||
|  |     def mock_view_context(self): | ||||||
|  |         """Create a mock LookerViewContext for testing.""" | ||||||
|  |         view_context = MagicMock(spec=LookerViewContext) | ||||||
|  |         view_context.name.return_value = "test_view" | ||||||
|  |         view_context.base_folder_path = "/test/path" | ||||||
|  |         view_context.dimensions.return_value = [ | ||||||
|  |             {NAME: "user_id", "type": "string"}, | ||||||
|  |             {NAME: "email", "type": "string"}, | ||||||
|  |         ] | ||||||
|  |         view_context.measures.return_value = [ | ||||||
|  |             {NAME: "total_users", "type": "number"}, | ||||||
|  |         ] | ||||||
|  |         view_context.dimension_groups.return_value = [] | ||||||
|  | 
 | ||||||
|  |         # Mock view_connection | ||||||
|  |         mock_connection = MagicMock() | ||||||
|  |         mock_connection.default_schema = "public" | ||||||
|  |         mock_connection.default_db = "test_db" | ||||||
|  |         mock_connection.platform = "postgres" | ||||||
|  |         mock_connection.platform_instance = None | ||||||
|  |         mock_connection.platform_env = None | ||||||
|  |         view_context.view_connection = mock_connection | ||||||
|  | 
 | ||||||
|  |         return view_context | ||||||
|  | 
 | ||||||
|  |     @pytest.fixture | ||||||
|  |     def mock_looker_view_id_cache(self): | ||||||
|  |         """Create a mock LookerViewIdCache for testing.""" | ||||||
|  |         cache = MagicMock(spec=LookerViewIdCache) | ||||||
|  |         cache.model_name = "test_model" | ||||||
|  |         return cache | ||||||
|  | 
 | ||||||
|  |     @pytest.fixture | ||||||
|  |     def mock_config(self): | ||||||
|  |         """Create a mock LookMLSourceConfig for testing.""" | ||||||
|  |         config = MagicMock(spec=LookMLSourceConfig) | ||||||
|  |         config.use_api_for_view_lineage = True | ||||||
|  |         config.use_api_cache_for_view_lineage = False | ||||||
|  |         config.env = "PROD" | ||||||
|  |         return config | ||||||
|  | 
 | ||||||
|  |     @pytest.fixture | ||||||
|  |     def mock_reporter(self): | ||||||
|  |         """Create a mock LookMLSourceReport for testing.""" | ||||||
|  |         return MagicMock(spec=LookMLSourceReport) | ||||||
|  | 
 | ||||||
|  |     @pytest.fixture | ||||||
|  |     def mock_ctx(self): | ||||||
|  |         """Create a mock PipelineContext for testing.""" | ||||||
|  |         ctx = MagicMock(spec=PipelineContext) | ||||||
|  |         ctx.graph = MagicMock() | ||||||
|  |         return ctx | ||||||
|  | 
 | ||||||
|  |     @pytest.fixture | ||||||
|  |     def mock_looker_client(self): | ||||||
|  |         """Create a mock LookerAPI client for testing.""" | ||||||
|  |         client = MagicMock(spec=LookerAPI) | ||||||
|  |         return client | ||||||
|  | 
 | ||||||
|  |     @pytest.fixture | ||||||
|  |     def view_to_explore_map(self): | ||||||
|  |         """Create a view to explore mapping for testing.""" | ||||||
|  |         return {"test_view": "test_explore"} | ||||||
|  | 
 | ||||||
|  |     @pytest.fixture | ||||||
|  |     def upstream_instance( | ||||||
|  |         self, | ||||||
|  |         mock_view_context, | ||||||
|  |         mock_looker_view_id_cache, | ||||||
|  |         mock_config, | ||||||
|  |         mock_reporter, | ||||||
|  |         mock_ctx, | ||||||
|  |         mock_looker_client, | ||||||
|  |         view_to_explore_map, | ||||||
|  |     ): | ||||||
|  |         """Create a LookerQueryAPIBasedViewUpstream instance for testing.""" | ||||||
|  |         # Mock the API response to prevent initialization errors | ||||||
|  |         mock_looker_client.generate_sql_query.return_value = [ | ||||||
|  |             {"sql": "SELECT test_view.user_id FROM test_table"} | ||||||
|  |         ] | ||||||
|  | 
 | ||||||
|  |         # Mock the view ID cache | ||||||
|  |         mock_view_id = MagicMock(spec=LookerViewId) | ||||||
|  |         mock_view_id.get_urn.return_value = "urn:li:dataset:test" | ||||||
|  |         mock_looker_view_id_cache.get_looker_view_id.return_value = mock_view_id | ||||||
|  | 
 | ||||||
|  |         with patch( | ||||||
|  |             "datahub.ingestion.source.looker.view_upstream.create_lineage_sql_parsed_result" | ||||||
|  |         ) as mock_create_lineage: | ||||||
|  |             # Mock successful SQL parsing | ||||||
|  |             mock_spr = create_mock_sql_parsing_result() | ||||||
|  |             mock_create_lineage.return_value = mock_spr | ||||||
|  | 
 | ||||||
|  |             return LookerQueryAPIBasedViewUpstream( | ||||||
|  |                 view_context=mock_view_context, | ||||||
|  |                 looker_view_id_cache=mock_looker_view_id_cache, | ||||||
|  |                 config=mock_config, | ||||||
|  |                 reporter=mock_reporter, | ||||||
|  |                 ctx=mock_ctx, | ||||||
|  |                 looker_client=mock_looker_client, | ||||||
|  |                 view_to_explore_map=view_to_explore_map, | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |     def test_time_dimension_group_handling(self, upstream_instance): | ||||||
|  |         """Test that time dimension groups are handled correctly.""" | ||||||
|  |         dim_group = { | ||||||
|  |             NAME: "created", | ||||||
|  |             VIEW_FIELD_TYPE_ATTRIBUTE: "time", | ||||||
|  |             VIEW_FIELD_TIMEFRAMES_ATTRIBUTE: ["date", "week", "month"], | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         result = upstream_instance._get_time_dim_group_field_name(dim_group) | ||||||
|  |         assert result == "created_date" | ||||||
|  | 
 | ||||||
|  |     def test_time_dimension_group_without_timeframes(self, upstream_instance): | ||||||
|  |         """Test time dimension group handling when timeframes are not specified.""" | ||||||
|  |         dim_group = { | ||||||
|  |             NAME: "created", | ||||||
|  |             VIEW_FIELD_TYPE_ATTRIBUTE: "time", | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         result = upstream_instance._get_time_dim_group_field_name(dim_group) | ||||||
|  |         assert result == "created_raw" | ||||||
|  | 
 | ||||||
|  |     def test_duration_dimension_group_handling(self, upstream_instance): | ||||||
|  |         """Test that duration dimension groups are handled correctly.""" | ||||||
|  |         dim_group = { | ||||||
|  |             NAME: "since_event", | ||||||
|  |             VIEW_FIELD_TYPE_ATTRIBUTE: "duration", | ||||||
|  |             VIEW_FIELD_INTERVALS_ATTRIBUTE: ["hour", "day", "week"], | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         result = upstream_instance._get_duration_dim_group_field_name(dim_group) | ||||||
|  |         assert result == "hours_since_event" | ||||||
|  | 
 | ||||||
|  |     def test_duration_dimension_group_without_intervals(self, upstream_instance): | ||||||
|  |         """Test duration dimension group handling when intervals are not specified.""" | ||||||
|  |         dim_group = { | ||||||
|  |             NAME: "since_event", | ||||||
|  |             VIEW_FIELD_TYPE_ATTRIBUTE: "duration", | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         result = upstream_instance._get_duration_dim_group_field_name(dim_group) | ||||||
|  |         assert result == "days_since_event" | ||||||
|  | 
 | ||||||
|  |     def test_get_looker_api_field_name(self, upstream_instance): | ||||||
|  |         """Test field name translation to Looker API format.""" | ||||||
|  |         result = upstream_instance._get_looker_api_field_name("user_id") | ||||||
|  |         assert result == "test_view.user_id" | ||||||
|  | 
 | ||||||
|  |     def test_get_field_name_from_looker_api_field_name(self, upstream_instance): | ||||||
|  |         """Test field name translation from Looker API format.""" | ||||||
|  |         result = upstream_instance._get_field_name_from_looker_api_field_name( | ||||||
|  |             "test_view.user_id" | ||||||
|  |         ) | ||||||
|  |         assert result == "user_id" | ||||||
|  | 
 | ||||||
|  |     def test_get_field_name_from_looker_api_field_name_mismatch( | ||||||
|  |         self, upstream_instance | ||||||
|  |     ): | ||||||
|  |         """Test field name translation when view name doesn't match.""" | ||||||
|  |         result = upstream_instance._get_field_name_from_looker_api_field_name( | ||||||
|  |             "other_view.user_id" | ||||||
|  |         ) | ||||||
|  |         assert result == "other_view.user_id" | ||||||
|  | 
 | ||||||
|  |     def test_get_sql_write_query_success(self, upstream_instance): | ||||||
|  |         """Test successful WriteQuery construction.""" | ||||||
|  |         query = upstream_instance._get_sql_write_query() | ||||||
|  | 
 | ||||||
|  |         assert isinstance(query, WriteQuery) | ||||||
|  |         assert query.model == "test_model" | ||||||
|  |         assert query.view == "test_explore" | ||||||
|  |         assert query.limit == "1" | ||||||
|  |         assert query.fields is not None | ||||||
|  |         assert "test_view.user_id" in query.fields | ||||||
|  |         assert "test_view.email" in query.fields | ||||||
|  |         assert "test_view.total_users" in query.fields | ||||||
|  | 
 | ||||||
|  |     def test_get_sql_write_query_no_fields(self, upstream_instance, mock_view_context): | ||||||
|  |         """Test WriteQuery construction when no fields are found.""" | ||||||
|  |         mock_view_context.dimensions.return_value = [] | ||||||
|  |         mock_view_context.measures.return_value = [] | ||||||
|  |         mock_view_context.dimension_groups.return_value = [] | ||||||
|  | 
 | ||||||
|  |         with pytest.raises(ValueError, match="No fields found for view"): | ||||||
|  |             upstream_instance._get_sql_write_query() | ||||||
|  | 
 | ||||||
|  |     @patch( | ||||||
|  |         "datahub.ingestion.source.looker.view_upstream.create_lineage_sql_parsed_result" | ||||||
|  |     ) | ||||||
|  |     def test_execute_query_success( | ||||||
|  |         self, mock_create_lineage, upstream_instance, mock_looker_client | ||||||
|  |     ): | ||||||
|  |         """Test successful query execution.""" | ||||||
|  |         # Mock the SQL response | ||||||
|  |         mock_sql_response = "SELECT test_view.user_id FROM test_table" | ||||||
|  |         mock_looker_client.generate_sql_query.return_value = mock_sql_response | ||||||
|  | 
 | ||||||
|  |         # Mock the SQL parsing result | ||||||
|  |         mock_spr = create_mock_sql_parsing_result( | ||||||
|  |             in_tables=["urn:li:dataset:(urn:li:dataPlatform:postgres,test_table,PROD)"] | ||||||
|  |         ) | ||||||
|  |         mock_create_lineage.return_value = mock_spr | ||||||
|  | 
 | ||||||
|  |         result = upstream_instance._execute_query(MagicMock(spec=WriteQuery)) | ||||||
|  |         assert result == "SELECT test_view.user_id FROM test_table" | ||||||
|  | 
 | ||||||
|  |     def test_execute_query_no_sql_response(self, upstream_instance, mock_looker_client): | ||||||
|  |         """Test query execution when no SQL is returned.""" | ||||||
|  |         mock_looker_client.generate_sql_query.return_value = [] | ||||||
|  | 
 | ||||||
|  |         with pytest.raises(ValueError, match="No SQL found in response"): | ||||||
|  |             upstream_instance._execute_query(MagicMock(spec=WriteQuery)) | ||||||
|  | 
 | ||||||
|  |     def test_execute_query_invalid_response_format( | ||||||
|  |         self, upstream_instance, mock_looker_client | ||||||
|  |     ): | ||||||
|  |         """Test query execution with invalid response format.""" | ||||||
|  |         mock_looker_client.generate_sql_query.return_value = None | ||||||
|  | 
 | ||||||
|  |         with pytest.raises(ValueError, match="No SQL found in response"): | ||||||
|  |             upstream_instance._execute_query(MagicMock(spec=WriteQuery)) | ||||||
|  | 
 | ||||||
|  |     @patch( | ||||||
|  |         "datahub.ingestion.source.looker.view_upstream.create_lineage_sql_parsed_result" | ||||||
|  |     ) | ||||||
|  |     def test_get_spr_table_error( | ||||||
|  |         self, mock_create_lineage, upstream_instance, mock_looker_client | ||||||
|  |     ): | ||||||
|  |         """Test SQL parsing result when table extraction fails.""" | ||||||
|  |         # Clear the cache to force re-execution | ||||||
|  |         upstream_instance._get_spr.cache_clear() | ||||||
|  | 
 | ||||||
|  |         # Mock the SQL response | ||||||
|  |         mock_sql_response = [{"sql": "SELECT * FROM test_table"}] | ||||||
|  |         mock_looker_client.generate_sql_query.return_value = mock_sql_response | ||||||
|  | 
 | ||||||
|  |         # Mock the SQL parsing result with table error | ||||||
|  |         mock_spr = create_mock_sql_parsing_result( | ||||||
|  |             table_error=Exception("Table parsing failed") | ||||||
|  |         ) | ||||||
|  |         mock_create_lineage.return_value = mock_spr | ||||||
|  | 
 | ||||||
|  |         with pytest.raises( | ||||||
|  |             ValueError, match="Error in parsing SQL for upstream tables" | ||||||
|  |         ): | ||||||
|  |             upstream_instance._get_spr() | ||||||
|  | 
 | ||||||
|  |     @patch( | ||||||
|  |         "datahub.ingestion.source.looker.view_upstream.create_lineage_sql_parsed_result" | ||||||
|  |     ) | ||||||
|  |     def test_get_spr_column_error( | ||||||
|  |         self, mock_create_lineage, upstream_instance, mock_looker_client | ||||||
|  |     ): | ||||||
|  |         """Test SQL parsing result when column extraction fails.""" | ||||||
|  |         # Clear the cache to force re-execution | ||||||
|  |         upstream_instance._get_spr.cache_clear() | ||||||
|  | 
 | ||||||
|  |         # Mock the SQL response | ||||||
|  |         mock_sql_response = [{"sql": "SELECT * FROM test_table"}] | ||||||
|  |         mock_looker_client.generate_sql_query.return_value = mock_sql_response | ||||||
|  | 
 | ||||||
|  |         # Mock the SQL parsing result with column error | ||||||
|  |         mock_spr = create_mock_sql_parsing_result( | ||||||
|  |             column_error=Exception("Column parsing failed") | ||||||
|  |         ) | ||||||
|  |         mock_create_lineage.return_value = mock_spr | ||||||
|  | 
 | ||||||
|  |         with pytest.raises(ValueError, match="Error in parsing SQL for column lineage"): | ||||||
|  |             upstream_instance._get_spr() | ||||||
|  | 
 | ||||||
|  |     @patch( | ||||||
|  |         "datahub.ingestion.source.looker.view_upstream.create_lineage_sql_parsed_result" | ||||||
|  |     ) | ||||||
|  |     def test_get_upstream_dataset_urn( | ||||||
|  |         self, mock_create_lineage, upstream_instance, mock_looker_client | ||||||
|  |     ): | ||||||
|  |         """Test upstream dataset URN extraction.""" | ||||||
|  |         # Clear all caches to force re-execution | ||||||
|  |         upstream_instance._get_spr.cache_clear() | ||||||
|  |         upstream_instance._get_upstream_dataset_urn.cache_clear() | ||||||
|  | 
 | ||||||
|  |         # Mock the SQL response | ||||||
|  |         mock_sql_response = [{"sql": "SELECT * FROM test_table"}] | ||||||
|  |         mock_looker_client.generate_sql_query.return_value = mock_sql_response | ||||||
|  | 
 | ||||||
|  |         # Mock the SQL parsing result | ||||||
|  |         mock_spr = create_mock_sql_parsing_result( | ||||||
|  |             in_tables=["urn:li:dataset:(urn:li:dataPlatform:postgres,test_table,PROD)"] | ||||||
|  |         ) | ||||||
|  |         mock_create_lineage.return_value = mock_spr | ||||||
|  | 
 | ||||||
|  |         result = upstream_instance.get_upstream_dataset_urn() | ||||||
|  |         assert len(result) == 1 | ||||||
|  |         assert "test_table" in result[0] | ||||||
|  | 
 | ||||||
|  |     @patch( | ||||||
|  |         "datahub.ingestion.source.looker.view_upstream.create_lineage_sql_parsed_result" | ||||||
|  |     ) | ||||||
|  |     def test_get_upstream_column_ref( | ||||||
|  |         self, mock_create_lineage, upstream_instance, mock_looker_client | ||||||
|  |     ): | ||||||
|  |         """Test upstream column reference extraction.""" | ||||||
|  |         # Clear the cache to force re-execution | ||||||
|  |         upstream_instance._get_spr.cache_clear() | ||||||
|  | 
 | ||||||
|  |         # Mock the SQL response | ||||||
|  |         mock_sql_response = [{"sql": "SELECT user_id FROM test_table"}] | ||||||
|  |         mock_looker_client.generate_sql_query.return_value = mock_sql_response | ||||||
|  | 
 | ||||||
|  |         # Mock the SQL parsing result with column lineage | ||||||
|  |         mock_column_lineage = [ | ||||||
|  |             MagicMock( | ||||||
|  |                 downstream=MagicMock(column="test_view.user_id"), | ||||||
|  |                 upstreams=[MagicMock(table="test_table", column="user_id")], | ||||||
|  |             ) | ||||||
|  |         ] | ||||||
|  |         mock_spr = create_mock_sql_parsing_result(column_lineage=mock_column_lineage) | ||||||
|  |         mock_create_lineage.return_value = mock_spr | ||||||
|  | 
 | ||||||
|  |         # Mock field context | ||||||
|  |         field_context = MagicMock(spec=LookerFieldContext) | ||||||
|  |         field_context.name.return_value = "user_id" | ||||||
|  |         field_context.raw_field = {NAME: "user_id"} | ||||||
|  | 
 | ||||||
|  |         result = upstream_instance.get_upstream_column_ref(field_context) | ||||||
|  |         assert len(result) == 1 | ||||||
|  |         assert result[0].table == "test_table" | ||||||
|  |         assert result[0].column == "user_id" | ||||||
|  | 
 | ||||||
|  |     @patch( | ||||||
|  |         "datahub.ingestion.source.looker.view_upstream.create_lineage_sql_parsed_result" | ||||||
|  |     ) | ||||||
|  |     def test_get_upstream_column_ref_dimension_group( | ||||||
|  |         self, mock_create_lineage, upstream_instance, mock_looker_client | ||||||
|  |     ): | ||||||
|  |         """Test upstream column reference extraction for dimension groups.""" | ||||||
|  |         # Clear the cache to force re-execution | ||||||
|  |         upstream_instance._get_spr.cache_clear() | ||||||
|  | 
 | ||||||
|  |         # Mock the SQL response | ||||||
|  |         mock_sql_response = [{"sql": "SELECT created_date FROM test_table"}] | ||||||
|  |         mock_looker_client.generate_sql_query.return_value = mock_sql_response | ||||||
|  | 
 | ||||||
|  |         # Mock the SQL parsing result with column lineage | ||||||
|  |         mock_column_lineage = [ | ||||||
|  |             MagicMock( | ||||||
|  |                 downstream=MagicMock(column="test_view.created_date"), | ||||||
|  |                 upstreams=[MagicMock(table="test_table", column="created_at")], | ||||||
|  |             ) | ||||||
|  |         ] | ||||||
|  |         mock_spr = create_mock_sql_parsing_result(column_lineage=mock_column_lineage) | ||||||
|  |         mock_create_lineage.return_value = mock_spr | ||||||
|  | 
 | ||||||
|  |         # Mock field context for time dimension group | ||||||
|  |         field_context = MagicMock(spec=LookerFieldContext) | ||||||
|  |         field_context.name.return_value = "created" | ||||||
|  |         field_context.raw_field = { | ||||||
|  |             NAME: "created", | ||||||
|  |             VIEW_FIELD_TYPE_ATTRIBUTE: "time", | ||||||
|  |             VIEW_FIELD_TIMEFRAMES_ATTRIBUTE: ["date"], | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         result = upstream_instance.get_upstream_column_ref(field_context) | ||||||
|  |         assert len(result) == 1 | ||||||
|  |         assert result[0].table == "test_table" | ||||||
|  |         assert result[0].column == "created_at" | ||||||
|  | 
 | ||||||
|  |     @patch( | ||||||
|  |         "datahub.ingestion.source.looker.view_upstream.create_lineage_sql_parsed_result" | ||||||
|  |     ) | ||||||
|  |     def test_create_fields( | ||||||
|  |         self, mock_create_lineage, upstream_instance, mock_looker_client | ||||||
|  |     ): | ||||||
|  |         """Test ViewField creation from SQL parsing result.""" | ||||||
|  |         # Clear the cache to force re-execution | ||||||
|  |         upstream_instance._get_spr.cache_clear() | ||||||
|  | 
 | ||||||
|  |         # Mock the SQL response | ||||||
|  |         mock_sql_response = [{"sql": "SELECT user_id FROM test_table"}] | ||||||
|  |         mock_looker_client.generate_sql_query.return_value = mock_sql_response | ||||||
|  | 
 | ||||||
|  |         # Mock the SQL parsing result with column lineage | ||||||
|  |         mock_column_lineage = [ | ||||||
|  |             MagicMock( | ||||||
|  |                 downstream=MagicMock( | ||||||
|  |                     column="test_view.user_id", native_column_type="string" | ||||||
|  |                 ), | ||||||
|  |                 upstreams=[MagicMock(table="test_table", column="user_id")], | ||||||
|  |             ) | ||||||
|  |         ] | ||||||
|  |         mock_spr = create_mock_sql_parsing_result(column_lineage=mock_column_lineage) | ||||||
|  |         mock_create_lineage.return_value = mock_spr | ||||||
|  | 
 | ||||||
|  |         result = upstream_instance.create_fields() | ||||||
|  |         assert len(result) == 1 | ||||||
|  |         assert isinstance(result[0], ViewField) | ||||||
|  |         assert result[0].name == "user_id" | ||||||
|  |         assert result[0].type == "string" | ||||||
|  |         assert result[0].field_type == ViewFieldType.UNKNOWN | ||||||
|  | 
 | ||||||
|  |     def test_create_fields_no_column_lineage(self, upstream_instance): | ||||||
|  |         """Test ViewField creation when no column lineage is available.""" | ||||||
|  |         # Mock the SQL parsing result without column lineage | ||||||
|  |         mock_spr = MagicMock(spec=SqlParsingResult) | ||||||
|  |         mock_spr.column_lineage = None | ||||||
|  | 
 | ||||||
|  |         with patch.object(upstream_instance, "_get_spr", return_value=mock_spr): | ||||||
|  |             result = upstream_instance.create_fields() | ||||||
|  |             assert result == [] | ||||||
|  | 
 | ||||||
|  |     def test_api_failure_fallback( | ||||||
|  |         self, | ||||||
|  |         mock_view_context, | ||||||
|  |         mock_looker_view_id_cache, | ||||||
|  |         mock_config, | ||||||
|  |         mock_reporter, | ||||||
|  |         mock_ctx, | ||||||
|  |         mock_looker_client, | ||||||
|  |         view_to_explore_map, | ||||||
|  |     ): | ||||||
|  |         """Test that API failures are handled gracefully.""" | ||||||
|  |         # Mock the Looker client to raise an exception | ||||||
|  |         mock_looker_client.generate_sql_query.side_effect = Exception("API call failed") | ||||||
|  | 
 | ||||||
|  |         # The constructor eagerly issues the Looker query, so the API failure | ||||||
|  |         # surfaces here; the factory function is responsible for catching it | ||||||
|  |         # and falling back to a non-API-based upstream implementation. | ||||||
|  |         with pytest.raises(Exception, match="API call failed"): | ||||||
|  |             LookerQueryAPIBasedViewUpstream( | ||||||
|  |                 view_context=mock_view_context, | ||||||
|  |                 looker_view_id_cache=mock_looker_view_id_cache, | ||||||
|  |                 config=mock_config, | ||||||
|  |                 reporter=mock_reporter, | ||||||
|  |                 ctx=mock_ctx, | ||||||
|  |                 looker_client=mock_looker_client, | ||||||
|  |                 view_to_explore_map=view_to_explore_map, | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |     def test_latency_tracking( | ||||||
|  |         self, upstream_instance, mock_looker_client, mock_reporter | ||||||
|  |     ): | ||||||
|  |         """Test that API latency is tracked and reported.""" | ||||||
|  |         # Clear the cache to force re-execution | ||||||
|  |         upstream_instance._get_spr.cache_clear() | ||||||
|  | 
 | ||||||
|  |         # Mock the SQL response | ||||||
|  |         mock_sql_response = [{"sql": "SELECT * FROM test_table"}] | ||||||
|  |         mock_looker_client.generate_sql_query.return_value = mock_sql_response | ||||||
|  | 
 | ||||||
|  |         # Mock the view ID cache to return a valid view ID | ||||||
|  |         mock_view_id = MagicMock(spec=LookerViewId) | ||||||
|  |         mock_view_id.get_urn.return_value = "urn:li:dataset:test" | ||||||
|  |         upstream_instance.looker_view_id_cache.get_looker_view_id.return_value = ( | ||||||
|  |             mock_view_id | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         with patch( | ||||||
|  |             "datahub.ingestion.source.looker.view_upstream.create_lineage_sql_parsed_result" | ||||||
|  |         ) as mock_create_lineage: | ||||||
|  |             mock_spr = create_mock_sql_parsing_result() | ||||||
|  |             mock_create_lineage.return_value = mock_spr | ||||||
|  | 
 | ||||||
|  |             upstream_instance._execute_query(MagicMock(spec=WriteQuery)) | ||||||
|  | 
 | ||||||
|  |             # Verify that latency was reported (may be called multiple times due to caching) | ||||||
|  |             assert mock_reporter.report_looker_query_api_latency.call_count >= 1 | ||||||