| 
									
										
										
										
											2023-07-13 13:35:37 +02:00
										 |  |  | #  Copyright 2021 Collate | 
					
						
							|  |  |  | #  Licensed under the Apache License, Version 2.0 (the "License"); | 
					
						
							|  |  |  | #  you may not use this file except in compliance with the License. | 
					
						
							|  |  |  | #  You may obtain a copy of the License at | 
					
						
							|  |  |  | #  http://www.apache.org/licenses/LICENSE-2.0 | 
					
						
							|  |  |  | #  Unless required by applicable law or agreed to in writing, software | 
					
						
							|  |  |  | #  distributed under the License is distributed on an "AS IS" BASIS, | 
					
						
							|  |  |  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
					
						
							|  |  |  | #  See the License for the specific language governing permissions and | 
					
						
							|  |  |  | #  limitations under the License. | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | Helper module to handle data sampling | 
					
						
							|  |  |  | for the profiler | 
					
						
							|  |  |  | """
 | 
					
						
							| 
									
										
										
										
											2024-11-19 08:10:45 +01:00
										 |  |  | from typing import Dict, Optional, Union | 
					
						
							| 
									
										
										
										
											2023-07-13 13:35:37 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-27 19:13:44 +01:00
										 |  |  | from sqlalchemy import Column | 
					
						
							| 
									
										
										
										
											2024-11-27 08:50:54 +01:00
										 |  |  | from sqlalchemy import Table as SqaTable | 
					
						
							|  |  |  | from sqlalchemy import text | 
					
						
							| 
									
										
										
										
											2023-07-13 13:35:37 +02:00
										 |  |  | from sqlalchemy.orm import Query | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-19 08:10:45 +01:00
										 |  |  | from metadata.generated.schema.entity.data.table import ( | 
					
						
							|  |  |  |     ProfileSampleType, | 
					
						
							|  |  |  |     Table, | 
					
						
							|  |  |  |     TableType, | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | from metadata.generated.schema.entity.services.connections.connectionBasicType import ( | 
					
						
							|  |  |  |     DataStorageConfig, | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( | 
					
						
							|  |  |  |     DatalakeConnection, | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | from metadata.generated.schema.entity.services.databaseService import DatabaseConnection | 
					
						
							|  |  |  | from metadata.ingestion.ometa.ometa_api import OpenMetadata | 
					
						
							|  |  |  | from metadata.sampler.models import SampleConfig | 
					
						
							|  |  |  | from metadata.sampler.sqlalchemy.sampler import SQASampler | 
					
						
							| 
									
										
										
										
											2023-11-09 18:49:42 +05:30
										 |  |  | from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT | 
					
						
							| 
									
										
										
										
											2023-07-13 13:35:37 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class BigQuerySampler(SQASampler): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Generates a sample of the data to not | 
					
						
							|  |  |  |     run the query in the whole table. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-09 18:49:42 +05:30
										 |  |  |     # pylint: disable=too-many-arguments | 
					
						
							| 
									
										
										
										
											2023-07-13 13:35:37 +02:00
										 |  |  |     def __init__( | 
					
						
							|  |  |  |         self, | 
					
						
							| 
									
										
										
										
											2024-11-19 08:10:45 +01:00
										 |  |  |         service_connection_config: Union[DatabaseConnection, DatalakeConnection], | 
					
						
							|  |  |  |         ometa_client: OpenMetadata, | 
					
						
							|  |  |  |         entity: Table, | 
					
						
							|  |  |  |         sample_config: Optional[SampleConfig] = None, | 
					
						
							| 
									
										
										
										
											2023-07-13 13:35:37 +02:00
										 |  |  |         partition_details: Optional[Dict] = None, | 
					
						
							| 
									
										
										
										
											2024-11-19 08:10:45 +01:00
										 |  |  |         sample_query: Optional[str] = None, | 
					
						
							|  |  |  |         storage_config: DataStorageConfig = None, | 
					
						
							| 
									
										
										
										
											2023-11-09 18:49:42 +05:30
										 |  |  |         sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, | 
					
						
							| 
									
										
										
										
											2024-11-19 08:10:45 +01:00
										 |  |  |         **kwargs, | 
					
						
							| 
									
										
										
										
											2023-07-13 13:35:37 +02:00
										 |  |  |     ): | 
					
						
							|  |  |  |         super().__init__( | 
					
						
							| 
									
										
										
										
											2024-11-19 08:10:45 +01:00
										 |  |  |             service_connection_config=service_connection_config, | 
					
						
							|  |  |  |             ometa_client=ometa_client, | 
					
						
							|  |  |  |             entity=entity, | 
					
						
							|  |  |  |             sample_config=sample_config, | 
					
						
							|  |  |  |             partition_details=partition_details, | 
					
						
							|  |  |  |             sample_query=sample_query, | 
					
						
							|  |  |  |             storage_config=storage_config, | 
					
						
							|  |  |  |             sample_data_count=sample_data_count, | 
					
						
							|  |  |  |             **kwargs, | 
					
						
							| 
									
										
										
										
											2023-07-13 13:35:37 +02:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-02-11 10:46:34 -08:00
										 |  |  |         self.raw_dataset_type: Optional[TableType] = entity.tableType | 
					
						
							| 
									
										
										
										
											2024-11-27 08:50:54 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def set_tablesample(self, selectable: SqaTable): | 
					
						
							|  |  |  |         """Set the TABLESAMPLE clause for BigQuery
 | 
					
						
							|  |  |  |         Args: | 
					
						
							|  |  |  |             selectable (Table): Table object | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         if ( | 
					
						
							| 
									
										
										
										
											2025-02-04 10:40:40 +01:00
										 |  |  |             self.sample_config.profileSampleType == ProfileSampleType.PERCENTAGE | 
					
						
							| 
									
										
										
										
											2024-11-27 08:50:54 +01:00
										 |  |  |             and self.raw_dataset_type != TableType.View | 
					
						
							|  |  |  |         ): | 
					
						
							|  |  |  |             return selectable.tablesample( | 
					
						
							| 
									
										
										
										
											2025-02-04 10:40:40 +01:00
										 |  |  |                 text(f"{self.sample_config.profileSample or 100} PERCENT") | 
					
						
							| 
									
										
										
										
											2024-11-27 08:50:54 +01:00
										 |  |  |             ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return selectable | 
					
						
							| 
									
										
										
										
											2023-07-13 13:35:37 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-27 19:13:44 +01:00
										 |  |  |     def _base_sample_query(self, column: Optional[Column], label=None): | 
					
						
							|  |  |  |         """Base query for sampling
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Args: | 
					
						
							|  |  |  |             column (Optional[Column]): if computing a column metric only sample for the column | 
					
						
							|  |  |  |             label (_type_, optional): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Returns: | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         # pylint: disable=import-outside-toplevel | 
					
						
							|  |  |  |         from sqlalchemy_bigquery import STRUCT | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if column is not None: | 
					
						
							|  |  |  |             column_parts = column.name.split(".") | 
					
						
							|  |  |  |             if len(column_parts) > 1: | 
					
						
							|  |  |  |                 # for struct columns (e.g. `foo.bar`) we need to create a new column corresponding to | 
					
						
							|  |  |  |                 # the struct (e.g. `foo`) and then use that in the sample query as the column that | 
					
						
							|  |  |  |                 # will be query is `foo.bar`. | 
					
						
							|  |  |  |                 # e.g. WITH sample AS (SELECT `foo` FROM table) SELECT `foo.bar` | 
					
						
							|  |  |  |                 # FROM sample TABLESAMPLE SYSTEM (n PERCENT) | 
					
						
							|  |  |  |                 column = Column(column_parts[0], STRUCT) | 
					
						
							|  |  |  |                 # pylint: disable=protected-access | 
					
						
							| 
									
										
										
										
											2024-11-27 08:50:54 +01:00
										 |  |  |                 column._set_parent(self.raw_dataset.__table__) | 
					
						
							| 
									
										
										
										
											2023-12-27 19:13:44 +01:00
										 |  |  |                 # pylint: enable=protected-access | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return super()._base_sample_query(column, label=label) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def get_sample_query(self, *, column=None) -> Query: | 
					
						
							| 
									
										
										
										
											2023-07-13 13:35:37 +02:00
										 |  |  |         """get query for sample data""" | 
					
						
							|  |  |  |         # TABLESAMPLE SYSTEM is not supported for views | 
					
						
							|  |  |  |         if ( | 
					
						
							| 
									
										
										
										
											2025-02-04 10:40:40 +01:00
										 |  |  |             self.sample_config.profileSampleType == ProfileSampleType.PERCENTAGE | 
					
						
							| 
									
										
										
										
											2024-11-27 08:50:54 +01:00
										 |  |  |             and self.raw_dataset_type != TableType.View | 
					
						
							| 
									
										
										
										
											2023-07-13 13:35:37 +02:00
										 |  |  |         ): | 
					
						
							| 
									
										
										
										
											2024-11-27 08:50:54 +01:00
										 |  |  |             return self._base_sample_query(column).cte( | 
					
						
							|  |  |  |                 f"{self.raw_dataset.__tablename__}_sample" | 
					
						
							| 
									
										
										
										
											2023-07-13 13:35:37 +02:00
										 |  |  |             ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-27 19:13:44 +01:00
										 |  |  |         return super().get_sample_query(column=column) |