mirror of
				https://github.com/datahub-project/datahub.git
				synced 2025-10-31 02:37:05 +00:00 
			
		
		
		
	fix(ingest): presto-on-hive - Adding catalog name to the presto on hive urn (#6024)
Co-authored-by: Shirshanka Das <shirshanka@apache.org>
This commit is contained in:
		
							parent
							
								
									e54f376f0a
								
							
						
					
					
						commit
						0545f3cb38
					
				| @ -121,6 +121,11 @@ class PrestoOnHiveConfig(BasicSQLAlchemyConfig): | |||||||
|         description="Dataset Subtype name to be 'Table' or 'View' Valid options: ['True', 'False']", |         description="Dataset Subtype name to be 'Table' or 'View' Valid options: ['True', 'False']", | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|  |     include_catalog_name_in_ids: bool = Field( | ||||||
|  |         default=False, | ||||||
|  |         description="Add the Presto catalog name (e.g. hive) to the generated dataset urns. `urn:li:dataset:(urn:li:dataPlatform:hive,hive.user.logging_events,PROD)` versus `urn:li:dataset:(urn:li:dataPlatform:hive,user.logging_events,PROD)`", | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|     def get_sql_alchemy_url(self, uri_opts: Optional[Dict[str, Any]] = None) -> str: |     def get_sql_alchemy_url(self, uri_opts: Optional[Dict[str, Any]] = None) -> str: | ||||||
|         if not ((self.host_port and self.scheme) or self.sqlalchemy_uri): |         if not ((self.host_port and self.scheme) or self.sqlalchemy_uri): | ||||||
|             raise ValueError("host_port and schema or connect_uri required.") |             raise ValueError("host_port and schema or connect_uri required.") | ||||||
| @ -407,9 +412,17 @@ class PrestoOnHiveSource(SQLAlchemySource): | |||||||
|         iter_res = self._alchemy_client.execute_query(statement) |         iter_res = self._alchemy_client.execute_query(statement) | ||||||
| 
 | 
 | ||||||
|         for key, group in groupby(iter_res, self._get_table_key): |         for key, group in groupby(iter_res, self._get_table_key): | ||||||
|             dataset_name = self.get_identifier( |             db_name = self.get_db_name(inspector) | ||||||
|                 schema=key.schema, entity=key.table, inspector=inspector |             schema_name = ( | ||||||
|  |                 f"{db_name}.{key.schema}" | ||||||
|  |                 if self.config.include_catalog_name_in_ids | ||||||
|  |                 else key.schema | ||||||
|             ) |             ) | ||||||
|  | 
 | ||||||
|  |             dataset_name = self.get_identifier( | ||||||
|  |                 schema=schema_name, entity=key.table, inspector=inspector | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|             self.report.report_entity_scanned(dataset_name, ent_type="table") |             self.report.report_entity_scanned(dataset_name, ent_type="table") | ||||||
| 
 | 
 | ||||||
|             if not sql_config.table_pattern.allowed(dataset_name): |             if not sql_config.table_pattern.allowed(dataset_name): | ||||||
| @ -521,8 +534,14 @@ class PrestoOnHiveSource(SQLAlchemySource): | |||||||
| 
 | 
 | ||||||
|         iter_res = self._alchemy_client.execute_query(statement) |         iter_res = self._alchemy_client.execute_query(statement) | ||||||
|         for key, group in groupby(iter_res, self._get_table_key): |         for key, group in groupby(iter_res, self._get_table_key): | ||||||
|  |             db_name = self.get_db_name(inspector) | ||||||
|  |             schema_name = ( | ||||||
|  |                 f"{db_name}.{key.schema}" | ||||||
|  |                 if self.config.include_catalog_name_in_ids | ||||||
|  |                 else key.schema | ||||||
|  |             ) | ||||||
|             dataset_name = self.get_identifier( |             dataset_name = self.get_identifier( | ||||||
|                 schema=key.schema, entity=key.table, inspector=inspector |                 schema=schema_name, entity=key.table, inspector=inspector | ||||||
|             ) |             ) | ||||||
|             columns = list(group) |             columns = list(group) | ||||||
| 
 | 
 | ||||||
| @ -553,8 +572,16 @@ class PrestoOnHiveSource(SQLAlchemySource): | |||||||
| 
 | 
 | ||||||
|         iter_res = self._alchemy_client.execute_query(statement) |         iter_res = self._alchemy_client.execute_query(statement) | ||||||
|         for row in iter_res: |         for row in iter_res: | ||||||
|  |             db_name = self.get_db_name(inspector) | ||||||
|  |             schema_name = ( | ||||||
|  |                 f"{db_name}.{row['schema']}" | ||||||
|  |                 if self.config.include_catalog_name_in_ids | ||||||
|  |                 else row["schema"] | ||||||
|  |             ) | ||||||
|             dataset_name = self.get_identifier( |             dataset_name = self.get_identifier( | ||||||
|                 schema=row["schema"], entity=row["name"], inspector=inspector |                 schema=schema_name, | ||||||
|  |                 entity=row["name"], | ||||||
|  |                 inspector=inspector, | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|             columns, view_definition = self._get_presto_view_column_metadata( |             columns, view_definition = self._get_presto_view_column_metadata( | ||||||
|  | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -53,10 +53,12 @@ def loaded_presto_on_hive(presto_on_hive_runner): | |||||||
| @freeze_time(FROZEN_TIME) | @freeze_time(FROZEN_TIME) | ||||||
| @pytest.mark.integration_batch_1 | @pytest.mark.integration_batch_1 | ||||||
| @pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||||
|     "mode,use_catalog_subtype,use_dataset_pascalcase_subtype,test_suffix", |     "mode,use_catalog_subtype,use_dataset_pascalcase_subtype,include_catalog_name_in_ids,test_suffix", | ||||||
|     [ |     [ | ||||||
|         ("hive", False, False, "_1"), |         ("hive", False, False, False, "_1"), | ||||||
|         ("presto-on-hive", True, True, "_2"), |         ("presto-on-hive", True, True, False, "_2"), | ||||||
|  |         ("hive", False, False, True, "_3"), | ||||||
|  |         ("presto-on-hive", True, True, True, "_4"), | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| def test_presto_on_hive_ingest( | def test_presto_on_hive_ingest( | ||||||
| @ -68,6 +70,7 @@ def test_presto_on_hive_ingest( | |||||||
|     mode, |     mode, | ||||||
|     use_catalog_subtype, |     use_catalog_subtype, | ||||||
|     use_dataset_pascalcase_subtype, |     use_dataset_pascalcase_subtype, | ||||||
|  |     include_catalog_name_in_ids, | ||||||
|     test_suffix, |     test_suffix, | ||||||
| ): | ): | ||||||
| 
 | 
 | ||||||
| @ -91,6 +94,7 @@ def test_presto_on_hive_ingest( | |||||||
|                     "scheme": "postgresql+psycopg2", |                     "scheme": "postgresql+psycopg2", | ||||||
|                     "include_views": True, |                     "include_views": True, | ||||||
|                     "include_tables": True, |                     "include_tables": True, | ||||||
|  |                     "include_catalog_name_in_ids": include_catalog_name_in_ids, | ||||||
|                     "schema_pattern": {"allow": ["^public"]}, |                     "schema_pattern": {"allow": ["^public"]}, | ||||||
|                     "mode": mode, |                     "mode": mode, | ||||||
|                     "use_catalog_subtype": use_catalog_subtype, |                     "use_catalog_subtype": use_catalog_subtype, | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Tamas Nemeth
						Tamas Nemeth