| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  | from datetime import datetime | 
					
						
							| 
									
										
										
										
											2022-09-12 23:12:52 +05:30
										 |  |  | from unittest import mock | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | import pytest | 
					
						
							|  |  |  | from pydantic import ValidationError | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-01 09:11:23 +01:00
										 |  |  | from datahub.configuration.common import AllowDenyPattern | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  | from datahub.configuration.time_window_config import BucketDuration, get_time_bucket | 
					
						
							| 
									
										
										
										
											2022-09-10 20:36:10 -07:00
										 |  |  | from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  | from datahub.emitter.mcp import MetadataChangeProposalWrapper | 
					
						
							|  |  |  | from datahub.ingestion.api.workunit import MetadataWorkUnit | 
					
						
							|  |  |  | from datahub.ingestion.source.usage.usage_common import ( | 
					
						
							|  |  |  |     BaseUsageConfig, | 
					
						
							|  |  |  |     GenericAggregatedDataset, | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | from datahub.metadata.schema_classes import DatasetUsageStatisticsClass | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-15 01:44:42 -05:00
# The tests identify a table by a plain string.
_TestTableRef = str

# GenericAggregatedDataset specialised to the string table reference above;
# every test below builds its aggregate through this alias.
_TestAggregatedDataset = GenericAggregatedDataset[_TestTableRef]
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-10 20:36:10 -07:00
										 |  |  | def _simple_urn_builder(resource): | 
					
						
							|  |  |  |     return make_dataset_urn_with_platform_instance( | 
					
						
							|  |  |  |         "snowflake", | 
					
						
							|  |  |  |         resource.lower(), | 
					
						
							|  |  |  |         "snowflake-dev", | 
					
						
							|  |  |  |         "DEV", | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  | def test_add_one_query_without_columns(): | 
					
						
							|  |  |  |     test_email = "test_email@test.com" | 
					
						
							|  |  |  |     test_query = "select * from test" | 
					
						
							|  |  |  |     event_time = datetime(2020, 1, 1) | 
					
						
							|  |  |  |     floored_ts = get_time_bucket(event_time, BucketDuration.DAY) | 
					
						
							|  |  |  |     resource = "test_db.test_schema.test_table" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-15 01:44:42 -05:00
										 |  |  |     ta = _TestAggregatedDataset(bucket_start_time=floored_ts, resource=resource) | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  |     ta.add_read_entry( | 
					
						
							|  |  |  |         test_email, | 
					
						
							|  |  |  |         test_query, | 
					
						
							|  |  |  |         [], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert ta.queryCount == 1 | 
					
						
							|  |  |  |     assert ta.queryFreq[test_query] == 1 | 
					
						
							|  |  |  |     assert ta.userFreq[test_email] == 1 | 
					
						
							|  |  |  |     assert len(ta.columnFreq) == 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-01 09:11:23 +01:00
										 |  |  | def test_add_one_query_with_ignored_user(): | 
					
						
							|  |  |  |     test_email = "test_email@test.com" | 
					
						
							|  |  |  |     test_query = "select * from test" | 
					
						
							|  |  |  |     event_time = datetime(2020, 1, 1) | 
					
						
							|  |  |  |     floored_ts = get_time_bucket(event_time, BucketDuration.DAY) | 
					
						
							|  |  |  |     resource = "test_db.test_schema.test_table" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ta = _TestAggregatedDataset( | 
					
						
							|  |  |  |         bucket_start_time=floored_ts, | 
					
						
							|  |  |  |         resource=resource, | 
					
						
							|  |  |  |         user_email_pattern=AllowDenyPattern(deny=list(["test_email@test.com"])), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     ta.add_read_entry( | 
					
						
							|  |  |  |         test_email, | 
					
						
							|  |  |  |         test_query, | 
					
						
							|  |  |  |         [], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert ta.queryCount == 0 | 
					
						
							|  |  |  |     assert ta.queryFreq[test_query] == 0 | 
					
						
							|  |  |  |     assert ta.userFreq[test_email] == 0 | 
					
						
							|  |  |  |     assert len(ta.columnFreq) == 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_multiple_query_with_ignored_user(): | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  |     test_email = "test_email@test.com" | 
					
						
							|  |  |  |     test_email2 = "test_email2@test.com" | 
					
						
							|  |  |  |     test_query = "select * from test" | 
					
						
							|  |  |  |     test_query2 = "select * from test2" | 
					
						
							|  |  |  |     event_time = datetime(2020, 1, 1) | 
					
						
							|  |  |  |     floored_ts = get_time_bucket(event_time, BucketDuration.DAY) | 
					
						
							| 
									
										
										
										
											2022-02-01 09:11:23 +01:00
										 |  |  |     resource = "test_db.test_schema.test_table" | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-01 09:11:23 +01:00
										 |  |  |     ta = _TestAggregatedDataset( | 
					
						
							|  |  |  |         bucket_start_time=floored_ts, | 
					
						
							|  |  |  |         resource=resource, | 
					
						
							|  |  |  |         user_email_pattern=AllowDenyPattern(deny=list(["test_email@test.com"])), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     ta.add_read_entry( | 
					
						
							|  |  |  |         test_email, | 
					
						
							|  |  |  |         test_query, | 
					
						
							|  |  |  |         [], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     ta.add_read_entry( | 
					
						
							|  |  |  |         test_email, | 
					
						
							|  |  |  |         test_query, | 
					
						
							|  |  |  |         [], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     ta.add_read_entry( | 
					
						
							|  |  |  |         test_email2, | 
					
						
							|  |  |  |         test_query2, | 
					
						
							|  |  |  |         [], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert ta.queryCount == 1 | 
					
						
							|  |  |  |     assert ta.queryFreq[test_query] == 0 | 
					
						
							|  |  |  |     assert ta.userFreq[test_email] == 0 | 
					
						
							|  |  |  |     assert ta.queryFreq[test_query2] == 1 | 
					
						
							|  |  |  |     assert ta.userFreq[test_email2] == 1 | 
					
						
							|  |  |  |     assert len(ta.columnFreq) == 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_multiple_query_without_columns(): | 
					
						
							|  |  |  |     test_email = "test_email@test.com" | 
					
						
							|  |  |  |     test_email2 = "test_email2@test.com" | 
					
						
							|  |  |  |     test_query = "select * from test" | 
					
						
							|  |  |  |     test_query2 = "select * from test2" | 
					
						
							|  |  |  |     event_time = datetime(2020, 1, 1) | 
					
						
							|  |  |  |     floored_ts = get_time_bucket(event_time, BucketDuration.DAY) | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  |     resource = "test_db.test_schema.test_table" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-15 01:44:42 -05:00
										 |  |  |     ta = _TestAggregatedDataset(bucket_start_time=floored_ts, resource=resource) | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  |     ta.add_read_entry( | 
					
						
							|  |  |  |         test_email, | 
					
						
							|  |  |  |         test_query, | 
					
						
							|  |  |  |         [], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     ta.add_read_entry( | 
					
						
							|  |  |  |         test_email, | 
					
						
							|  |  |  |         test_query, | 
					
						
							|  |  |  |         [], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     ta.add_read_entry( | 
					
						
							|  |  |  |         test_email2, | 
					
						
							|  |  |  |         test_query2, | 
					
						
							|  |  |  |         [], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert ta.queryCount == 3 | 
					
						
							|  |  |  |     assert ta.queryFreq[test_query] == 2 | 
					
						
							|  |  |  |     assert ta.userFreq[test_email] == 2 | 
					
						
							|  |  |  |     assert ta.queryFreq[test_query2] == 1 | 
					
						
							|  |  |  |     assert ta.userFreq[test_email2] == 1 | 
					
						
							|  |  |  |     assert len(ta.columnFreq) == 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_make_usage_workunit(): | 
					
						
							|  |  |  |     test_email = "test_email@test.com" | 
					
						
							|  |  |  |     test_query = "select * from test" | 
					
						
							|  |  |  |     event_time = datetime(2020, 1, 1) | 
					
						
							|  |  |  |     floored_ts = get_time_bucket(event_time, BucketDuration.DAY) | 
					
						
							|  |  |  |     resource = "test_db.test_schema.test_table" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-15 01:44:42 -05:00
										 |  |  |     ta = _TestAggregatedDataset(bucket_start_time=floored_ts, resource=resource) | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  |     ta.add_read_entry( | 
					
						
							|  |  |  |         test_email, | 
					
						
							|  |  |  |         test_query, | 
					
						
							|  |  |  |         [], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     wu: MetadataWorkUnit = ta.make_usage_workunit( | 
					
						
							| 
									
										
										
										
											2022-04-01 00:15:09 +02:00
										 |  |  |         bucket_duration=BucketDuration.DAY, | 
					
						
							| 
									
										
										
										
											2022-09-10 20:36:10 -07:00
										 |  |  |         urn_builder=_simple_urn_builder, | 
					
						
							| 
									
										
										
										
											2022-04-01 00:15:09 +02:00
										 |  |  |         top_n_queries=10, | 
					
						
							|  |  |  |         format_sql_queries=False, | 
					
						
							| 
									
										
										
										
											2022-05-12 17:26:03 -04:00
										 |  |  |         include_top_n_queries=True, | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2022-02-01 09:11:23 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  |     assert wu.id == "2020-01-01T00:00:00-test_db.test_schema.test_table" | 
					
						
							|  |  |  |     assert isinstance(wu.get_metadata()["metadata"], MetadataChangeProposalWrapper) | 
					
						
							|  |  |  |     du: DatasetUsageStatisticsClass = wu.get_metadata()["metadata"].aspect | 
					
						
							|  |  |  |     assert du.totalSqlQueries == 1 | 
					
						
							|  |  |  |     assert du.topSqlQueries | 
					
						
							|  |  |  |     assert du.topSqlQueries.pop() == test_query | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-01 00:15:09 +02:00
										 |  |  | def test_query_formatting(): | 
					
						
							|  |  |  |     test_email = "test_email@test.com" | 
					
						
							|  |  |  |     test_query = "select * from foo where id in (select id from bar);" | 
					
						
							|  |  |  |     formatted_test_query: str = "SELECT *\n  FROM foo\n WHERE id in (\n        SELECT id\n          FROM bar\n       );" | 
					
						
							|  |  |  |     event_time = datetime(2020, 1, 1) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     floored_ts = get_time_bucket(event_time, BucketDuration.DAY) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     resource = "test_db.test_schema.test_table" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ta = _TestAggregatedDataset(bucket_start_time=floored_ts, resource=resource) | 
					
						
							|  |  |  |     ta.add_read_entry( | 
					
						
							|  |  |  |         test_email, | 
					
						
							|  |  |  |         test_query, | 
					
						
							|  |  |  |         [], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     wu: MetadataWorkUnit = ta.make_usage_workunit( | 
					
						
							|  |  |  |         bucket_duration=BucketDuration.DAY, | 
					
						
							| 
									
										
										
										
											2022-09-10 20:36:10 -07:00
										 |  |  |         urn_builder=_simple_urn_builder, | 
					
						
							| 
									
										
										
										
											2022-04-01 00:15:09 +02:00
										 |  |  |         top_n_queries=10, | 
					
						
							|  |  |  |         format_sql_queries=True, | 
					
						
							| 
									
										
										
										
											2022-05-12 17:26:03 -04:00
										 |  |  |         include_top_n_queries=True, | 
					
						
							| 
									
										
										
										
											2022-04-01 00:15:09 +02:00
										 |  |  |     ) | 
					
						
							|  |  |  |     assert wu.id == "2020-01-01T00:00:00-test_db.test_schema.test_table" | 
					
						
							|  |  |  |     assert isinstance(wu.get_metadata()["metadata"], MetadataChangeProposalWrapper) | 
					
						
							|  |  |  |     du: DatasetUsageStatisticsClass = wu.get_metadata()["metadata"].aspect | 
					
						
							|  |  |  |     assert du.totalSqlQueries == 1 | 
					
						
							|  |  |  |     assert du.topSqlQueries | 
					
						
							|  |  |  |     assert du.topSqlQueries.pop() == formatted_test_query | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  | def test_query_trimming(): | 
					
						
							|  |  |  |     test_email: str = "test_email@test.com" | 
					
						
							|  |  |  |     test_query: str = "select * from test where a > 10 and b > 20 order by a asc" | 
					
						
							|  |  |  |     top_n_queries: int = 10 | 
					
						
							|  |  |  |     total_budget_for_query_list: int = 200 | 
					
						
							|  |  |  |     event_time = datetime(2020, 1, 1) | 
					
						
							|  |  |  |     floored_ts = get_time_bucket(event_time, BucketDuration.DAY) | 
					
						
							|  |  |  |     resource = "test_db.test_schema.test_table" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-15 01:44:42 -05:00
										 |  |  |     ta = _TestAggregatedDataset(bucket_start_time=floored_ts, resource=resource) | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  |     ta.total_budget_for_query_list = total_budget_for_query_list | 
					
						
							|  |  |  |     ta.add_read_entry( | 
					
						
							|  |  |  |         test_email, | 
					
						
							|  |  |  |         test_query, | 
					
						
							|  |  |  |         [], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     wu: MetadataWorkUnit = ta.make_usage_workunit( | 
					
						
							|  |  |  |         bucket_duration=BucketDuration.DAY, | 
					
						
							| 
									
										
										
										
											2022-09-10 20:36:10 -07:00
										 |  |  |         urn_builder=_simple_urn_builder, | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  |         top_n_queries=top_n_queries, | 
					
						
							| 
									
										
										
										
											2022-04-01 00:15:09 +02:00
										 |  |  |         format_sql_queries=False, | 
					
						
							| 
									
										
										
										
											2022-05-12 17:26:03 -04:00
										 |  |  |         include_top_n_queries=True, | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2022-02-01 09:11:23 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  |     assert wu.id == "2020-01-01T00:00:00-test_db.test_schema.test_table" | 
					
						
							|  |  |  |     assert isinstance(wu.get_metadata()["metadata"], MetadataChangeProposalWrapper) | 
					
						
							|  |  |  |     du: DatasetUsageStatisticsClass = wu.get_metadata()["metadata"].aspect | 
					
						
							|  |  |  |     assert du.totalSqlQueries == 1 | 
					
						
							|  |  |  |     assert du.topSqlQueries | 
					
						
							|  |  |  |     assert du.topSqlQueries.pop() == "select * f ..." | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_top_n_queries_validator_fails(): | 
					
						
							|  |  |  |     with pytest.raises(ValidationError) as excinfo: | 
					
						
							| 
									
										
										
										
											2022-09-12 23:12:52 +05:30
										 |  |  |         with mock.patch( | 
					
						
							|  |  |  |             "datahub.ingestion.source.usage.usage_common.GenericAggregatedDataset.total_budget_for_query_list", | 
					
						
							|  |  |  |             20, | 
					
						
							|  |  |  |         ): | 
					
						
							|  |  |  |             BaseUsageConfig(top_n_queries=2) | 
					
						
							| 
									
										
										
										
											2021-12-13 18:16:24 +01:00
										 |  |  |     assert "top_n_queries is set to 2 but it can be maximum 1" in str(excinfo.value) | 
					
						
							| 
									
										
										
										
											2022-05-12 17:26:03 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_make_usage_workunit_include_top_n_queries(): | 
					
						
							|  |  |  |     test_email = "test_email@test.com" | 
					
						
							|  |  |  |     test_query = "select * from test" | 
					
						
							|  |  |  |     event_time = datetime(2020, 1, 1) | 
					
						
							|  |  |  |     floored_ts = get_time_bucket(event_time, BucketDuration.DAY) | 
					
						
							|  |  |  |     resource = "test_db.test_schema.test_table" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ta = _TestAggregatedDataset(bucket_start_time=floored_ts, resource=resource) | 
					
						
							|  |  |  |     ta.add_read_entry( | 
					
						
							|  |  |  |         test_email, | 
					
						
							|  |  |  |         test_query, | 
					
						
							|  |  |  |         [], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     wu: MetadataWorkUnit = ta.make_usage_workunit( | 
					
						
							|  |  |  |         bucket_duration=BucketDuration.DAY, | 
					
						
							| 
									
										
										
										
											2022-09-10 20:36:10 -07:00
										 |  |  |         urn_builder=_simple_urn_builder, | 
					
						
							| 
									
										
										
										
											2022-05-12 17:26:03 -04:00
										 |  |  |         top_n_queries=10, | 
					
						
							|  |  |  |         format_sql_queries=False, | 
					
						
							|  |  |  |         include_top_n_queries=False, | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert wu.id == "2020-01-01T00:00:00-test_db.test_schema.test_table" | 
					
						
							|  |  |  |     assert isinstance(wu.get_metadata()["metadata"], MetadataChangeProposalWrapper) | 
					
						
							|  |  |  |     du: DatasetUsageStatisticsClass = wu.get_metadata()["metadata"].aspect | 
					
						
							|  |  |  |     assert du.totalSqlQueries == 1 | 
					
						
							|  |  |  |     assert du.topSqlQueries is None |