import pathlib
import unittest.mock
from datetime import datetime, timedelta, timezone

import jsonpickle
import pydantic
import pytest
from freezegun import freeze_time

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.run.pipeline import Pipeline
from datahub.ingestion.source.usage.bigquery_usage import (
    BigQueryTableRef,
    BigQueryUsageConfig,
    BigQueryUsageSource,
)
from tests.test_helpers import mce_helpers

# Set to True (with valid GCP credentials) to re-record bigquery_logs.json,
# the reference log entries that the tests below replay through a mock client.
WRITE_REFERENCE_FILE = False
FROZEN_TIME = "2021-07-20 00:00:00"


@freeze_time(FROZEN_TIME)
def test_bq_usage_config():
    config = BigQueryUsageConfig.parse_obj(
        dict(
            project_id="sample-bigquery-project-name-1234",
            bucket_duration="HOUR",
            end_time="2021-07-20T00:00:00Z",
            table_pattern={"allow": ["test-regex", "test-regex-1"], "deny": []},
        )
    )
    assert config.get_allow_pattern_string() == "test-regex|test-regex-1"
    assert config.get_deny_pattern_string() == ""
    assert (config.end_time - config.start_time) == timedelta(hours=1)
    assert config.projects == ["sample-bigquery-project-name-1234"]


@freeze_time(FROZEN_TIME)
def test_bq_timezone_validation():
    # Naive (non-UTC) start_time values must be rejected.
    with pytest.raises(pydantic.ValidationError, match="UTC"):
        BigQueryUsageConfig.parse_obj(
            dict(
                project_id="sample-bigquery-project-name-1234",
                start_time="2021-07-20T00:00:00",
            )
        )


@freeze_time(FROZEN_TIME)
def test_bq_usage_source(pytestconfig, tmp_path):
    # from google.cloud.logging_v2 import ProtobufEntry

    test_resources_dir: pathlib.Path = (
        pytestconfig.rootpath / "tests/integration/bigquery-usage"
    )
    bigquery_reference_logs_path = test_resources_dir / "bigquery_logs.json"

    if WRITE_REFERENCE_FILE:
        # Pull real log entries from GCP Logging and serialize them for replay.
        source = BigQueryUsageSource.create(
            dict(
                projects=[
                    "harshal-playground-306419",
                ],
                start_time=datetime.now(tz=timezone.utc) - timedelta(days=25),
            ),
            PipelineContext(run_id="bq-usage-test"),
        )
        entries = list(
            source._get_bigquery_log_entries_via_gcp_logging(
                source._make_bigquery_logging_clients()
            )
        )

        # Drop the live client reference so the entries serialize cleanly.
        entries = [entry._replace(logger=None) for entry in entries]
        log_entries = jsonpickle.encode(entries, indent=4)
        with bigquery_reference_logs_path.open("w") as logs:
            logs.write(log_entries)

    with unittest.mock.patch(
        "datahub.ingestion.source.usage.bigquery_usage.GCPLoggingClient", autospec=True
    ) as MockClient:
        # Add mock BigQuery API responses.
        with bigquery_reference_logs_path.open() as logs:
            reference_logs = jsonpickle.decode(logs.read())
        MockClient().list_entries.return_value = reference_logs

        # Run a BigQuery usage ingestion run.
        pipeline = Pipeline.create(
            {
                "run_id": "test-bigquery-usage",
                "source": {
                    "type": "bigquery-usage-legacy",
                    "config": {
                        "projects": ["sample-bigquery-project-1234"],
                        "start_time": "2021-01-01T00:00Z",
                        "end_time": "2021-07-01T00:00Z",
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {
                        "filename": f"{tmp_path}/bigquery_usages.json",
                    },
                },
            }
        )
        pipeline.run()
        pipeline.raise_from_status()

    # Verify the pipeline output against the checked-in golden file.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "bigquery_usages.json",
        golden_path=test_resources_dir / "bigquery_usages_golden.json",
    )


@freeze_time(FROZEN_TIME)
def test_bq_usage_source_with_read_events(pytestconfig, tmp_path):
    # from google.cloud.logging_v2 import ProtobufEntry

    test_resources_dir: pathlib.Path = (
        pytestconfig.rootpath / "tests/integration/bigquery-usage"
    )
    bigquery_reference_logs_path = test_resources_dir / "bigquery_logs.json"

    if WRITE_REFERENCE_FILE:
        # Pull real log entries from GCP Logging and serialize them for replay.
        source = BigQueryUsageSource.create(
            dict(
                projects=[
                    "harshal-playground-306419",
                ],
                start_time=datetime.now(tz=timezone.utc) - timedelta(days=25),
            ),
            PipelineContext(run_id="bq-usage-test"),
        )
        entries = list(
            source._get_bigquery_log_entries_via_gcp_logging(
                source._make_bigquery_logging_clients()
            )
        )

        # Drop the live client reference so the entries serialize cleanly.
        entries = [entry._replace(logger=None) for entry in entries]
        log_entries = jsonpickle.encode(entries, indent=4)
        with bigquery_reference_logs_path.open("w") as logs:
            logs.write(log_entries)

    with unittest.mock.patch(
        "datahub.ingestion.source.usage.bigquery_usage.GCPLoggingClient", autospec=True
    ) as MockClient:
        # Add mock BigQuery API responses.
        with bigquery_reference_logs_path.open() as logs:
            reference_logs = jsonpickle.decode(logs.read())
        MockClient().list_entries.return_value = reference_logs

        # Run a BigQuery usage ingestion run, this time including
        # read-operation statistics in the output.
        pipeline = Pipeline.create(
            {
                "run_id": "test-bigquery-usage",
                "source": {
                    "type": "bigquery-usage-legacy",
                    "config": {
                        "projects": ["sample-bigquery-project-1234"],
                        "start_time": "2021-01-01T00:00Z",
                        "end_time": "2021-07-01T00:00Z",
                        "include_read_operational_stats": True,
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {
                        "filename": f"{tmp_path}/bigquery_usages_with_read_events.json",
                    },
                },
            }
        )
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "bigquery_usages_with_read_events.json",
        golden_path=test_resources_dir / "bigquery_usages_with_read_events_golden.json",
    )


@pytest.mark.parametrize(
    "test_input,expected",
    [
        ("test_table$20220101", "test_table"),
        ("test_table$__PARTITIONS_SUMMARY__", "test_table"),
        ("test_table_20220101", "test_table"),
        ("20220101", "test_dataset"),
        ("test_table", "test_table"),
    ],
)
def test_remove_extras(test_input, expected):
    config = BigQueryUsageConfig.parse_obj(
        dict(
            project_id="sample-bigquery-project-name-1234",
        )
    )
    # remove_extras should strip shard/partition suffixes from the table name;
    # a bare shard suffix (e.g. "20220101") collapses to the dataset name.
    table_ref = BigQueryTableRef("test_project", "test_dataset", test_input)
    assert table_ref.remove_extras(config.sharded_table_pattern).table == expected