import pathlib
import unittest.mock
from datetime import datetime, timedelta, timezone

import jsonpickle
import pydantic
import pytest

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.run.pipeline import Pipeline
from datahub.ingestion.source.usage.bigquery_usage import (
    BigQueryTableRef,
    BigQueryUsageConfig,
    BigQueryUsageSource,
)
from tests.test_helpers import mce_helpers
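
# When True, test_bq_usage_source fetches fresh audit log entries from a live
# GCP project and rewrites the bigquery_logs.json reference file instead of
# replaying the checked-in logs.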
WRITE_REFERENCE_FILE = False


def test_bq_usage_config():
    config = BigQueryUsageConfig.parse_obj(
        dict(
            project_id="sample-bigquery-project-name-1234",
            bucket_duration="HOUR",
            end_time="2021-07-20T00:00:00Z",
            table_pattern={"allow": ["test-regex", "test-regex-1"], "deny": []},
        )
    )
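
    # With bucket_duration="HOUR" and only end_time supplied, start_time should
    # be derived one bucket before end_time, and projects should fall back to
    # the single configured project_id.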
    assert config.get_allow_pattern_string() == "test-regex|test-regex-1"
    assert config.get_deny_pattern_string() == ""
    assert (config.end_time - config.start_time) == timedelta(hours=1)
    assert config.projects == ["sample-bigquery-project-name-1234"]


def test_bq_timezone_validation():
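    # The start_time below is timezone-naive; config validation should reject
    # any timestamp that is not explicitly UTC.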
    with pytest.raises(pydantic.ValidationError, match="UTC"):
        BigQueryUsageConfig.parse_obj(
            dict(
                project_id="sample-bigquery-project-name-1234",
                start_time="2021-07-20T00:00:00",
            )
        )


def test_bq_usage_source(pytestconfig, tmp_path):
    # from google.cloud.logging_v2 import ProtobufEntry

    test_resources_dir: pathlib.Path = (
        pytestconfig.rootpath / "tests/integration/bigquery-usage"
    )
    bigquery_reference_logs_path = test_resources_dir / "bigquery_logs.json"
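
    # When requested, regenerate the reference logs from a live project; this
    # path talks to real GCP APIs and requires credentials for the project
    # listed below.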
    if WRITE_REFERENCE_FILE:
        source = BigQueryUsageSource.create(
            dict(
                projects=[
                    "harshal-playground-306419",
                ],
                start_time=datetime.now(tz=timezone.utc) - timedelta(days=25),
            ),
            PipelineContext(run_id="bq-usage-test"),
        )
        entries = list(
            source._get_bigquery_log_entries(source._make_bigquery_clients())
        )
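
        # Drop each entry's logger handle before serializing the entries with
        # jsonpickle.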
        entries = [entry._replace(logger=None) for entry in entries]
        log_entries = jsonpickle.encode(entries, indent=4)
        with bigquery_reference_logs_path.open("w") as logs:
            logs.write(log_entries)

    with unittest.mock.patch(
        "datahub.ingestion.source.usage.bigquery_usage.GCPLoggingClient", autospec=True
    ) as MockClient:
        # Add mock BigQuery API responses.
        with bigquery_reference_logs_path.open() as logs:
            reference_logs = jsonpickle.decode(logs.read())
        MockClient().list_entries.return_value = reference_logs

        # Run a BigQuery usage ingestion pipeline against the mocked logs.
        pipeline = Pipeline.create(
            {
                "run_id": "test-bigquery-usage",
                "source": {
                    "type": "bigquery-usage",
                    "config": {
                        "projects": ["sample-bigquery-project-1234"],
                        "start_time": "2021-01-01T00:00Z",
                        "end_time": "2021-07-01T00:00Z",
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {
                        "filename": f"{tmp_path}/bigquery_usages.json",
                    },
                },
            }
        )
        pipeline.run()
        pipeline.raise_from_status()
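
    # Compare the emitted usage events against the checked-in golden file.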
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "bigquery_usages.json",
        golden_path=test_resources_dir / "bigquery_usages_golden.json",
    )


@pytest.mark.parametrize(
    "test_input,expected",
    [
        ("test_table$20220101", "test_table"),
        ("test_table$__PARTITIONS_SUMMARY__", "test_table"),
        ("test_table_20220101", "test_table"),
    ],
)
def test_remove_extras(test_input, expected):
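    # remove_extras() strips partition decorators ($YYYYMMDD and
    # $__PARTITIONS_SUMMARY__) and date-shard suffixes (_YYYYMMDD) from the
    # table name, leaving the base table.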
    table_ref = BigQueryTableRef("test_project", "test_dataset", test_input)
    assert table_ref.remove_extras().table == expected