feat(ingest/bigquery): Attempt to support raw dataset pattern (#9109)

Andrew Sikowitz 2023-10-25 16:17:09 -04:00 committed by GitHub
parent 32f5dcb154
commit 6c932e8afe
3 changed files with 74 additions and 5 deletions


@@ -53,10 +53,10 @@ into
 for example, using `datahub put` command. Policies can be also removed and re-created via UI.
 - #9077 - The BigQuery ingestion source by default sets `match_fully_qualified_names: true`.
 This means that any `dataset_pattern` or `schema_pattern` specified will be matched on the fully
-qualified dataset name, i.e. `<project_name>.<dataset_name>`. If this is not the case, please
-update your pattern (e.g. prepend your old dataset pattern with `.*\.` which matches the project part),
-or set `match_fully_qualified_names: false` in your recipe. However, note that
-setting this to `false` is deprecated and this flag will be removed entirely in a future release.
+qualified dataset name, i.e. `<project_name>.<dataset_name>`. We attempt to support the old
+pattern format by prepending `.*\\.` to dataset patterns lacking a period, so in most cases this
+should not cause any issues. However, if you have a complex dataset pattern, we recommend you
+manually convert it to the fully qualified format to avoid any potential issues.
 ### Potential Downtime
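As a rough illustration of the prepending behaviour described in the documentation change above (a minimal sketch: it assumes patterns are applied with Python's `re.match`, anchored at the start of the fully qualified name, and the project/dataset names are made up):

```python
import re

fq_name = "my-project.raw_dataset"  # hypothetical fully qualified dataset name

# An old-style pattern written against the bare dataset name no longer matches
# once matching is done against the fully qualified name...
assert re.match("raw_dataset", fq_name) is None

# ...but with `.*\.` prepended, the project part is consumed and the match succeeds.
assert re.match(r".*\.raw_dataset", fq_name) is not None
```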


@@ -299,7 +299,7 @@ class BigQueryV2Config(
                 "use project_id_pattern whenever possible. project_id will be deprecated, please use project_id_pattern only if possible."
             )
-        dataset_pattern = values.get("dataset_pattern")
+        dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
         schema_pattern = values.get("schema_pattern")
         if (
             dataset_pattern == AllowDenyPattern.allow_all()
@@ -329,6 +329,22 @@ class BigQueryV2Config(
                 "Please update `dataset_pattern` to match against fully qualified schema name `<project_id>.<dataset_name>` and set config `match_fully_qualified_names : True`."
                 "The config option `match_fully_qualified_names` is deprecated and will be removed in a future release."
             )
+        elif match_fully_qualified_names and dataset_pattern is not None:
+            adjusted = False
+            for lst in [dataset_pattern.allow, dataset_pattern.deny]:
+                for i, pattern in enumerate(lst):
+                    if "." not in pattern:
+                        if pattern.startswith("^"):
+                            lst[i] = r"^.*\." + pattern[1:]
+                        else:
+                            lst[i] = r".*\." + pattern
+                        adjusted = True
+            if adjusted:
+                logger.warning(
+                    "`dataset_pattern` was adjusted to match against fully qualified schema names,"
+                    " of the form `<project_id>.<dataset_name>`."
+                )
+
         return values

     def get_table_pattern(self, pattern: List[str]) -> str:
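A standalone sketch of the rewrite rule introduced above, pulled out of the validator for clarity (plain Python, no DataHub imports; the helper name `_prepend_project_wildcard` and the sample patterns are hypothetical, not part of the DataHub API):

```python
from typing import List


def _prepend_project_wildcard(patterns: List[str]) -> List[str]:
    """Mimics the validator's adjustment: prefix patterns that contain no '.'."""
    adjusted = []
    for pattern in patterns:
        if "." in pattern:
            adjusted.append(pattern)  # looks fully qualified already; leave it alone
        elif pattern.startswith("^"):
            adjusted.append(r"^.*\." + pattern[1:])  # keep the start anchor in front
        else:
            adjusted.append(r".*\." + pattern)
    return adjusted


print(_prepend_project_wildcard(["raw_dataset", "^raw_dataset$", "my-project.raw_dataset"]))
# ['.*\\.raw_dataset', '^.*\\.raw_dataset$', 'my-project.raw_dataset']
```

Note that the check is a literal `"." not in pattern`, so a pattern containing an escaped dot such as `project\.second_dataset` also counts as fully qualified and passes through unchanged, as the test below verifies.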


@@ -53,6 +53,59 @@ def test_bigquery_uri_on_behalf():
     assert config.get_sql_alchemy_url() == "bigquery://test-project-on-behalf"


+def test_bigquery_dataset_pattern():
+    config = BigQueryV2Config.parse_obj(
+        {
+            "dataset_pattern": {
+                "allow": [
+                    "test-dataset",
+                    "test-project.test-dataset",
+                    ".*test-dataset",
+                ],
+                "deny": [
+                    "^test-dataset-2$",
+                    "project\\.second_dataset",
+                ],
+            },
+        }
+    )
+    assert config.dataset_pattern.allow == [
+        r".*\.test-dataset",
+        r"test-project.test-dataset",
+        r".*test-dataset",
+    ]
+    assert config.dataset_pattern.deny == [
+        r"^.*\.test-dataset-2$",
+        r"project\.second_dataset",
+    ]
+
+    config = BigQueryV2Config.parse_obj(
+        {
+            "dataset_pattern": {
+                "allow": [
+                    "test-dataset",
+                    "test-project.test-dataset",
+                    ".*test-dataset",
+                ],
+                "deny": [
+                    "^test-dataset-2$",
+                    "project\\.second_dataset",
+                ],
+            },
+            "match_fully_qualified_names": False,
+        }
+    )
+    assert config.dataset_pattern.allow == [
+        r"test-dataset",
+        r"test-project.test-dataset",
+        r".*test-dataset",
+    ]
+    assert config.dataset_pattern.deny == [
+        r"^test-dataset-2$",
+        r"project\.second_dataset",
+    ]
+
+
 def test_bigquery_uri_with_credential():
     expected_credential_json = {
         "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",