mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-25 08:58:26 +00:00
feat(ingest/bigquery): Attempt to support raw dataset pattern (#9109)
This commit is contained in:
parent
32f5dcb154
commit
6c932e8afe
@ -53,10 +53,10 @@ into
|
||||
for example, using `datahub put` command. Policies can be also removed and re-created via UI.
|
||||
- #9077 - The BigQuery ingestion source by default sets `match_fully_qualified_names: true`.
|
||||
This means that any `dataset_pattern` or `schema_pattern` specified will be matched on the fully
|
||||
qualified dataset name, i.e. `<project_name>.<dataset_name>`. If this is not the case, please
|
||||
update your pattern (e.g. prepend your old dataset pattern with `.*\.` which matches the project part),
|
||||
or set `match_fully_qualified_names: false` in your recipe. However, note that
|
||||
setting this to `false` is deprecated and this flag will be removed entirely in a future release.
|
||||
qualified dataset name, i.e. `<project_name>.<dataset_name>`. We attempt to support the old
|
||||
pattern format by prepending `.*\.` to dataset patterns lacking a period, so in most cases this
|
||||
should not cause any issues. However, if you have a complex dataset pattern, we recommend you
|
||||
manually convert it to the fully qualified format to avoid any potential issues.
|
||||
|
||||
### Potential Downtime
|
||||
|
||||
|
||||
@ -299,7 +299,7 @@ class BigQueryV2Config(
|
||||
"use project_id_pattern whenever possible. project_id will be deprecated, please use project_id_pattern only if possible."
|
||||
)
|
||||
|
||||
dataset_pattern = values.get("dataset_pattern")
|
||||
dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
|
||||
schema_pattern = values.get("schema_pattern")
|
||||
if (
|
||||
dataset_pattern == AllowDenyPattern.allow_all()
|
||||
@ -329,6 +329,22 @@ class BigQueryV2Config(
|
||||
"Please update `dataset_pattern` to match against fully qualified schema name `<project_id>.<dataset_name>` and set config `match_fully_qualified_names : True`."
|
||||
"The config option `match_fully_qualified_names` is deprecated and will be removed in a future release."
|
||||
)
|
||||
elif match_fully_qualified_names and dataset_pattern is not None:
|
||||
adjusted = False
|
||||
for lst in [dataset_pattern.allow, dataset_pattern.deny]:
|
||||
for i, pattern in enumerate(lst):
|
||||
if "." not in pattern:
|
||||
if pattern.startswith("^"):
|
||||
lst[i] = r"^.*\." + pattern[1:]
|
||||
else:
|
||||
lst[i] = r".*\." + pattern
|
||||
adjusted = True
|
||||
if adjusted:
|
||||
logger.warning(
|
||||
"`dataset_pattern` was adjusted to match against fully qualified schema names,"
|
||||
" of the form `<project_id>.<dataset_name>`."
|
||||
)
|
||||
|
||||
return values
|
||||
|
||||
def get_table_pattern(self, pattern: List[str]) -> str:
|
||||
|
||||
@ -53,6 +53,59 @@ def test_bigquery_uri_on_behalf():
|
||||
assert config.get_sql_alchemy_url() == "bigquery://test-project-on-behalf"
|
||||
|
||||
|
||||
def test_bigquery_dataset_pattern():
    r"""Verify how `dataset_pattern` entries look after the config is parsed.

    The same allow/deny lists are parsed twice:

    * without `match_fully_qualified_names` in the recipe, entries that
      contain no `.` are expected to come back prefixed with `.*\.`
      (inserted after a leading `^` when one is present), while entries
      that already contain a `.` are expected to be left alone;
    * with `match_fully_qualified_names` set to False, every entry is
      expected to come back exactly as it was written.
    """

    def make_recipe(**overrides):
        # Build a fresh dict (and fresh lists) for every parse so the two
        # scenarios below never share any objects.
        return {
            "dataset_pattern": {
                "allow": [
                    "test-dataset",
                    "test-project.test-dataset",
                    ".*test-dataset",
                ],
                "deny": [
                    "^test-dataset-2$",
                    "project\\.second_dataset",
                ],
            },
            **overrides,
        }

    # Scenario 1: flag omitted -> period-less patterns get the project prefix.
    adjusted_cfg = BigQueryV2Config.parse_obj(make_recipe())
    expected_allow = [
        r".*\.test-dataset",
        r"test-project.test-dataset",
        r".*test-dataset",
    ]
    expected_deny = [
        r"^.*\.test-dataset-2$",
        r"project\.second_dataset",
    ]
    assert adjusted_cfg.dataset_pattern.allow == expected_allow
    assert adjusted_cfg.dataset_pattern.deny == expected_deny

    # Scenario 2: flag explicitly disabled -> patterns pass through untouched.
    untouched_cfg = BigQueryV2Config.parse_obj(
        make_recipe(match_fully_qualified_names=False)
    )
    expected_allow = [
        r"test-dataset",
        r"test-project.test-dataset",
        r".*test-dataset",
    ]
    expected_deny = [
        r"^test-dataset-2$",
        r"project\.second_dataset",
    ]
    assert untouched_cfg.dataset_pattern.allow == expected_allow
    assert untouched_cfg.dataset_pattern.deny == expected_deny
|
||||
|
||||
|
||||
def test_bigquery_uri_with_credential():
|
||||
expected_credential_json = {
|
||||
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user