From c9b9afc5307e1ae602675d67b37823a931a137f6 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 2 Apr 2024 07:29:27 -0700 Subject: [PATCH] feat(ingest/dbt): enable model performance and compiled code by default (#10164) --- docs/how/updating-datahub.md | 3 ++- .../src/datahub/ingestion/source/dbt/dbt_common.py | 8 ++------ .../src/datahub/ingestion/source/dbt/dbt_core.py | 6 +++--- metadata-ingestion/tests/integration/dbt/test_dbt.py | 9 +-------- metadata-ingestion/tests/unit/test_dbt_source.py | 2 +- 5 files changed, 9 insertions(+), 19 deletions(-) diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 60504aaa7b..8051777a5e 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -26,7 +26,8 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - #10055 - Assertion entities generated by dbt are now associated with the dbt dataset entity, and not the entity in the data warehouse. - #10090 - For Redshift ingestion, `use_lineage_v2` is now enabled by default. - #10147 - For looker ingestion, the browse paths for looker Dashboard, Chart, View, Explore have been updated to align with Looker UI. This does not affect URNs or lineage but primarily affects (improves) browsing experience. -- +- #10164 - For dbt ingestion, `entities_enabled.model_performance` and `include_compiled_code` are now both enabled by default. Upgrading dbt ingestion will also require upgrading the backend to 0.13.1. + ### Potential Downtime ### Deprecations diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 788a4f0b5d..4876e2b6fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -179,9 +179,7 @@ class DBTEntitiesEnabled(ConfigModel): description="Emit metadata for test results when set to Yes or Only", ) model_performance: EmitDirective = Field( - # TODO: This is currently disabled by default, but will be enabled by default once - # the models have stabilized. - EmitDirective.NO, + EmitDirective.YES, description="Emit model performance metadata when set to Yes or Only. " "Only supported with dbt core.", ) @@ -349,9 +347,7 @@ class DBTCommonConfig( _remove_use_compiled_code = pydantic_removed_field("use_compiled_code") include_compiled_code: bool = Field( - # TODO: Once the formattedViewLogic field model change is included in a server - # release, probably 0.13.1, we can flip the default to True. - default=False, + default=True, description="When enabled, includes the compiled code in the emitted metadata.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py index d04fa59ecb..c885ee6525 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py @@ -53,9 +53,9 @@ class DBTCoreConfig(DBTCommonConfig): run_results_paths: List[str] = Field( default=[], description="Path to output of dbt test run as run_results files in JSON format. " - "If invoking dbt multiple times, you can provide paths to multiple run result files." - "See https://docs.getdbt.com/reference/artifacts/run-results-json. " - "If not specified, test execution results will not be populated in DataHub.", + "If not specified, test execution results and model performance metadata will not be populated in DataHub." + "If invoking dbt multiple times, you can provide paths to multiple run result files. " + "See https://docs.getdbt.com/reference/artifacts/run-results-json.", ) # Because we now also collect model performance metadata, the "test_results" field was renamed to "run_results". diff --git a/metadata-ingestion/tests/integration/dbt/test_dbt.py b/metadata-ingestion/tests/integration/dbt/test_dbt.py index 953ff24f7b..5f7d65f5b2 100644 --- a/metadata-ingestion/tests/integration/dbt/test_dbt.py +++ b/metadata-ingestion/tests/integration/dbt/test_dbt.py @@ -26,8 +26,6 @@ GMS_SERVER = f"http://localhost:{GMS_PORT}" _default_dbt_source_args = { # Needed to avoid needing to access datahub server. "write_semantics": "OVERRIDE", - # Needed until this is made the default. - "include_compiled_code": True, } @@ -216,12 +214,7 @@ class DbtTestConfig: manifest_file="sample_dbt_manifest_2.json", sources_file="sample_dbt_sources_2.json", run_results_files=["sample_dbt_run_results_2.json"], - source_config_modifiers={ - "entities_enabled": { - # TODO: Remove this once it becomes the default. - "model_performance": "YES", - }, - }, + source_config_modifiers={}, ), ], ids=lambda dbt_test_config: dbt_test_config.run_id, diff --git a/metadata-ingestion/tests/unit/test_dbt_source.py b/metadata-ingestion/tests/unit/test_dbt_source.py index 91a4e568d8..b0db18594f 100644 --- a/metadata-ingestion/tests/unit/test_dbt_source.py +++ b/metadata-ingestion/tests/unit/test_dbt_source.py @@ -293,7 +293,7 @@ def test_dbt_entity_emission_configuration_helpers(): assert config.entities_enabled.can_emit_node_type("source") assert config.entities_enabled.can_emit_node_type("test") assert config.entities_enabled.can_emit_test_results - assert not config.entities_enabled.can_emit_model_performance + assert config.entities_enabled.can_emit_model_performance assert not config.entities_enabled.is_only_test_results() config_dict = {