diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index b8e6a2ffcc..f94f196033 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -42,6 +42,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - `DataHubGraph.parse_sql_lineage(default_dialect=...)` → `DataHubGraph.parse_sql_lineage(override_dialect=...)` - `LineageClient.add_lineage_via_sql(default_dialect=...)` → `LineageClient.add_lineage_via_sql(override_dialect=...)` - #14059: The `acryl-datahub-gx-plugin` now requires pydantic v2, which means the effective minimum supported version of GX is 0.17.15 (from Sept 2023). +- #13601: The `use_queries_v2` flag is now enabled by default for Snowflake and BigQuery ingestion. This improves lineage quality and increases the number of queries extracted. To keep the previous behavior, set `use_queries_v2: false` in the source config. ### Known Issues diff --git a/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml b/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml index fdab32c404..87e0a6bda7 100644 --- a/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml +++ b/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml @@ -1,12 +1,9 @@ source: type: snowflake config: - # This option is recommended to be used to ingest all lineage + # Recommended for the first run, so that all lineage is ingested. ignore_start_time_lineage: true - # This flag tells the snowflake ingestion to use the more advanced query parsing. This will become the default eventually. 
- use_queries_v2: true - # Coordinates account_id: "abc48144" warehouse: "COMPUTE_WH" diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 29d13da550..729202c71e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -342,7 +342,7 @@ class BigQueryV2Config( ) use_queries_v2: bool = Field( - default=False, + default=True, description="If enabled, uses the new queries extractor to extract queries from bigquery.", ) include_queries: bool = Field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 508d506326..e344123c94 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -236,7 +236,7 @@ class SnowflakeV2Config( ) use_queries_v2: bool = Field( - default=False, + default=True, description="If enabled, uses the new queries extractor to extract queries from snowflake.", ) include_queries: bool = Field( diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index e24d545398..f4ffa02c28 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -457,7 +457,7 @@ def test_bigquery_queries_v2_ingest( # if use_queries_v2 is set. 
pipeline_config_dict: Dict[str, Any] = recipe( mcp_output_path=mcp_output_path, - source_config_override={"use_queries_v2": True, "include_table_lineage": False}, + source_config_override={"include_table_lineage": False}, ) run_and_get_pipeline(pipeline_config_dict) @@ -564,7 +564,6 @@ LIMIT 100 pipeline_config_dict: Dict[str, Any] = recipe( mcp_output_path=mcp_output_path, source_config_override={ - "use_queries_v2": True, "include_schema_metadata": False, "include_table_lineage": True, "include_usage_statistics": True, diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 35f9028bcb..f8b13017f0 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -125,6 +125,7 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): validate_upstreams_against_patterns=False, include_operational_stats=True, incremental_lineage=False, + use_queries_v2=False, start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace( tzinfo=timezone.utc ), @@ -220,6 +221,7 @@ def test_snowflake_tags_as_structured_properties( password="TST_PWD", match_fully_qualified_names=True, schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), + use_queries_v2=False, include_technical_schema=True, include_table_lineage=False, include_column_lineage=False, @@ -286,6 +288,7 @@ def test_snowflake_private_link_and_incremental_mcps( include_views=True, include_usage_stats=False, format_sql_queries=True, + use_queries_v2=False, incremental_lineage=False, incremental_properties=True, include_operational_stats=False, diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index 8a8250e2ab..3fa80960bc 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ 
b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -59,6 +59,7 @@ def snowflake_pipeline_config(tmp_path): match_fully_qualified_names=True, schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_usage_stats=False, + use_queries_v2=False, start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace( tzinfo=timezone.utc, ), diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py index ffecb5f9dc..df5fba0adb 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py @@ -32,6 +32,7 @@ def stateful_pipeline_config(include_tables: bool) -> PipelineConfig: schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_tables=include_tables, incremental_lineage=False, + use_queries_v2=False, stateful_ingestion=StatefulStaleMetadataRemovalConfig.parse_obj( { "enabled": True, diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py index 86ffdf33f5..a39a279620 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py @@ -33,6 +33,7 @@ def test_snowflake_tag_pattern(): include_column_lineage=False, include_usage_stats=False, include_operational_stats=False, + use_queries_v2=False, extract_tags=TagOption.without_lineage, ) @@ -76,6 +77,7 @@ def test_snowflake_tag_pattern_deny(): include_column_lineage=False, include_usage_stats=False, include_operational_stats=False, + use_queries_v2=False, extract_tags=TagOption.without_lineage, ) @@ -116,6 +118,7 @@ def test_snowflake_structured_property_pattern_deny(): schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), extract_tags_as_structured_properties=True, 
structured_properties_template_cache_invalidation_interval=0, + use_queries_v2=False, tag_pattern=AllowDenyPattern( deny=["TEST_DB.TEST_SCHEMA.my_tag_2:my_value_2"] ),