feat: add dbt freshness check test (#18730)

* add dbt freshness check * docs * run linting * add test case param definition * fix test case param definition * add config for dbt http, fix linting * refactor (only create freshness test definition when user executed one) * fix dbt files class * fix dbt files class 2 * fix dbt objects class * fix linting * fix pylint * fix linting once and for all --------- Co-authored-by: Teddy <teddy.crepineau@gmail.com>
2025-10-05 22:06:41 +00:00 · 2024-11-28 18:30:11 +01:00 · 2024-11-28 18:30:11 +01:00 · da176767a8
commit da176767a8
parent 6410583018
9 changed files with 143 additions and 3 deletions
--- a/ingestion/src/metadata/ingestion/source/database/dbt/constants.py
+++ b/ingestion/src/metadata/ingestion/source/database/dbt/constants.py
@ -82,6 +82,7 @@ NONE_KEYWORDS_LIST = ["none", "null"]
 DBT_CATALOG_FILE_NAME = "catalog.json"
 DBT_MANIFEST_FILE_NAME = "manifest.json"
 DBT_RUN_RESULTS_FILE_NAME = "run_results"
 DBT_SOURCES_FILE_NAME = "sources.json"
 class SkipResourceTypeEnum(Enum):
@ -91,6 +92,7 @@ class SkipResourceTypeEnum(Enum):
    ANALYSIS = "analysis"
    TEST = "test"
    SOURCE = "source"
 class CompiledQueriesEnum(Enum):
@ -127,6 +129,7 @@ class DbtTestFailureEnum(Enum):
    FAILURE = "failure"
    FAIL = "fail"
    ERROR = "error"
 class DbtCommonEnum(Enum):
@ -137,6 +140,7 @@ class DbtCommonEnum(Enum):
    OWNER = "owner"
    NODES = "nodes"
    SOURCES = "sources"
    SOURCES_FILE = "sources_file"
    SOURCE = "source"
    RESOURCETYPE = "resource_type"
    MANIFEST_NODE = "manifest_node"
--- a/ingestion/src/metadata/ingestion/source/database/dbt/dbt_config.py
+++ b/ingestion/src/metadata/ingestion/source/database/dbt/dbt_config.py
@ -43,6 +43,7 @@ from metadata.ingestion.source.database.dbt.constants import (
    DBT_CATALOG_FILE_NAME,
    DBT_MANIFEST_FILE_NAME,
    DBT_RUN_RESULTS_FILE_NAME,
    DBT_SOURCES_FILE_NAME,
 )
 from metadata.ingestion.source.database.dbt.models import DbtFiles
 from metadata.readers.file.config_source_factory import get_reader
@ -85,6 +86,7 @@ def _(config: DbtLocalConfig):
            config.dbtManifestFilePath,
            config.dbtCatalogFilePath,
            config.dbtRunResultsFilePath,
            config.dbtSourcesFilePath,
        ]
        yield from download_dbt_files(
            blob_grouped_by_directory=blob_grouped_by_directory,
@ -123,12 +125,22 @@ def _(config: DbtHttpConfig):
            dbt_catalog = requests.get(  # pylint: disable=missing-timeout
                config.dbtCatalogHttpPath
            )
        dbt_sources = None
        if config.dbtSourcesHttpPath:
            logger.debug(
                f"Requesting [dbtSourcesHttpPath] to: {config.dbtSourcesHttpPath}"
            )
            dbt_sources = requests.get(  # pylint: disable=missing-timeout
                config.dbtSourcesHttpPath
            )
        if not dbt_manifest:
            raise DBTConfigException("Manifest file not found in file server")
        yield DbtFiles(
            dbt_catalog=dbt_catalog.json() if dbt_catalog else None,
            dbt_manifest=dbt_manifest.json(),
            dbt_run_results=[dbt_run_results.json()] if dbt_run_results else None,
            dbt_sources=dbt_sources.json() if dbt_sources else None,
        )
    except DBTConfigException as exc:
        raise exc
@ -243,6 +255,7 @@ def get_blobs_grouped_by_dir(blobs: List[str]) -> Dict[str, List[str]]:
    return blob_grouped_by_directory
 # pylint: disable=too-many-locals, too-many-branches
 def download_dbt_files(
    blob_grouped_by_directory: Dict, config, client, bucket_name: Optional[str]
 ) -> Iterable[DbtFiles]:
@ -255,6 +268,7 @@ def download_dbt_files(
    ) in blob_grouped_by_directory.items():
        dbt_catalog = None
        dbt_manifest = None
        dbt_sources = None
        dbt_run_results = []
        kwargs = {}
        if bucket_name:
@ -285,12 +299,16 @@ def download_dbt_files(
                            logger.warning(
                                f"{DBT_RUN_RESULTS_FILE_NAME} not found in {key}: {exc}"
                            )
                    if DBT_SOURCES_FILE_NAME == blob_file_name.lower():
                        logger.debug(f"{DBT_SOURCES_FILE_NAME} found in {key}")
                        dbt_sources = reader.read(path=blob, **kwargs)
            if not dbt_manifest:
                raise DBTConfigException(f"Manifest file not found at: {key}")
            yield DbtFiles(
                dbt_catalog=json.loads(dbt_catalog) if dbt_catalog else None,
                dbt_manifest=json.loads(dbt_manifest),
                dbt_run_results=dbt_run_results if dbt_run_results else None,
                dbt_sources=json.loads(dbt_sources) if dbt_sources else None,
            )
        except DBTConfigException as exc:
            logger.warning(exc)
--- a/ingestion/src/metadata/ingestion/source/database/dbt/dbt_service.py
+++ b/ingestion/src/metadata/ingestion/source/database/dbt/dbt_service.py
@ -15,7 +15,12 @@ DBT service Topology.
 from abc import ABC, abstractmethod
 from typing import Iterable, List
-from dbt_artifacts_parser.parser import parse_catalog, parse_manifest, parse_run_results
+from dbt_artifacts_parser.parser import (
    parse_catalog,
    parse_manifest,
    parse_run_results,
    parse_sources,
 )
 from pydantic import Field
 from typing_extensions import Annotated
@ -209,11 +214,13 @@ class DbtServiceSource(TopologyRunnerMixin, Source, ABC):
            self.remove_run_result_non_required_keys(
                run_results=self.context.get().dbt_file.dbt_run_results
            )
        dbt_objects = DbtObjects(
            dbt_catalog=parse_catalog(self.context.get().dbt_file.dbt_catalog)
            if self.context.get().dbt_file.dbt_catalog
            else None,
            dbt_manifest=parse_manifest(self.context.get().dbt_file.dbt_manifest),
            dbt_sources=parse_sources(self.context.get().dbt_file.dbt_sources),
            dbt_run_results=[
                parse_run_results(run_result_file)
                for run_result_file in self.context.get().dbt_file.dbt_run_results
--- a/ingestion/src/metadata/ingestion/source/database/dbt/dbt_utils.py
+++ b/ingestion/src/metadata/ingestion/source/database/dbt/dbt_utils.py
@ -44,6 +44,20 @@ def create_test_case_parameter_definitions(dbt_test):
                }
            ]
            return test_case_param_definition
        if hasattr(dbt_test, "freshness"):
            test_case_param_definition = [
                {
                    "name": "warn_after",
                    "displayName": "warn_after",
                    "required": False,
                },
                {
                    "name": "error_after",
                    "displayName": "error_after",
                    "required": False,
                },
            ]
            return test_case_param_definition
    except Exception as err:  # pylint: disable=broad-except
        logger.debug(traceback.format_exc())
        logger.error(
@ -67,6 +81,21 @@ def create_test_case_parameter_values(dbt_test):
                {"name": manifest_node.test_metadata.name, "value": dbt_test_values}
            ]
            return test_case_param_values
        if hasattr(manifest_node, "freshness"):
            warn_after = manifest_node.freshness.warn_after
            error_after = manifest_node.freshness.error_after
            test_case_param_values = [
                {
                    "name": "error_after",
                    "value": f"{error_after.count} {error_after.period.value}",
                },
                {
                    "name": "warn_after",
                    "value": f"{warn_after.count} {warn_after.period.value}",
                },
            ]
            return test_case_param_values
    except Exception as err:  # pylint: disable=broad-except
        logger.debug(traceback.format_exc())
        logger.error(
--- a/ingestion/src/metadata/ingestion/source/database/dbt/metadata.py
+++ b/ingestion/src/metadata/ingestion/source/database/dbt/metadata.py
@ -13,6 +13,7 @@
 DBT source methods.
 """
 import traceback
 from copy import deepcopy
 from datetime import datetime
 from typing import Any, Iterable, List, Optional, Union
@ -324,7 +325,41 @@ class DbtSource(DbtServiceSource):
            None,
        )
-    # pylint: disable=too-many-locals, too-many-branches
+    def _add_dbt_freshness_test_from_sources(
        self, key: str, manifest_node, manifest_entities, dbt_objects: DbtObjects
    ):
        # in dbt manifest sources node name is table/view name (not test name like with test nodes)
        # so in order for the test creation to be named precisely I am amending manifest node name within it's deepcopy
        manifest_node_new = deepcopy(manifest_node)
        manifest_node_new.name = manifest_node_new.name + "_freshness"
        freshness_test_result = next(
            (item for item in dbt_objects.dbt_sources.results if item.unique_id == key),
            None,
        )
        if freshness_test_result:
            self.context.get().dbt_tests[key + "_freshness"] = {
                DbtCommonEnum.MANIFEST_NODE.value: manifest_node_new
            }
            self.context.get().dbt_tests[key + "_freshness"][
                DbtCommonEnum.UPSTREAM.value
            ] = self.parse_upstream_nodes(manifest_entities, manifest_node)
            self.context.get().dbt_tests[key + "_freshness"][
                DbtCommonEnum.RESULTS.value
            ] = freshness_test_result
    def add_dbt_sources(
        self, key: str, manifest_node, manifest_entities, dbt_objects: DbtObjects
    ) -> None:
        """
        Method to append dbt test cases based on sources file for later processing
        """
        self._add_dbt_freshness_test_from_sources(
            key, manifest_node, manifest_entities, dbt_objects
        )
    # pylint: disable=too-many-locals, too-many-branches, too-many-statements
    def yield_data_models(
        self, dbt_objects: DbtObjects
    ) -> Iterable[Either[DataModelLink]]:
@ -376,6 +411,17 @@ class DbtSource(DbtServiceSource):
                        )
                        continue
                    if (
                        dbt_objects.dbt_sources
                        and resource_type == SkipResourceTypeEnum.SOURCE.value
                    ):
                        self.add_dbt_sources(
                            key,
                            manifest_node=manifest_node,
                            manifest_entities=manifest_entities,
                            dbt_objects=dbt_objects,
                        )
                    # Skip the ephemeral nodes since it is not materialized
                    if check_ephemeral_node(manifest_node):
                        logger.debug(f"Skipping ephemeral DBT node: {key}.")
@ -549,6 +595,29 @@ class DbtSource(DbtServiceSource):
                        f"Failed to parse the DBT node {node} to get upstream nodes: {exc}"
                    )
                    continue
        if dbt_node.resource_type == SkipResourceTypeEnum.SOURCE.value:
            parent_fqn = fqn.build(
                self.metadata,
                entity_type=Table,
                service_name="*",
                database_name=get_corrected_name(dbt_node.database),
                schema_name=get_corrected_name(dbt_node.schema_),
                table_name=dbt_node.name,
            )
            # check if the parent table exists in OM before adding it to the upstream list
            parent_table_entity: Optional[
                Union[Table, List[Table]]
            ] = get_entity_from_es_result(
                entity_list=self.metadata.es_search_from_fqn(
                    entity_type=Table, fqn_search_string=parent_fqn
                ),
                fetch_multiple_entities=False,
            )
            if parent_table_entity:
                upstream_nodes.append(parent_fqn)
        return upstream_nodes
    def parse_data_model_columns(
--- a/ingestion/src/metadata/ingestion/source/database/dbt/models.py
+++ b/ingestion/src/metadata/ingestion/source/database/dbt/models.py
@ -20,12 +20,14 @@ from pydantic import BaseModel
 class DbtFiles(BaseModel):
    dbt_catalog: Optional[dict] = None
    dbt_manifest: dict
    dbt_sources: Optional[dict] = None
    dbt_run_results: Optional[List[dict]] = None
 class DbtObjects(BaseModel):
    dbt_catalog: Optional[Any] = None
    dbt_manifest: Any
    dbt_sources: Optional[Any] = None
    dbt_run_results: Optional[List[Any]] = None
--- a/ingestion/tests/unit/test_dbt.py
+++ b/ingestion/tests/unit/test_dbt.py
@ -51,6 +51,7 @@ mock_dbt_config = {
                    "dbtCatalogFilePath": "sample/dbt_files/catalog.json",
                    "dbtManifestFilePath": "sample/dbt_files/manifest.json",
                    "dbtRunResultsFilePath": "sample/dbt_files/run_results.json",
                    "dbtSourcesFilePath": "sample/dbt_files/sources.json",
                },
            }
        },
@ -682,7 +683,7 @@ class DbtUnitTest(TestCase):
            self.assertEqual(expected, original)
    @patch("metadata.ingestion.ometa.mixins.es_mixin.ESMixin.es_search_from_fqn")
-    def test_updtream_nodes_for_lineage(self, es_search_from_fqn):
+    def test_upstream_nodes_for_lineage(self, es_search_from_fqn):
        expected_upstream_nodes = [
            "model.jaffle_shop.stg_customers",
            "model.jaffle_shop.stg_orders",
--- a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/dbtconfig/dbtHttpConfig.json
+++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/dbtconfig/dbtHttpConfig.json
@ -26,6 +26,11 @@
      "title": "DBT Run Results HTTP File Path",
      "description": "DBT run results http file path to extract the test results information.",
      "type": "string"
    },
    "dbtSourcesHttpPath": {
      "title": "DBT Sources HTTP File Path",
      "description": "DBT sources http file path to extract freshness test results information.",
      "type": "string"
    }
  },
  "additionalProperties": false,
--- a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/dbtconfig/dbtLocalConfig.json
+++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/dbtconfig/dbtLocalConfig.json
@ -26,6 +26,11 @@
      "title": "DBT Run Results File Path",
      "description": "DBT run results file path to extract the test results information.",
      "type": "string"
    },
    "dbtSourcesFilePath": {
      "title": "DBT Sources File Path",
      "description": "DBT sources file path to extract the freshness test result.",
      "type": "string"
    }
  },
  "additionalProperties": false,