parent 01309249c8
commit 811f640a18
@@ -90,7 +90,8 @@
 "GEOGRAPHY",
 "ENUM",
 "JSON",
-"UUID"
+"UUID",
+"VARIANT"
 ]
 },
 "constraint": {
@@ -24,7 +24,8 @@ echo "Prepare Docker volume for the operators"
 cd docker/local-metadata
 echo "Starting Local Docker Containers"

-docker compose down && docker compose up --build -d
+echo "Using ingestion dependency: ${INGESTION_DEPENDENCY:-all}"
+docker compose down && docker compose build --build-arg INGESTION_DEPENDENCY="${INGESTION_DEPENDENCY:-all}" && docker compose up -d

 until curl -s -f "http://localhost:9200/_cat/indices/team_search_index"; do
 printf 'Checking if Elastic Search instance is up...\n'
@@ -31,9 +31,6 @@ ENV CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints
 # Add docker provider for the DockerOperator
 RUN pip install "apache-airflow[docker]==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}"

-RUN pip install "openmetadata-ingestion[all]"
-RUN pip uninstall openmetadata-ingestion -y
-
 FROM airflow as apis
 WORKDIR /openmetadata-airflow-apis
 COPY openmetadata-airflow-apis /openmetadata-airflow-apis
@@ -44,7 +41,7 @@ FROM apis as ingestion
 WORKDIR /ingestion
 COPY ingestion /ingestion

-ARG INGESTION_DEPENDENCY=all
+ARG INGESTION_DEPENDENCY
 RUN pip install --upgrade ".[${INGESTION_DEPENDENCY}]"

 # Uninstalling psycopg2-binary and installing psycopg2 instead
@@ -70,6 +70,23 @@ _TYPE_MAP = {
 SQA_RESERVED_ATTRIBUTES = ["metadata"]


+def map_types(col: Column, table_service_type):
+    """returns an ORM type"""
+
+    if col.arrayDataType:
+        return _TYPE_MAP.get(col.dataType)(item_type=col.arrayDataType)
+
+    if (
+        table_service_type == databaseService.DatabaseServiceType.Snowflake
+        and col.dataType == DataType.JSON
+    ):
+        from snowflake.sqlalchemy import VARIANT
+
+        return VARIANT
+
+    return _TYPE_MAP.get(col.dataType)
+
+
 def check_snowflake_case_sensitive(table_service_type, table_or_col) -> Optional[bool]:
     """Check whether column or table name are not uppercase for snowflake table.
     If so, then force quoting, If not return None to let engine backend handle the logic.
@@ -97,11 +114,10 @@ def build_orm_col(idx: int, col: Column, table_service_type) -> sqlalchemy.Colum
     As this is only used for INSERT/UPDATE/DELETE,
     there is no impact for our read-only purposes.
     """
     return sqlalchemy.Column(
         name=str(col.name.__root__),
-        type_=_TYPE_MAP.get(col.dataType)
-        if not col.arrayDataType
-        else _TYPE_MAP.get(col.dataType)(item_type=col.arrayDataType),
+        type_=map_types(col, table_service_type),
         primary_key=not bool(idx),  # The first col seen is used as PK
         quote=check_snowflake_case_sensitive(table_service_type, col.name.__root__),
         key=str(
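The new `map_types` helper centralizes the column-to-ORM-type dispatch that `build_orm_col` previously inlined: array columns still resolve through `_TYPE_MAP` with their item type, while Snowflake columns declared as `JSON` map to snowflake-sqlalchemy's `VARIANT` (the reason `"VARIANT"` is added to the data-type enum in the first hunk). Below is a minimal, self-contained sketch of that dispatch; the `Column`, `DataType`, `DatabaseServiceType`, and `VARIANT` definitions are stand-ins for the real OpenMetadata generated models and the snowflake-sqlalchemy type, only the branching mirrors the diff.

```python
from dataclasses import dataclass
from enum import Enum
from typing import Optional

import sqlalchemy


class DataType(Enum):  # stand-in for the generated DataType enum
    STRING = "STRING"
    ARRAY = "ARRAY"
    JSON = "JSON"


class DatabaseServiceType(Enum):  # stand-in for databaseService.DatabaseServiceType
    Snowflake = "Snowflake"
    Postgres = "Postgres"


@dataclass
class Column:  # stand-in for the generated Column model
    name: str
    dataType: DataType
    arrayDataType: Optional[DataType] = None


class VARIANT(sqlalchemy.types.TypeDecorator):  # stand-in for snowflake.sqlalchemy.VARIANT
    impl = sqlalchemy.JSON
    cache_ok = True


_TYPE_MAP = {  # trimmed-down version of the module-level map
    DataType.STRING: sqlalchemy.String,
    DataType.ARRAY: sqlalchemy.ARRAY,
    DataType.JSON: sqlalchemy.JSON,
}


def map_types(col: Column, table_service_type) -> object:
    """Return the SQLAlchemy type for a column, special-casing Snowflake JSON."""
    if col.arrayDataType:
        # ARRAY columns carry their item type separately
        return _TYPE_MAP.get(col.dataType)(item_type=col.arrayDataType)
    if (
        table_service_type == DatabaseServiceType.Snowflake
        and col.dataType == DataType.JSON
    ):
        return VARIANT
    return _TYPE_MAP.get(col.dataType)


print(map_types(Column("payload", DataType.JSON), DatabaseServiceType.Snowflake))  # -> VARIANT stand-in
print(map_types(Column("payload", DataType.JSON), DatabaseServiceType.Postgres))   # -> sqlalchemy.JSON
```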
@@ -110,6 +110,17 @@ class TestSuiteWorkflow:
             )
             raise err

+    def _filter_test_cases_for_table_entity(
+        self, table_fqn: str, test_cases: List[TestCase]
+    ) -> list[TestCase]:
+        """Filter test cases for specific entity"""
+        return [
+            test_case
+            for test_case in test_cases
+            if test_case.entityLink.__root__.split("::")[2].replace(">", "")
+            == table_fqn
+        ]
+
     def _get_unique_table_entities(self, test_cases: List[TestCase]) -> Set:
         """from a list of test cases extract unique table entities"""
         table_fqns = [
@@ -240,6 +251,7 @@ class TestSuiteWorkflow:
                 entity=TestSuite,
                 fqn=self.config.source.serviceName,
             )
+
             if test_suite:
                 return [test_suite]
             return None
@@ -377,7 +389,9 @@ class TestSuiteWorkflow:
         for table_fqn in unique_table_fqns:
             try:
                 sqa_interface = self._create_sqa_tests_runner_interface(table_fqn)
-                for test_case in test_cases:
+                for test_case in self._filter_test_cases_for_table_entity(
+                    table_fqn, test_cases
+                ):
                     try:
                         data_test_runner = self._create_data_tests_runner(sqa_interface)
                         test_result = data_test_runner.run_and_handle(test_case)
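The new `_filter_test_cases_for_table_entity` method keys off the entity link stored on each test case: splitting on `::` and stripping the trailing `>` from the third segment yields the table FQN, which is compared against the table currently being processed. A small self-contained sketch of that parsing follows; the helper names and the entity-link strings are illustrative, not taken from the module.

```python
from typing import List


def table_fqn_from_entity_link(entity_link: str) -> str:
    """Extract the table FQN from an entity link such as
    <#E::table::service.db.schema.table::columns::col>."""
    return entity_link.split("::")[2].replace(">", "")


def filter_for_table(table_fqn: str, entity_links: List[str]) -> List[str]:
    """Keep only the links whose table FQN matches the table being processed."""
    return [link for link in entity_links if table_fqn_from_entity_link(link) == table_fqn]


links = [
    "<#E::table::svc.db.schema.orders::columns::amount>",
    "<#E::table::svc.db.schema.customers>",
]
print(filter_for_table("svc.db.schema.orders", links))
# ['<#E::table::svc.db.schema.orders::columns::amount>']
```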
@@ -85,6 +85,18 @@ def column_value_length_to_be_between(
         ],
     )

+    if not max_value_length_value_res or not min_value_length_value_res:
+        msg = f"Error computing {test_case.name} for {runner.table.__tablename__}: missing max value length or min value length"
+        return TestCaseResult(
+            timestamp=execution_date,
+            testCaseStatus=TestCaseStatus.Aborted,
+            result=msg,
+            testResultValue=[
+                TestResultValue(name="minValueLength", value=None),
+                TestResultValue(name="maxValueLength", value=None),
+            ],
+        )
+
     min_bound = next(
         (
             float(param.value)
@@ -102,8 +114,8 @@ def column_value_length_to_be_between(

     status = (
         TestCaseStatus.Success
-        if min_bound >= min_value_length_value_res
-        and max_bound <= max_value_length_value_res
+        if min_bound <= min_value_length_value_res
+        and max_bound >= max_value_length_value_res
         else TestCaseStatus.Failed
     )
     result = (
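The second change in this module fixes an inverted comparison: the test should succeed only when the observed minimum value length is at least the configured lower bound and the observed maximum value length is at most the configured upper bound. A minimal sketch of the corrected check; the function name and numeric values are illustrative, not the module's actual signature.

```python
def value_length_within_bounds(
    min_bound: float, max_bound: float, observed_min: float, observed_max: float
) -> bool:
    """True when every observed value length falls inside [min_bound, max_bound]."""
    return min_bound <= observed_min and max_bound >= observed_max


# Observed lengths span 3..15 while the test allows 5..10: the corrected check fails
# the test, whereas the old (inverted) comparison would have marked it successful.
assert not value_length_within_bounds(5, 10, 3, 15)
# Observed lengths span 6..9, inside the allowed 5..10: passes.
assert value_length_within_bounds(5, 10, 6, 9)
```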
@@ -56,7 +56,7 @@ regex expression to filter tables.
 Sampling percentage to apply for profiling tables.

 **Thread Count**
-Number of thread to use when computing metrics for the profiler
+Number of threads to use when computing metrics for the profiler. For Snowflake users we recommend setting it to 1. There is a known issue with one of the dependencies (`snowflake-connector-python`) affecting projects in certain environments.

 **Ingest Sample Data**
 Whether the profiler should ingest sample data