MINOR - Fix column to match set test (#15186)

* fix: column value test for SQA types

* style: ran python linting
This commit is contained in:
Teddy 2024-02-23 16:35:58 +01:00 committed by GitHub
parent bdf27458e5
commit ba8208222e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 18 additions and 11 deletions

View File

@@ -16,6 +16,7 @@ Validator for table column to match set test case
import collections
import traceback
from abc import abstractmethod
from typing import List
from metadata.data_quality.validations.base_test_handler import BaseTestValidator
from metadata.generated.schema.tests.basic import (
@@ -93,5 +94,5 @@ class BaseTableColumnToMatchSetValidator(BaseTestValidator):
)
@abstractmethod
def _run_results(self):
def _run_results(self) -> List[str]:
raise NotImplementedError

View File

@@ -13,6 +13,7 @@
Validator for table column name to match set test case
"""
from typing import List
from metadata.data_quality.validations.mixins.pandas_validator_mixin import (
PandasValidatorMixin,
@@ -30,7 +31,7 @@ class TableColumnToMatchSetValidator(
):
"""Validator table column name to match set test case"""
def _run_results(self):
def _run_results(self) -> List[str]:
"""compute result of the test case"""
names = list(self.runner[0].columns)
if not names:

View File

@@ -14,9 +14,10 @@ Validator for table column name to match set test case
"""
from typing import Optional
from typing import List, cast
from sqlalchemy import inspect
from sqlalchemy.sql.base import ColumnCollection
from metadata.data_quality.validations.mixins.sqa_validator_mixin import (
SQAValidatorMixin,
@@ -34,11 +35,15 @@ class TableColumnToMatchSetValidator(
):
"""Validator for table column name to match set test case"""
def _run_results(self) -> Optional[int]:
def _run_results(self) -> List[str]:
"""compute result of the test case"""
names = inspect(self.runner.table).c
if not names:
raise ValueError(
f"Column names for test case {self.test_case.name} returned None"
)
names = cast(
ColumnCollection, names
) # satisfy type checker for names.keys() access
names = list(names.keys())
return names

View File

@@ -158,10 +158,10 @@ class BigtableSource(CommonNoSQLSource, MultiDBSource):
records = [{"row_key": b"row_key"}]
# In order to get a "good" sample of data, we try to distribute the sampling
# across multiple column families.
for cf in list(column_families.keys())[:MAX_COLUMN_FAMILIES]:
for column_family in list(column_families.keys())[:MAX_COLUMN_FAMILIES]:
records.extend(
self._get_records_for_column_family(
table, cf, SAMPLES_PER_COLUMN_FAMILY
table, column_family, SAMPLES_PER_COLUMN_FAMILY
)
)
if len(records) >= GLOBAL_SAMPLE_SIZE:

View File

@@ -39,22 +39,22 @@ class Row(BaseModel):
@classmethod
def from_partial_row(cls, row: PartialRowData):
cells = {}
for cf, cf_cells in row.cells.items():
cells.setdefault(cf, {})
for column_family, cf_cells in row.cells.items():
cells.setdefault(column_family, {})
for column, cell in cf_cells.items():
cells[cf][column] = Cell(
cells[column_family][column] = Cell(
values=[Value(timestamp=c.timestamp, value=c.value) for c in cell]
)
return cls(cells=cells, row_key=row.row_key)
def to_record(self) -> Dict[str, bytes]:
record = {}
for cf, cells in self.cells.items():
for column_family, cells in self.cells.items():
for column, cell in cells.items():
# Since each cell can have multiple values and the API returns them in descending order
# from latest to oldest, we only take the latest value. This probably does not matter since
# all we care about is data types and all data stored in BigTable is of type `bytes`.
record[f"{cf}.{column.decode()}"] = cell.values[0].value
record[f"{column_family}.{column.decode()}"] = cell.values[0].value
record["row_key"] = self.row_key
return record