diff --git a/ingestion/src/metadata/data_quality/validations/table/base/tableColumnToMatchSet.py b/ingestion/src/metadata/data_quality/validations/table/base/tableColumnToMatchSet.py index d135109351a..983b40313e7 100644 --- a/ingestion/src/metadata/data_quality/validations/table/base/tableColumnToMatchSet.py +++ b/ingestion/src/metadata/data_quality/validations/table/base/tableColumnToMatchSet.py @@ -16,6 +16,7 @@ Validator for table column to match set test case import collections import traceback from abc import abstractmethod +from typing import List from metadata.data_quality.validations.base_test_handler import BaseTestValidator from metadata.generated.schema.tests.basic import ( @@ -93,5 +94,5 @@ class BaseTableColumnToMatchSetValidator(BaseTestValidator): ) @abstractmethod - def _run_results(self): + def _run_results(self) -> List[str]: raise NotImplementedError diff --git a/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py b/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py index ad1718c50c9..efc09df5776 100644 --- a/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py +++ b/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py @@ -13,6 +13,7 @@ Validator for table column name to match set test case """ +from typing import List from metadata.data_quality.validations.mixins.pandas_validator_mixin import ( PandasValidatorMixin, @@ -30,7 +31,7 @@ class TableColumnToMatchSetValidator( ): """Validator table column name to match set test case""" - def _run_results(self): + def _run_results(self) -> List[str]: """compute result of the test case""" names = list(self.runner[0].columns) if not names: diff --git a/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py b/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py index 32743f2d441..4fc678f2341 100644 --- a/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py +++ b/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py @@ -14,9 +14,10 @@ Validator for table column name to match set test case """ -from typing import Optional +from typing import List, cast from sqlalchemy import inspect +from sqlalchemy.sql.base import ColumnCollection from metadata.data_quality.validations.mixins.sqa_validator_mixin import ( SQAValidatorMixin, @@ -34,11 +35,15 @@ class TableColumnToMatchSetValidator( ): """Validator for table column name to match set test case""" - def _run_results(self) -> Optional[int]: + def _run_results(self) -> List[str]: """compute result of the test case""" names = inspect(self.runner.table).c if not names: raise ValueError( f"Column names for test case {self.test_case.name} returned None" ) + names = cast( + ColumnCollection, names + ) # satisfy type checker for names.keys() access + names = list(names.keys()) return names diff --git a/ingestion/src/metadata/ingestion/source/database/bigtable/metadata.py b/ingestion/src/metadata/ingestion/source/database/bigtable/metadata.py index a2a072db53f..f635b9de7d6 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigtable/metadata.py +++ b/ingestion/src/metadata/ingestion/source/database/bigtable/metadata.py @@ -158,10 +158,10 @@ class BigtableSource(CommonNoSQLSource, MultiDBSource): records = [{"row_key": b"row_key"}] # In order to get a "good" sample of data, we try to distribute the sampling # across multiple column families. - for cf in list(column_families.keys())[:MAX_COLUMN_FAMILIES]: + for column_family in list(column_families.keys())[:MAX_COLUMN_FAMILIES]: records.extend( self._get_records_for_column_family( - table, cf, SAMPLES_PER_COLUMN_FAMILY + table, column_family, SAMPLES_PER_COLUMN_FAMILY ) ) if len(records) >= GLOBAL_SAMPLE_SIZE: diff --git a/ingestion/src/metadata/ingestion/source/database/bigtable/models.py b/ingestion/src/metadata/ingestion/source/database/bigtable/models.py index f8da387c8a5..146cc658b00 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigtable/models.py +++ b/ingestion/src/metadata/ingestion/source/database/bigtable/models.py @@ -39,22 +39,22 @@ class Row(BaseModel): @classmethod def from_partial_row(cls, row: PartialRowData): cells = {} - for cf, cf_cells in row.cells.items(): - cells.setdefault(cf, {}) + for column_family, cf_cells in row.cells.items(): + cells.setdefault(column_family, {}) for column, cell in cf_cells.items(): - cells[cf][column] = Cell( + cells[column_family][column] = Cell( values=[Value(timestamp=c.timestamp, value=c.value) for c in cell] ) return cls(cells=cells, row_key=row.row_key) def to_record(self) -> Dict[str, bytes]: record = {} - for cf, cells in self.cells.items(): + for column_family, cells in self.cells.items(): for column, cell in cells.items(): # Since each cell can have multiple values and the API returns them in descending order # from latest to oldest, we only take the latest value. This probably does not matter since # all we care about is data types and all data stored in BigTable is of type `bytes`. - record[f"{cf}.{column.decode()}"] = cell.values[0].value + record[f"{column_family}.{column.decode()}"] = cell.values[0].value record["row_key"] = self.row_key return record