chore(ingest): bump and pin mypy (#6584)

Harshal Sheth authored on 2022-12-02 13:53:28 -05:00; committed by GitHub
parent 1689212434
commit 44cfd21a65
26 changed files with 64 additions and 46 deletions

View File

@ -27,6 +27,7 @@ plugins =
exclude = ^(venv|build|dist)/
ignore_missing_imports = yes
namespace_packages = no
implicit_optional = no
strict_optional = yes
check_untyped_defs = yes
disallow_incomplete_defs = yes
@ -38,8 +39,16 @@ disallow_untyped_defs = no
# try to be a bit more strict in certain areas of the codebase
[mypy-datahub.*]
ignore_missing_imports = no
[mypy-datahub_provider.*]
ignore_missing_imports = no
[mypy-tests.*]
ignore_missing_imports = no
+ [mypy-google.protobuf.*]
+ # mypy sometimes ignores the above ignore_missing_imports = yes
+ # See https://github.com/python/mypy/issues/10632 and
+ # https://github.com/python/mypy/issues/10619#issuecomment-1174208395
+ # for a discussion of why this happens.
+ ignore_missing_imports = yes
[mypy-datahub.configuration.*]
disallow_untyped_defs = yes
[mypy-datahub.emitter.*]

View File

@ -385,8 +385,7 @@ mypy_stubs = {
"types-ujson>=5.2.0",
"types-termcolor>=1.0.0",
"types-Deprecated",
- # Mypy complains with 4.21.0.0 => error: Library stubs not installed for "google.protobuf.descriptor"
- "types-protobuf<4.21.0.0",
+ "types-protobuf>=4.21.0.1",
}
base_dev_requirements = {
@ -399,10 +398,7 @@ base_dev_requirements = {
"flake8>=3.8.3",
"flake8-tidy-imports>=4.3.0",
"isort>=5.7.0",
- # mypy 0.990 enables namespace packages by default and sets
- # no implicit optional to True.
- # FIXME: Enable mypy 0.990 when our codebase is fixed.
- "mypy>=0.981,<0.990",
+ "mypy==0.991",
# pydantic 1.8.2 is incompatible with mypy 0.910.
# See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910.
# Restricting top version to <1.10 until we can fix our types.
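The motivation for most of the per-file changes that follow: mypy 0.990+ turns off implicit Optional by default (mirrored by implicit_optional = no in setup.cfg above), so a parameter annotated with a plain type but defaulting to None is now an error. A minimal, hypothetical sketch of the before/after pattern (illustrative only, not part of the diff):

    from typing import Optional

    # Flagged by mypy 0.991: a None default no longer implies Optional.
    #     def describe(count: int, label: str = None) -> str: ...
    # Accepted: the Optional is spelled out explicitly.
    def describe(count: int, label: Optional[str] = None) -> str:
        return f"{count} {label or 'items'}"

    print(describe(3))           # "3 items"
    print(describe(1, "table"))  # "1 table"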

View File

@ -122,8 +122,6 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
"operationType": operation_type,
"partition": partition,
}
- if filter
- else None
),
},
)

View File

@ -80,7 +80,7 @@ class OperationalError(PipelineExecutionError):
message: str
info: dict
- def __init__(self, message: str, info: dict = None):
+ def __init__(self, message: str, info: Optional[dict] = None):
self.message = message
self.info = info or {}

View File

@ -120,7 +120,12 @@ class Source(Closeable, metaclass=ABCMeta):
@classmethod
def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
- pass
+ # Technically, this method should be abstract. However, the @config_class
+ # decorator automatically generates a create method at runtime if one is
+ # not defined, and Python fixes a class's abstract methods when the class is
+ # created, so the injected create would not make the class concrete. To keep
+ # those classes instantiable, we can't mark this method @abstractmethod.
+ raise NotImplementedError('sources must implement "create"')
@abstractmethod
def get_workunits(self) -> Iterable[WorkUnit]:
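The comment above describes a pattern worth spelling out: the base method raises NotImplementedError instead of being @abstractmethod, because Python fixes a class's abstract methods at class-creation time, so a create() injected later by a decorator would not un-abstract the class. A self-contained sketch with hypothetical names (the real @config_class decorator in datahub may work differently):

    from abc import ABCMeta
    from typing import Any, Dict

    class BaseSource(metaclass=ABCMeta):
        @classmethod
        def create(cls, config_dict: Dict[str, Any]) -> "BaseSource":
            # Deliberately not @abstractmethod: a decorator may attach create()
            # after the class body runs, and ABCMeta would still consider the
            # class abstract if create were marked abstract at definition time.
            raise NotImplementedError('sources must implement "create"')

    def autocreate(klass):  # hypothetical stand-in for the @config_class decorator
        if "create" not in klass.__dict__:
            klass.create = classmethod(lambda cls, config_dict: cls())
        return klass

    @autocreate
    class CsvSource(BaseSource):
        pass

    print(CsvSource.create({}))  # works: create() was injected at runtime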

View File

@ -1,5 +1,5 @@
from dataclasses import dataclass
- from typing import Iterable, Union, overload
+ from typing import Iterable, Optional, Union, overload
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.source import WorkUnit
@ -42,9 +42,9 @@ class MetadataWorkUnit(WorkUnit):
def __init__(
self,
id: str,
- mce: MetadataChangeEvent = None,
- mcp: MetadataChangeProposalWrapper = None,
- mcp_raw: MetadataChangeProposal = None,
+ mce: Optional[MetadataChangeEvent] = None,
+ mcp: Optional[MetadataChangeProposalWrapper] = None,
+ mcp_raw: Optional[MetadataChangeProposal] = None,
treat_errors_as_warnings: bool = False,
):
super().__init__(id)

View File

@ -2,7 +2,7 @@ import logging
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime, timezone
- from typing import Dict, List, Optional
+ from typing import Dict, List, Optional, cast
from google.cloud import bigquery
from google.cloud.bigquery.table import RowIterator, TableListItem, TimePartitioning
@ -280,6 +280,8 @@ class BigQueryDataDictionary:
def get_datasets_for_project_id(
conn: bigquery.Client, project_id: str, maxResults: Optional[int] = None
) -> List[BigqueryDataset]:
+ # FIXME: Due to a bug in BigQuery's type annotations, we need to cast here.
+ maxResults = cast(int, maxResults)
datasets = conn.list_datasets(project_id, max_results=maxResults)
return [BigqueryDataset(name=d.dataset_id) for d in datasets]
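For readers unfamiliar with the cast above: typing.cast does nothing at runtime; it only tells the type checker to treat the value as the given type, which is a common way to work around an overly strict third-party annotation. A small, hypothetical illustration:

    from typing import Optional, cast

    # Hypothetical third-party function whose annotation demands an int even
    # though it tolerates None at runtime.
    def list_rows(max_results: int) -> list:
        return list(range(max_results if max_results is not None else 3))

    limit: Optional[int] = None
    # cast() is a no-op at runtime; it only changes what the type checker sees.
    print(list_rows(cast(int, limit)))  # [0, 1, 2]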

View File

@ -833,8 +833,8 @@ class DatahubGEProfiler:
self,
query_combiner: SQLAlchemyQueryCombiner,
pretty_name: str,
- schema: str = None,
- table: str = None,
+ schema: Optional[str] = None,
+ table: Optional[str] = None,
partition: Optional[str] = None,
custom_sql: Optional[str] = None,
platform: Optional[str] = None,

View File

@ -123,7 +123,9 @@ def remove_prefix(text: str, prefix: str) -> str:
return text
- def unquote(string: str, leading_quote: str = '"', trailing_quote: str = None) -> str:
+ def unquote(
+ string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None
+ ) -> str:
"""
If string starts and ends with a quote, unquote it
"""

View File

@ -2,7 +2,7 @@ import json
import re
import time
import warnings
- from typing import Any, Dict, Generator, List, Tuple
+ from typing import Any, Dict, Generator, List, Optional, Tuple
import requests
import yaml
@ -47,7 +47,10 @@ def flatten2list(d: dict) -> list:
def request_call(
- url: str, token: str = None, username: str = None, password: str = None
+ url: str,
+ token: Optional[str] = None,
+ username: Optional[str] = None,
+ password: Optional[str] = None,
) -> requests.Response:
headers = {"accept": "application/json"}
@ -66,9 +69,9 @@ def request_call(
def get_swag_json(
url: str,
- token: str = None,
- username: str = None,
- password: str = None,
+ token: Optional[str] = None,
+ username: Optional[str] = None,
+ password: Optional[str] = None,
swagger_file: str = "",
) -> Dict:
tot_url = url + swagger_file

View File

@ -402,7 +402,7 @@ class RedashSource(Source):
return sql_table_names
- def _get_chart_data_source(self, data_source_id: int = None) -> Dict:
+ def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict:
url = f"/api/data_sources/{data_source_id}"
resp = self.client._get(url).json()
logger.debug(resp)

View File

@ -735,7 +735,7 @@ class SalesforceSource(Source):
return self.report
- def get_tags(params: List[str] = None) -> GlobalTagsClass:
+ def get_tags(params: Optional[List[str]] = None) -> GlobalTagsClass:
if params is None:
params = []
tags = [TagAssociationClass(tag=builder.make_tag_urn(tag)) for tag in params if tag]

View File

@ -100,7 +100,7 @@ class SnowflakeV2Config(SnowflakeConfig, SnowflakeUsageConfig):
def get_sql_alchemy_url(
self,
- database: str = None,
+ database: Optional[str] = None,
username: Optional[str] = None,
password: Optional[SecretStr] = None,
role: Optional[str] = None,

View File

@ -103,7 +103,9 @@ class OracleInspectorObjectWrapper:
for row in cursor
]
- def get_table_names(self, schema: str = None, order_by: str = None) -> List[str]:
+ def get_table_names(
+ self, schema: Optional[str] = None, order_by: Optional[str] = None
+ ) -> List[str]:
"""
skip order_by, we are not using order_by
"""

View File

@ -1,4 +1,5 @@
from textwrap import dedent
+ from typing import Optional
from pydantic.fields import Field
from pyhive.sqlalchemy_presto import PrestoDialect
@ -60,7 +61,7 @@ def get_view_definition(self, connection, view_name, schema=None, **kw):
def _get_full_table( # type: ignore
- self, table_name: str, schema: str = None, quote: bool = True
+ self, table_name: str, schema: Optional[str] = None, quote: bool = True
) -> str:
table_part = (
self.identifier_preparer.quote_identifier(table_name) if quote else table_name

View File

@ -691,7 +691,7 @@ class RedshiftSource(SQLAlchemySource):
return sources
- def get_db_name(self, inspector: Inspector = None) -> str:
+ def get_db_name(self, inspector: Optional[Inspector] = None) -> str:
db_name = getattr(self.config, "database")
db_alias = getattr(self.config, "database_alias")
if db_alias:

View File

@ -419,8 +419,8 @@ def get_schema_metadata(
dataset_name: str,
platform: str,
columns: List[dict],
- pk_constraints: dict = None,
- foreign_keys: List[ForeignKeyConstraint] = None,
+ pk_constraints: Optional[dict] = None,
+ foreign_keys: Optional[List[ForeignKeyConstraint]] = None,
canonical_schema: List[SchemaField] = [],
) -> SchemaMetadata:
schema_metadata = SchemaMetadata(
@ -985,7 +985,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
self,
dataset_name: str,
columns: List[dict],
- pk_constraints: dict = None,
+ pk_constraints: Optional[dict] = None,
tags: Optional[Dict[str, List[str]]] = None,
) -> List[SchemaField]:
canonical_schema = []
@ -1003,7 +1003,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
self,
dataset_name: str,
column: dict,
- pk_constraints: dict = None,
+ pk_constraints: Optional[dict] = None,
tags: Optional[List[str]] = None,
) -> List[SchemaField]:
gtc: Optional[GlobalTagsClass] = None

View File

@ -194,7 +194,7 @@ class TrinoSource(SQLAlchemySource):
self,
dataset_name: str,
column: dict,
- pk_constraints: dict = None,
+ pk_constraints: Optional[dict] = None,
tags: Optional[List[str]] = None,
) -> List[SchemaField]:

View File

@ -992,7 +992,10 @@ class TableauSource(StatefulIngestionSourceBase):
return mcp_workunit
def emit_datasource(
- self, datasource: dict, workbook: dict = None, is_embedded_ds: bool = False
+ self,
+ datasource: dict,
+ workbook: Optional[dict] = None,
+ is_embedded_ds: bool = False,
) -> Iterable[MetadataWorkUnit]:
datasource_info = workbook
if not is_embedded_ds:

View File

@ -173,7 +173,7 @@ READ_STATEMENT_TYPES: List[str] = ["SELECT"]
def bigquery_audit_metadata_query_template(
dataset: str,
use_date_sharded_tables: bool,
- table_allow_filter: str = None,
+ table_allow_filter: Optional[str] = None,
) -> str:
"""
Receives a dataset (with project specified) and returns a query template that is used to query exported

View File

@ -306,7 +306,7 @@ class SnowflakeConfig(BaseSnowflakeConfig, SQLAlchemyConfig):
def get_sql_alchemy_url(
self,
- database: str = None,
+ database: Optional[str] = None,
username: Optional[str] = None,
password: Optional[pydantic.SecretStr] = None,
role: Optional[str] = None,

View File

@ -110,9 +110,9 @@ class DataHubValidationAction(ValidationAction):
ValidationResultIdentifier, "GXCloudIdentifier"
],
data_asset: Union[Validator, DataAsset, Batch],
- payload: Any = None,
+ payload: Optional[Any] = None,
expectation_suite_identifier: Optional[ExpectationSuiteIdentifier] = None,
- checkpoint_identifier: Any = None,
+ checkpoint_identifier: Optional[Any] = None,
) -> Dict:
datasets = []
try:

View File

@ -67,7 +67,7 @@ class OperationProcessor:
self,
operation_defs: Dict[str, Dict],
tag_prefix: str = "",
- owner_source_type: str = None,
+ owner_source_type: Optional[str] = None,
strip_owner_email_id: bool = False,
):
self.operation_defs = operation_defs

View File

@ -70,7 +70,7 @@ class DatahubLineageBackend(LineageBackend):
operator: "BaseOperator",
inlets: Optional[List] = None, # unused
outlets: Optional[List] = None, # unused
- context: Dict = None,
+ context: Optional[Dict] = None,
) -> None:
config = get_lineage_config()
if not config.enabled:

View File

@ -24,7 +24,7 @@ def wait_for_port(
docker_services: pytest_docker.plugin.Services,
container_name: str,
container_port: int,
- hostname: str = None,
+ hostname: Optional[str] = None,
timeout: float = 30.0,
pause: float = 0.5,
checker: Optional[Callable[[], bool]] = None,

View File

@ -4,7 +4,6 @@ from unittest.mock import Mock
import pytest
from sqlalchemy.engine.reflection import Inspector
- from datahub.ingestion.api.source import Source
from datahub.ingestion.source.sql.sql_common import (
PipelineContext,
SQLAlchemyConfig,
@ -19,8 +18,6 @@ class _TestSQLAlchemyConfig(SQLAlchemyConfig):
class _TestSQLAlchemySource(SQLAlchemySource):
@classmethod
def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
pass