From 44cfd21a653e23f869526a3c1a83e11d51b42a86 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 2 Dec 2022 13:53:28 -0500 Subject: [PATCH] chore(ingest): bump and pin mypy (#6584) --- metadata-ingestion/setup.cfg | 9 +++++++++ metadata-ingestion/setup.py | 8 ++------ .../src/datahub/api/graphql/operation.py | 2 -- .../src/datahub/configuration/common.py | 2 +- .../src/datahub/ingestion/api/source.py | 7 ++++++- .../src/datahub/ingestion/api/workunit.py | 8 ++++---- .../ingestion/source/bigquery_v2/bigquery_schema.py | 4 +++- .../datahub/ingestion/source/ge_data_profiler.py | 4 ++-- .../src/datahub/ingestion/source/kafka_connect.py | 4 +++- .../src/datahub/ingestion/source/openapi_parser.py | 13 ++++++++----- .../src/datahub/ingestion/source/redash.py | 2 +- .../src/datahub/ingestion/source/salesforce.py | 2 +- .../ingestion/source/snowflake/snowflake_config.py | 2 +- .../src/datahub/ingestion/source/sql/oracle.py | 4 +++- .../src/datahub/ingestion/source/sql/presto.py | 3 ++- .../src/datahub/ingestion/source/sql/redshift.py | 2 +- .../src/datahub/ingestion/source/sql/sql_common.py | 8 ++++---- .../src/datahub/ingestion/source/sql/trino.py | 2 +- .../src/datahub/ingestion/source/tableau.py | 5 ++++- .../ingestion/source/usage/bigquery_usage.py | 2 +- .../ingestion/source_config/sql/snowflake.py | 2 +- .../integrations/great_expectations/action.py | 4 ++-- metadata-ingestion/src/datahub/utilities/mapping.py | 2 +- .../src/datahub_provider/lineage/datahub.py | 2 +- .../tests/test_helpers/docker_helpers.py | 2 +- metadata-ingestion/tests/unit/test_sql_common.py | 5 +---- 26 files changed, 64 insertions(+), 46 deletions(-) diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg index b6d4f55a09..3f0e8ab611 100644 --- a/metadata-ingestion/setup.cfg +++ b/metadata-ingestion/setup.cfg @@ -27,6 +27,7 @@ plugins = exclude = ^(venv|build|dist)/ ignore_missing_imports = yes namespace_packages = no +implicit_optional = no strict_optional = yes check_untyped_defs = yes disallow_incomplete_defs = yes @@ -38,8 +39,16 @@ disallow_untyped_defs = no # try to be a bit more strict in certain areas of the codebase [mypy-datahub.*] ignore_missing_imports = no +[mypy-datahub_provider.*] +ignore_missing_imports = no [mypy-tests.*] ignore_missing_imports = no +[mypy-google.protobuf.*] +# mypy sometimes ignores the above ignore_missing_imports = yes +# See https://github.com/python/mypy/issues/10632 and +# https://github.com/python/mypy/issues/10619#issuecomment-1174208395 +# for a discussion of why this happens. +ignore_missing_imports = yes [mypy-datahub.configuration.*] disallow_untyped_defs = yes [mypy-datahub.emitter.*] diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 1a744a6fe3..a63164f402 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -385,8 +385,7 @@ mypy_stubs = { "types-ujson>=5.2.0", "types-termcolor>=1.0.0", "types-Deprecated", - # Mypy complains with 4.21.0.0 => error: Library stubs not installed for "google.protobuf.descriptor" - "types-protobuf<4.21.0.0", + "types-protobuf>=4.21.0.1", } base_dev_requirements = { @@ -399,10 +398,7 @@ base_dev_requirements = { "flake8>=3.8.3", "flake8-tidy-imports>=4.3.0", "isort>=5.7.0", - # mypy 0.990 enables namespace packages by default and sets - # no implicit optional to True. - # FIXME: Enable mypy 0.990 when our codebase is fixed. - "mypy>=0.981,<0.990", + "mypy==0.991", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. # Restricting top version to <1.10 until we can fix our types. diff --git a/metadata-ingestion/src/datahub/api/graphql/operation.py b/metadata-ingestion/src/datahub/api/graphql/operation.py index 5e1575e6f7..9cb40ce581 100644 --- a/metadata-ingestion/src/datahub/api/graphql/operation.py +++ b/metadata-ingestion/src/datahub/api/graphql/operation.py @@ -122,8 +122,6 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper "operationType": operation_type, "partition": partition, } - if filter - else None ), }, ) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index e134a5a849..95d852bbe7 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -80,7 +80,7 @@ class OperationalError(PipelineExecutionError): message: str info: dict - def __init__(self, message: str, info: dict = None): + def __init__(self, message: str, info: Optional[dict] = None): self.message = message self.info = info or {} diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index 70e5ce7db7..9f3740aa9f 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -120,7 +120,12 @@ class Source(Closeable, metaclass=ABCMeta): @classmethod def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source": - pass + # Technically, this method should be abstract. However, the @config_class + # decorator automatically generates a create method at runtime if one is + # not defined. Python still treats the class as abstract because it thinks + # the create method is missing. To avoid the class becoming abstract, we + # can't make this method abstract. + raise NotImplementedError('sources must implement "create"') @abstractmethod def get_workunits(self) -> Iterable[WorkUnit]: diff --git a/metadata-ingestion/src/datahub/ingestion/api/workunit.py b/metadata-ingestion/src/datahub/ingestion/api/workunit.py index 522bcd9fbd..53a77798f7 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/workunit.py +++ b/metadata-ingestion/src/datahub/ingestion/api/workunit.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Iterable, Union, overload +from typing import Iterable, Optional, Union, overload from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.source import WorkUnit @@ -42,9 +42,9 @@ class MetadataWorkUnit(WorkUnit): def __init__( self, id: str, - mce: MetadataChangeEvent = None, - mcp: MetadataChangeProposalWrapper = None, - mcp_raw: MetadataChangeProposal = None, + mce: Optional[MetadataChangeEvent] = None, + mcp: Optional[MetadataChangeProposalWrapper] = None, + mcp_raw: Optional[MetadataChangeProposal] = None, treat_errors_as_warnings: bool = False, ): super().__init__(id) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 5de36facb6..fa475f66d0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -2,7 +2,7 @@ import logging from collections import defaultdict from dataclasses import dataclass, field from datetime import datetime, timezone -from typing import Dict, List, Optional +from typing import Dict, List, Optional, cast from google.cloud import bigquery from google.cloud.bigquery.table import RowIterator, TableListItem, TimePartitioning @@ -280,6 +280,8 @@ class BigQueryDataDictionary: def get_datasets_for_project_id( conn: bigquery.Client, project_id: str, maxResults: Optional[int] = None ) -> List[BigqueryDataset]: + # FIXME: Due to a bug in BigQuery's type annotations, we need to cast here. + maxResults = cast(int, maxResults) datasets = conn.list_datasets(project_id, max_results=maxResults) return [BigqueryDataset(name=d.dataset_id) for d in datasets] diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index e6782a92e3..8860162660 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -833,8 +833,8 @@ class DatahubGEProfiler: self, query_combiner: SQLAlchemyQueryCombiner, pretty_name: str, - schema: str = None, - table: str = None, + schema: Optional[str] = None, + table: Optional[str] = None, partition: Optional[str] = None, custom_sql: Optional[str] = None, platform: Optional[str] = None, diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py index 0e3487eb92..431f90643d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py @@ -123,7 +123,9 @@ def remove_prefix(text: str, prefix: str) -> str: return text -def unquote(string: str, leading_quote: str = '"', trailing_quote: str = None) -> str: +def unquote( + string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None +) -> str: """ If string starts and ends with a quote, unquote it """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py index f33654daa1..8d04218716 100755 --- a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py @@ -2,7 +2,7 @@ import json import re import time import warnings -from typing import Any, Dict, Generator, List, Tuple +from typing import Any, Dict, Generator, List, Optional, Tuple import requests import yaml @@ -47,7 +47,10 @@ def flatten2list(d: dict) -> list: def request_call( - url: str, token: str = None, username: str = None, password: str = None + url: str, + token: Optional[str] = None, + username: Optional[str] = None, + password: Optional[str] = None, ) -> requests.Response: headers = {"accept": "application/json"} @@ -66,9 +69,9 @@ def request_call( def get_swag_json( url: str, - token: str = None, - username: str = None, - password: str = None, + token: Optional[str] = None, + username: Optional[str] = None, + password: Optional[str] = None, swagger_file: str = "", ) -> Dict: tot_url = url + swagger_file diff --git a/metadata-ingestion/src/datahub/ingestion/source/redash.py b/metadata-ingestion/src/datahub/ingestion/source/redash.py index c196e1a8cf..3a35c2190f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redash.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redash.py @@ -402,7 +402,7 @@ class RedashSource(Source): return sql_table_names - def _get_chart_data_source(self, data_source_id: int = None) -> Dict: + def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict: url = f"/api/data_sources/{data_source_id}" resp = self.client._get(url).json() logger.debug(resp) diff --git a/metadata-ingestion/src/datahub/ingestion/source/salesforce.py b/metadata-ingestion/src/datahub/ingestion/source/salesforce.py index 7b9ab6dd78..70cb7653f4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/salesforce.py +++ b/metadata-ingestion/src/datahub/ingestion/source/salesforce.py @@ -735,7 +735,7 @@ class SalesforceSource(Source): return self.report -def get_tags(params: List[str] = None) -> GlobalTagsClass: +def get_tags(params: Optional[List[str]] = None) -> GlobalTagsClass: if params is None: params = [] tags = [TagAssociationClass(tag=builder.make_tag_urn(tag)) for tag in params if tag] diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 1f61a883cd..43a14228e0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -100,7 +100,7 @@ class SnowflakeV2Config(SnowflakeConfig, SnowflakeUsageConfig): def get_sql_alchemy_url( self, - database: str = None, + database: Optional[str] = None, username: Optional[str] = None, password: Optional[SecretStr] = None, role: Optional[str] = None, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py index 480ab2c46d..63712b3969 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py @@ -103,7 +103,9 @@ class OracleInspectorObjectWrapper: for row in cursor ] - def get_table_names(self, schema: str = None, order_by: str = None) -> List[str]: + def get_table_names( + self, schema: Optional[str] = None, order_by: Optional[str] = None + ) -> List[str]: """ skip order_by, we are not using order_by """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/presto.py b/metadata-ingestion/src/datahub/ingestion/source/sql/presto.py index 0583520c09..7cd022b76b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/presto.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/presto.py @@ -1,4 +1,5 @@ from textwrap import dedent +from typing import Optional from pydantic.fields import Field from pyhive.sqlalchemy_presto import PrestoDialect @@ -60,7 +61,7 @@ def get_view_definition(self, connection, view_name, schema=None, **kw): def _get_full_table( # type: ignore - self, table_name: str, schema: str = None, quote: bool = True + self, table_name: str, schema: Optional[str] = None, quote: bool = True ) -> str: table_part = ( self.identifier_preparer.quote_identifier(table_name) if quote else table_name diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py index 9a30f2dc86..19419f8d19 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py @@ -691,7 +691,7 @@ class RedshiftSource(SQLAlchemySource): return sources - def get_db_name(self, inspector: Inspector = None) -> str: + def get_db_name(self, inspector: Optional[Inspector] = None) -> str: db_name = getattr(self.config, "database") db_alias = getattr(self.config, "database_alias") if db_alias: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 23df9ec521..6dffda4e83 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -419,8 +419,8 @@ def get_schema_metadata( dataset_name: str, platform: str, columns: List[dict], - pk_constraints: dict = None, - foreign_keys: List[ForeignKeyConstraint] = None, + pk_constraints: Optional[dict] = None, + foreign_keys: Optional[List[ForeignKeyConstraint]] = None, canonical_schema: List[SchemaField] = [], ) -> SchemaMetadata: schema_metadata = SchemaMetadata( @@ -985,7 +985,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase): self, dataset_name: str, columns: List[dict], - pk_constraints: dict = None, + pk_constraints: Optional[dict] = None, tags: Optional[Dict[str, List[str]]] = None, ) -> List[SchemaField]: canonical_schema = [] @@ -1003,7 +1003,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase): self, dataset_name: str, column: dict, - pk_constraints: dict = None, + pk_constraints: Optional[dict] = None, tags: Optional[List[str]] = None, ) -> List[SchemaField]: gtc: Optional[GlobalTagsClass] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py b/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py index 296bc7a493..277dfe704d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py @@ -194,7 +194,7 @@ class TrinoSource(SQLAlchemySource): self, dataset_name: str, column: dict, - pk_constraints: dict = None, + pk_constraints: Optional[dict] = None, tags: Optional[List[str]] = None, ) -> List[SchemaField]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 9434381ff6..5929f1fa2a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -992,7 +992,10 @@ class TableauSource(StatefulIngestionSourceBase): return mcp_workunit def emit_datasource( - self, datasource: dict, workbook: dict = None, is_embedded_ds: bool = False + self, + datasource: dict, + workbook: Optional[dict] = None, + is_embedded_ds: bool = False, ) -> Iterable[MetadataWorkUnit]: datasource_info = workbook if not is_embedded_ds: diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/bigquery_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/bigquery_usage.py index b9052c35d4..8e7685b904 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/bigquery_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/bigquery_usage.py @@ -173,7 +173,7 @@ READ_STATEMENT_TYPES: List[str] = ["SELECT"] def bigquery_audit_metadata_query_template( dataset: str, use_date_sharded_tables: bool, - table_allow_filter: str = None, + table_allow_filter: Optional[str] = None, ) -> str: """ Receives a dataset (with project specified) and returns a query template that is used to query exported diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 8202f877d6..46a40cb990 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -306,7 +306,7 @@ class SnowflakeConfig(BaseSnowflakeConfig, SQLAlchemyConfig): def get_sql_alchemy_url( self, - database: str = None, + database: Optional[str] = None, username: Optional[str] = None, password: Optional[pydantic.SecretStr] = None, role: Optional[str] = None, diff --git a/metadata-ingestion/src/datahub/integrations/great_expectations/action.py b/metadata-ingestion/src/datahub/integrations/great_expectations/action.py index 931798f5aa..3b63828657 100644 --- a/metadata-ingestion/src/datahub/integrations/great_expectations/action.py +++ b/metadata-ingestion/src/datahub/integrations/great_expectations/action.py @@ -110,9 +110,9 @@ class DataHubValidationAction(ValidationAction): ValidationResultIdentifier, "GXCloudIdentifier" ], data_asset: Union[Validator, DataAsset, Batch], - payload: Any = None, + payload: Optional[Any] = None, expectation_suite_identifier: Optional[ExpectationSuiteIdentifier] = None, - checkpoint_identifier: Any = None, + checkpoint_identifier: Optional[Any] = None, ) -> Dict: datasets = [] try: diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index f7fb251d3e..13a9fc66dd 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -67,7 +67,7 @@ class OperationProcessor: self, operation_defs: Dict[str, Dict], tag_prefix: str = "", - owner_source_type: str = None, + owner_source_type: Optional[str] = None, strip_owner_email_id: bool = False, ): self.operation_defs = operation_defs diff --git a/metadata-ingestion/src/datahub_provider/lineage/datahub.py b/metadata-ingestion/src/datahub_provider/lineage/datahub.py index fb3728aa53..009ce4bb29 100644 --- a/metadata-ingestion/src/datahub_provider/lineage/datahub.py +++ b/metadata-ingestion/src/datahub_provider/lineage/datahub.py @@ -70,7 +70,7 @@ class DatahubLineageBackend(LineageBackend): operator: "BaseOperator", inlets: Optional[List] = None, # unused outlets: Optional[List] = None, # unused - context: Dict = None, + context: Optional[Dict] = None, ) -> None: config = get_lineage_config() if not config.enabled: diff --git a/metadata-ingestion/tests/test_helpers/docker_helpers.py b/metadata-ingestion/tests/test_helpers/docker_helpers.py index 7ba82b25a6..0cbae4b2db 100644 --- a/metadata-ingestion/tests/test_helpers/docker_helpers.py +++ b/metadata-ingestion/tests/test_helpers/docker_helpers.py @@ -24,7 +24,7 @@ def wait_for_port( docker_services: pytest_docker.plugin.Services, container_name: str, container_port: int, - hostname: str = None, + hostname: Optional[str] = None, timeout: float = 30.0, pause: float = 0.5, checker: Optional[Callable[[], bool]] = None, diff --git a/metadata-ingestion/tests/unit/test_sql_common.py b/metadata-ingestion/tests/unit/test_sql_common.py index 7b86b6ee6a..f382705c70 100644 --- a/metadata-ingestion/tests/unit/test_sql_common.py +++ b/metadata-ingestion/tests/unit/test_sql_common.py @@ -4,7 +4,6 @@ from unittest.mock import Mock import pytest from sqlalchemy.engine.reflection import Inspector -from datahub.ingestion.api.source import Source from datahub.ingestion.source.sql.sql_common import ( PipelineContext, SQLAlchemyConfig, @@ -19,9 +18,7 @@ class _TestSQLAlchemyConfig(SQLAlchemyConfig): class _TestSQLAlchemySource(SQLAlchemySource): - @classmethod - def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: - pass + pass def test_generate_foreign_key():