dev(ingest): use ruff instead of flake8 (#12359)

Aseem Bansal 2025-01-16 08:19:07 +05:30 committed by GitHub
parent 4cde4aafa1
commit 2226820ad1
12 changed files with 99 additions and 87 deletions

View File

@@ -110,14 +110,16 @@ task lint(type: Exec, dependsOn: installDev) {
"source ${venv_name}/bin/activate && set -x && " +
"black --check --diff src/ tests/ examples/ && " +
"isort --check --diff src/ tests/ examples/ && " +
"flake8 --count --statistics src/ tests/ examples/ && " +
"ruff check src/ tests/ examples/ && " +
"mypy --show-traceback --show-error-codes src/ tests/ examples/"
}
task lintFix(type: Exec, dependsOn: installDev) {
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"black src/ tests/ examples/ && " +
"isort src/ tests/ examples/"
"isort src/ tests/ examples/ && " +
"ruff check --fix src/ tests/ examples/"
}
def pytest_default_env = "PYTHONDEVMODE=1"

View File

@@ -183,7 +183,7 @@ We use black, isort, flake8, and mypy to ensure consistent code style and qualit
# Assumes: pip install -e '.[dev]' and venv is activated
black src/ tests/
isort src/ tests/
flake8 src/ tests/
ruff check src/ tests/
mypy src/ tests/
```

View File

@@ -9,7 +9,7 @@ extend-exclude = '''
^/tmp
'''
include = '\.pyi?$'
target-version = ['py37', 'py38', 'py39', 'py310']
target-version = ['py38', 'py39', 'py310', 'py311']
[tool.isort]
combine_as_imports = true
@@ -26,6 +26,52 @@ extraPaths = ['tests']
exclude = ["src/datahub/metadata/"]
ignore_decorators = ["@click.*", "@validator", "@root_validator", "@pydantic.validator", "@pydantic.root_validator", "@pytest.fixture"]
ignore_names = ["*Source", "*Sink", "*Report"]
# min_confidence = 80
paths = ["src"]
sort_by_size = true
[tool.ruff]
# Same as Black.
line-length = 88
# Exclude directories matching these patterns.
exclude = [
".git",
"src/datahub/metadata",
"venv",
".tox",
"__pycache__",
]
[tool.ruff.lint]
select = [
"B",
"C90",
"E",
"F",
"TID",
]
ignore = [
# Ignore line length violations (handled by Black)
"E501",
# Ignore whitespace before ':' (matches Black)
"E203",
# Allow usages of functools.lru_cache
"B019",
# Allow function call in argument defaults
"B008",
# TODO: Enable these later
"B006", # Mutable args
"B007", # Unused loop control variable
"B017", # Do not assert blind exception
"B904", # Checks for raise statements in exception handlers that lack a from clause
]
[tool.ruff.lint.mccabe]
max-complexity = 20
[tool.ruff.lint.flake8-tidy-imports]
# Disallow all relative imports.
ban-relative-imports = "all"
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"]

View File

@@ -1,39 +1,3 @@
[flake8]
max-complexity = 20
ignore =
# Ignore: line length issues, since black's formatter will take care of them.
E501,
# Ignore compound statements, since they're used for ellipsis by black
# See https://github.com/psf/black/issues/3887
E704,
# Ignore: 1 blank line required before class docstring.
D203,
# See https://stackoverflow.com/a/57074416.
W503,
# See https://github.com/psf/black/issues/315.
E203,
# Allow usages of functools.lru_cache.
B019,
# This rule flags the use of function calls in argument defaults.
# There's some good reasons to do this, so we're ok with it.
B008,
# TODO: However, we should enable B006 to catch issues with mutable args.
B006,
# TODO: Enable B007 - unused loop control variable.
B007
# TODO: Enable B902 - require self/cls naming.
# TODO: Enable B904 - use raise from in except clauses.
exclude =
.git,
src/datahub/metadata,
venv,
.tox,
__pycache__
per-file-ignores =
# imported but unused
__init__.py: F401, I250
ban-relative-imports = true
[mypy]
plugins =
./tests/test_helpers/sqlalchemy_mypy_plugin.py,

View File

@@ -593,9 +593,7 @@ lint_requirements = {
# This is pinned only to avoid spurious errors in CI.
# We should make an effort to keep it up to date.
"black==23.3.0",
"flake8>=6.0.0",
"flake8-tidy-imports>=4.3.0",
"flake8-bugbear==23.3.12",
"ruff==0.9.1",
"isort>=5.7.0",
"mypy==1.10.1",
}

View File

@@ -267,7 +267,6 @@ def _is_single_row_query_method(query: Any) -> bool:
"get_column_max",
"get_column_mean",
"get_column_stdev",
"get_column_stdev",
"get_column_nonnull_count",
"get_column_unique_count",
}
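
Side note on the hunk above: the repeated "get_column_stdev" entry was dead text, since duplicate elements in a set literal are collapsed at runtime; this is the sort of redundancy a duplicate-value check in the newly selected B rules (bugbear's B033) surfaces. A minimal sketch:

```
# Minimal sketch: the duplicate entry does not change the set, it only
# misleads readers, which is why a duplicate-value check reports it.
single_row_methods = {
    "get_column_max",
    "get_column_stdev",
    "get_column_stdev",  # reported as a duplicate value
}
assert len(single_row_methods) == 2
```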

View File

@@ -893,11 +893,11 @@ class ModeSource(StatefulIngestionSourceBase):
jinja_params[key] = parameters[key].get("default", "")
normalized_query = re.sub(
r"{% form %}(.*){% endform %}",
"",
query,
0,
re.MULTILINE | re.DOTALL,
pattern=r"{% form %}(.*){% endform %}",
repl="",
string=query,
count=0,
flags=re.MULTILINE | re.DOTALL,
)
# Wherever we don't resolve the jinja params, we replace it with NULL
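
The rewrite above keeps the same substitution but passes every argument to re.sub by keyword, so `count` and `flags` can no longer be confused; with the positional form it is easy to drop `count` and have a flags value silently interpreted as a count (bugbear's B034 targets exactly this pattern). A self-contained sketch with a made-up query string:

```
import re

query = "{% form %}param: value{% endform %} SELECT 1 AS col"

# Keyword arguments make it explicit which value is `count` and which is
# `flags`; in the positional form, omitting count would make the flags
# value be treated as a count.
normalized_query = re.sub(
    pattern=r"{% form %}(.*){% endform %}",
    repl="",
    string=query,
    count=0,
    flags=re.MULTILINE | re.DOTALL,
)
print(normalized_query)  # " SELECT 1 AS col"
```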

View File

@@ -96,7 +96,7 @@ class PowerBiAPI:
url: str = e.request.url if e.request else "URL not available"
self.reporter.warning(
title="Metadata API Timeout",
message=f"Metadata endpoints are not reachable. Check network connectivity to PowerBI Service.",
message="Metadata endpoints are not reachable. Check network connectivity to PowerBI Service.",
context=f"url={url}",
)
@@ -173,7 +173,7 @@ class PowerBiAPI:
entity=entity_name,
entity_id=entity_id,
)
except: # It will catch all type of exception
except Exception:
e = self.log_http_error(
message=f"Unable to fetch users for {entity_name}({entity_id})."
)
@@ -210,7 +210,7 @@ class PowerBiAPI:
message="A cross-workspace reference that failed to be resolved. Please ensure that no global workspace is being filtered out due to the workspace_id_pattern.",
context=f"report-name: {report.name} and dataset-id: {report.dataset_id}",
)
except:
except Exception:
self.log_http_error(
message=f"Unable to fetch reports for workspace {workspace.name}"
)
@@ -260,7 +260,7 @@ class PowerBiAPI:
groups = self._get_resolver().get_groups(filter_=filter_)
except:
except Exception:
self.log_http_error(message="Unable to fetch list of workspaces")
# raise # we want this exception to bubble up
@@ -292,7 +292,7 @@ class PowerBiAPI:
modified_workspace_ids = self.__admin_api_resolver.get_modified_workspaces(
self.__config.modified_since
)
except:
except Exception:
self.log_http_error(message="Unable to fetch list of modified workspaces.")
return modified_workspace_ids
@@ -303,8 +303,8 @@ class PowerBiAPI:
scan_id = self.__admin_api_resolver.create_scan_job(
workspace_ids=workspace_ids
)
except:
e = self.log_http_error(message=f"Unable to fetch get scan result.")
except Exception:
e = self.log_http_error(message="Unable to fetch get scan result.")
if data_resolver.is_permission_error(cast(Exception, e)):
logger.warning(
"Dataset lineage can not be ingestion because this user does not have access to the PowerBI Admin "

View File

@@ -384,7 +384,6 @@ TRINO_SQL_TYPES_MAP: Dict[str, Any] = {
"varchar": StringType,
"char": StringType,
"varbinary": BytesType,
"json": RecordType,
"date": DateType,
"time": TimeType,
"timestamp": TimeType,

View File

@@ -1,7 +1,7 @@
from collections import deque
from itertools import chain
from sys import getsizeof
from typing import Any, Callable
from typing import Any, Iterator
def total_size(o: Any, handlers: Any = {}) -> int:
@@ -15,7 +15,8 @@ def total_size(o: Any, handlers: Any = {}) -> int:
Based on https://github.com/ActiveState/recipe-577504-compute-mem-footprint/blob/master/recipe.py
"""
dict_handler: Callable[[Any], chain[Any]] = lambda d: chain.from_iterable(d.items())
def dict_handler(d: dict) -> Iterator[Any]:
return chain.from_iterable(d.items())
all_handlers = {
tuple: iter,

View File

@@ -36,7 +36,7 @@ def my_logging_fn():
logger.warning("This is a warning message")
logger.error("this is an error with no stack trace")
try:
1 / 0
_ = 1 / 0
except ZeroDivisionError:
logger.exception("failed to divide by zero")

View File

@@ -9,36 +9,38 @@ from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
@pytest.mark.parametrize("length, sampling", [(10, False), (100, True)])
def test_lossylist_sampling(length, sampling):
l: LossyList[str] = LossyList()
l_dict: LossyList[str] = LossyList()
for i in range(0, length):
l.append(f"{i} Hello World")
l_dict.append(f"{i} Hello World")
assert len(l) == length
assert l.sampled is sampling
assert len(l_dict) == length
assert l_dict.sampled is sampling
if sampling:
assert f"... sampled of {length} total elements" in str(l)
assert f"... sampled of {length} total elements" in str(l_dict)
else:
assert "sampled" not in str(l)
assert "sampled" not in str(l_dict)
list_version = [int(i.split(" ")[0]) for i in l]
list_version = [int(i.split(" ")[0]) for i in l_dict]
print(list_version)
assert sorted(list_version) == list_version
@pytest.mark.parametrize("length, sampling", [(10, False), (100, True)])
def test_lossyset_sampling(length, sampling):
l: LossySet[str] = LossySet()
lossy_set: LossySet[str] = LossySet()
for i in range(0, length):
l.add(f"{i} Hello World")
lossy_set.add(f"{i} Hello World")
assert len(l) == min(10, length)
assert l.sampled is sampling
assert len(lossy_set) == min(10, length)
assert lossy_set.sampled is sampling
if sampling:
assert f"... sampled with at most {length-10} elements missing" in str(l)
assert f"... sampled with at most {length-10} elements missing" in str(
lossy_set
)
else:
assert "sampled" not in str(l)
assert "sampled" not in str(lossy_set)
list_version = [int(i.split(" ")[0]) for i in l]
list_version = [int(i.split(" ")[0]) for i in lossy_set]
set_version = set(list_version)
assert len(list_version) == len(set_version)
@@ -49,35 +51,36 @@ def test_lossyset_sampling(length, sampling):
"length, sampling, sub_length", [(4, False, 4), (10, False, 14), (100, True, 1000)]
)
def test_lossydict_sampling(length, sampling, sub_length):
l: LossyDict[int, LossyList[str]] = LossyDict()
lossy_dict: LossyDict[int, LossyList[str]] = LossyDict()
elements_added = 0
element_length_map = {}
for i in range(0, length):
list_length = random.choice(range(1, sub_length))
element_length_map[i] = 0
for _num_elements in range(0, list_length):
if not l.get(i):
if not lossy_dict.get(i):
elements_added += 1
# reset to 0 until we get it back
element_length_map[i] = 0
else:
element_length_map[i] = len(l[i])
element_length_map[i] = len(lossy_dict[i])
current_list = l.get(i, LossyList())
current_list = lossy_dict.get(i, LossyList())
current_list.append(f"{i}:{round(time.time(),2)} Hello World")
l[i] = current_list
lossy_dict[i] = current_list
element_length_map[i] += 1
assert len(l) == min(l.max_elements, length)
assert l.sampled is sampling
assert len(lossy_dict) == min(lossy_dict.max_elements, length)
assert lossy_dict.sampled is sampling
if sampling:
assert re.search("sampled of at most .* entries.", str(l))
assert f"{l.max_elements} sampled of at most {elements_added} entries." in str(
l
assert re.search("sampled of at most .* entries.", str(lossy_dict))
assert (
f"{lossy_dict.max_elements} sampled of at most {elements_added} entries."
in str(lossy_dict)
)
else:
# cheap way to determine that the dict isn't reporting sampled keys
assert not re.search("sampled of at most .* entries.", str(l))
assert not re.search("sampled of at most .* entries.", str(lossy_dict))
for k, v in l.items():
for k, v in lossy_dict.items():
assert len(v) == element_length_map[k]
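
The renames throughout this test file replace the single-letter name `l`, which the selected pycodestyle rules report as an ambiguous variable name (E741) because it is easily misread as `1` or `I`; descriptive names such as `lossy_set` and `lossy_dict` avoid the warning without suppressions. A minimal sketch:

```
# Minimal sketch: the first binding is reported as an ambiguous variable
# name (E741); a descriptive name carries the same value without the warning.
l = ["a", "b"]            # flagged: `l` is easily misread as `1` or `I`
lossy_items = ["a", "b"]  # fine
print(lossy_items)
```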