dev(ingest): use ruff instead of flake8 (#12359)

Aseem Bansal 2025-01-16 08:19:07 +05:30 committed by GitHub
parent 4cde4aafa1
commit 2226820ad1
12 changed files with 99 additions and 87 deletions

View File

@@ -110,14 +110,16 @@ task lint(type: Exec, dependsOn: installDev) {
"source ${venv_name}/bin/activate && set -x && " +
"black --check --diff src/ tests/ examples/ && " +
"isort --check --diff src/ tests/ examples/ && " +
"flake8 --count --statistics src/ tests/ examples/ && " +
"ruff check src/ tests/ examples/ && " +
"mypy --show-traceback --show-error-codes src/ tests/ examples/"
}
task lintFix(type: Exec, dependsOn: installDev) {
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"black src/ tests/ examples/ && " +
"isort src/ tests/ examples/"
"isort src/ tests/ examples/ && " +
"ruff check --fix src/ tests/ examples/"
}
def pytest_default_env = "PYTHONDEVMODE=1"

View File

@@ -183,7 +183,7 @@ We use black, isort, flake8, and mypy to ensure consistent code style and qualit
# Assumes: pip install -e '.[dev]' and venv is activated
black src/ tests/
isort src/ tests/
flake8 src/ tests/
ruff check src/ tests/
mypy src/ tests/
```

View File

@@ -9,7 +9,7 @@ extend-exclude = '''
^/tmp
'''
include = '\.pyi?$'
target-version = ['py37', 'py38', 'py39', 'py310']
target-version = ['py38', 'py39', 'py310', 'py311']
[tool.isort]
combine_as_imports = true
@@ -26,6 +26,52 @@ extraPaths = ['tests']
exclude = ["src/datahub/metadata/"]
ignore_decorators = ["@click.*", "@validator", "@root_validator", "@pydantic.validator", "@pydantic.root_validator", "@pytest.fixture"]
ignore_names = ["*Source", "*Sink", "*Report"]
# min_confidence = 80
paths = ["src"]
sort_by_size = true
[tool.ruff]
# Same as Black.
line-length = 88
# Exclude directories matching these patterns.
exclude = [
".git",
"src/datahub/metadata",
"venv",
".tox",
"__pycache__",
]
[tool.ruff.lint]
select = [
"B",
"C90",
"E",
"F",
"TID",
]
ignore = [
# Ignore line length violations (handled by Black)
"E501",
# Ignore whitespace before ':' (matches Black)
"E203",
# Allow usages of functools.lru_cache
"B019",
# Allow function call in argument defaults
"B008",
# TODO: Enable these later
"B006", # Mutable args
"B007", # Unused loop control variable
"B017", # Do not assert blind exception
"B904", # Checks for raise statements in exception handlers that lack a from clause
]
[tool.ruff.lint.mccabe]
max-complexity = 20
[tool.ruff.lint.flake8-tidy-imports]
# Disallow all relative imports.
ban-relative-imports = "all"
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"]

View File

@@ -1,39 +1,3 @@
[flake8]
max-complexity = 20
ignore =
# Ignore: line length issues, since black's formatter will take care of them.
E501,
# Ignore compound statements, since they're used for ellipsis by black
# See https://github.com/psf/black/issues/3887
E704,
# Ignore: 1 blank line required before class docstring.
D203,
# See https://stackoverflow.com/a/57074416.
W503,
# See https://github.com/psf/black/issues/315.
E203,
# Allow usages of functools.lru_cache.
B019,
# This rule flags the use of function calls in argument defaults.
# There's some good reasons to do this, so we're ok with it.
B008,
# TODO: However, we should enable B006 to catch issues with mutable args.
B006,
# TODO: Enable B007 - unused loop control variable.
B007
# TODO: Enable B902 - require self/cls naming.
# TODO: Enable B904 - use raise from in except clauses.
exclude =
.git,
src/datahub/metadata,
venv,
.tox,
__pycache__
per-file-ignores =
# imported but unused
__init__.py: F401, I250
ban-relative-imports = true
[mypy]
plugins =
./tests/test_helpers/sqlalchemy_mypy_plugin.py,

View File

@@ -593,9 +593,7 @@ lint_requirements = {
# This is pinned only to avoid spurious errors in CI.
# We should make an effort to keep it up to date.
"black==23.3.0",
"flake8>=6.0.0",
"flake8-tidy-imports>=4.3.0",
"flake8-bugbear==23.3.12",
"ruff==0.9.1",
"isort>=5.7.0",
"mypy==1.10.1",
}

View File

@@ -267,7 +267,6 @@ def _is_single_row_query_method(query: Any) -> bool:
"get_column_max",
"get_column_mean",
"get_column_stdev",
"get_column_stdev",
"get_column_nonnull_count",
"get_column_unique_count",
}
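
Side note on the hunk above: the repeated "get_column_stdev" entry was dead text, since duplicate elements in a set literal are collapsed at runtime; this is the sort of redundancy a duplicate-value check in the newly selected B rules (bugbear's B033) surfaces. A minimal sketch:

```
# Minimal sketch: the duplicate entry does not change the set, it only
# misleads readers, which is why a duplicate-value check reports it.
single_row_methods = {
    "get_column_max",
    "get_column_stdev",
    "get_column_stdev",  # reported as a duplicate value
}
assert len(single_row_methods) == 2
```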

View File

@@ -893,11 +893,11 @@ class ModeSource(StatefulIngestionSourceBase):
jinja_params[key] = parameters[key].get("default", "")
normalized_query = re.sub(
r"{% form %}(.*){% endform %}",
"",
query,
0,
re.MULTILINE | re.DOTALL,
pattern=r"{% form %}(.*){% endform %}",
repl="",
string=query,
count=0,
flags=re.MULTILINE | re.DOTALL,
)
# Wherever we don't resolve the jinja params, we replace it with NULL
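
The rewrite above keeps the same substitution but passes every argument to re.sub by keyword, so `count` and `flags` can no longer be confused; with the positional form it is easy to drop `count` and have a flags value silently interpreted as a count (bugbear's B034 targets exactly this pattern). A self-contained sketch with a made-up query string:

```
import re

query = "{% form %}param: value{% endform %} SELECT 1 AS col"

# Keyword arguments make it explicit which value is `count` and which is
# `flags`; in the positional form, omitting count would make the flags
# value be treated as a count.
normalized_query = re.sub(
    pattern=r"{% form %}(.*){% endform %}",
    repl="",
    string=query,
    count=0,
    flags=re.MULTILINE | re.DOTALL,
)
print(normalized_query)  # " SELECT 1 AS col"
```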

View File

@@ -96,7 +96,7 @@ class PowerBiAPI:
url: str = e.request.url if e.request else "URL not available"
self.reporter.warning(
title="Metadata API Timeout",
message=f"Metadata endpoints are not reachable. Check network connectivity to PowerBI Service.",
message="Metadata endpoints are not reachable. Check network connectivity to PowerBI Service.",
context=f"url={url}",
)
@@ -173,7 +173,7 @@ class PowerBiAPI:
entity=entity_name,
entity_id=entity_id,
)
except: # It will catch all type of exception
except Exception:
e = self.log_http_error(
message=f"Unable to fetch users for {entity_name}({entity_id})."
)
@@ -210,7 +210,7 @@ class PowerBiAPI:
message="A cross-workspace reference that failed to be resolved. Please ensure that no global workspace is being filtered out due to the workspace_id_pattern.",
context=f"report-name: {report.name} and dataset-id: {report.dataset_id}",
)
except:
except Exception:
self.log_http_error(
message=f"Unable to fetch reports for workspace {workspace.name}"
)
@@ -260,7 +260,7 @@ class PowerBiAPI:
groups = self._get_resolver().get_groups(filter_=filter_)
except:
except Exception:
self.log_http_error(message="Unable to fetch list of workspaces")
# raise # we want this exception to bubble up
@@ -292,7 +292,7 @@ class PowerBiAPI:
modified_workspace_ids = self.__admin_api_resolver.get_modified_workspaces(
self.__config.modified_since
)
except:
except Exception:
self.log_http_error(message="Unable to fetch list of modified workspaces.")
return modified_workspace_ids
@@ -303,8 +303,8 @@ class PowerBiAPI:
scan_id = self.__admin_api_resolver.create_scan_job(
workspace_ids=workspace_ids
)
except:
e = self.log_http_error(message=f"Unable to fetch get scan result.")
except Exception:
e = self.log_http_error(message="Unable to fetch get scan result.")
if data_resolver.is_permission_error(cast(Exception, e)):
logger.warning(
"Dataset lineage can not be ingestion because this user does not have access to the PowerBI Admin "

View File

@@ -384,7 +384,6 @@ TRINO_SQL_TYPES_MAP: Dict[str, Any] = {
"varchar": StringType,
"char": StringType,
"varbinary": BytesType,
"json": RecordType,
"date": DateType,
"time": TimeType,
"timestamp": TimeType,

View File

@@ -1,7 +1,7 @@
from collections import deque
from itertools import chain
from sys import getsizeof
from typing import Any, Callable
from typing import Any, Iterator
def total_size(o: Any, handlers: Any = {}) -> int:
@@ -15,7 +15,8 @@ def total_size(o: Any, handlers: Any = {}) -> int:
Based on https://github.com/ActiveState/recipe-577504-compute-mem-footprint/blob/master/recipe.py
"""
dict_handler: Callable[[Any], chain[Any]] = lambda d: chain.from_iterable(d.items())
def dict_handler(d: dict) -> Iterator[Any]:
return chain.from_iterable(d.items())
all_handlers = {
tuple: iter,

View File

@@ -36,7 +36,7 @@ def my_logging_fn():
logger.warning("This is a warning message")
logger.error("this is an error with no stack trace")
try:
1 / 0
_ = 1 / 0
except ZeroDivisionError:
logger.exception("failed to divide by zero")

View File

@@ -9,36 +9,38 @@ from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
@pytest.mark.parametrize("length, sampling", [(10, False), (100, True)])
def test_lossylist_sampling(length, sampling):
l: LossyList[str] = LossyList()
l_dict: LossyList[str] = LossyList()
for i in range(0, length):
l.append(f"{i} Hello World")
l_dict.append(f"{i} Hello World")
assert len(l) == length
assert l.sampled is sampling
assert len(l_dict) == length
assert l_dict.sampled is sampling
if sampling:
assert f"... sampled of {length} total elements" in str(l)
assert f"... sampled of {length} total elements" in str(l_dict)
else:
assert "sampled" not in str(l)
assert "sampled" not in str(l_dict)
list_version = [int(i.split(" ")[0]) for i in l]
list_version = [int(i.split(" ")[0]) for i in l_dict]
print(list_version)
assert sorted(list_version) == list_version
@pytest.mark.parametrize("length, sampling", [(10, False), (100, True)])
def test_lossyset_sampling(length, sampling):
l: LossySet[str] = LossySet()
lossy_set: LossySet[str] = LossySet()
for i in range(0, length):
l.add(f"{i} Hello World")
lossy_set.add(f"{i} Hello World")
assert len(l) == min(10, length)
assert l.sampled is sampling
assert len(lossy_set) == min(10, length)
assert lossy_set.sampled is sampling
if sampling:
assert f"... sampled with at most {length-10} elements missing" in str(l)
assert f"... sampled with at most {length-10} elements missing" in str(
lossy_set
)
else:
assert "sampled" not in str(l)
assert "sampled" not in str(lossy_set)
list_version = [int(i.split(" ")[0]) for i in l]
list_version = [int(i.split(" ")[0]) for i in lossy_set]
set_version = set(list_version)
assert len(list_version) == len(set_version)
@@ -49,35 +51,36 @@ def test_lossyset_sampling(length, sampling):
"length, sampling, sub_length", [(4, False, 4), (10, False, 14), (100, True, 1000)]
)
def test_lossydict_sampling(length, sampling, sub_length):
l: LossyDict[int, LossyList[str]] = LossyDict()
lossy_dict: LossyDict[int, LossyList[str]] = LossyDict()
elements_added = 0
element_length_map = {}
for i in range(0, length):
list_length = random.choice(range(1, sub_length))
element_length_map[i] = 0
for _num_elements in range(0, list_length):
if not l.get(i):
if not lossy_dict.get(i):
elements_added += 1
# reset to 0 until we get it back
element_length_map[i] = 0
else:
element_length_map[i] = len(l[i])
element_length_map[i] = len(lossy_dict[i])
current_list = l.get(i, LossyList())
current_list = lossy_dict.get(i, LossyList())
current_list.append(f"{i}:{round(time.time(),2)} Hello World")
l[i] = current_list
lossy_dict[i] = current_list
element_length_map[i] += 1
assert len(l) == min(l.max_elements, length)
assert l.sampled is sampling
assert len(lossy_dict) == min(lossy_dict.max_elements, length)
assert lossy_dict.sampled is sampling
if sampling:
assert re.search("sampled of at most .* entries.", str(l))
assert f"{l.max_elements} sampled of at most {elements_added} entries." in str(
l
assert re.search("sampled of at most .* entries.", str(lossy_dict))
assert (
f"{lossy_dict.max_elements} sampled of at most {elements_added} entries."
in str(lossy_dict)
)
else:
# cheap way to determine that the dict isn't reporting sampled keys
assert not re.search("sampled of at most .* entries.", str(l))
assert not re.search("sampled of at most .* entries.", str(lossy_dict))
for k, v in l.items():
for k, v in lossy_dict.items():
assert len(v) == element_length_map[k]
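
The renames throughout this test file replace the single-letter name `l`, which the selected pycodestyle rules report as an ambiguous variable name (E741) because it is easily misread as `1` or `I`; descriptive names such as `lossy_set` and `lossy_dict` avoid the warning without suppressions. A minimal sketch:

```
# Minimal sketch: the first binding is reported as an ambiguous variable
# name (E741); a descriptive name carries the same value without the warning.
l = ["a", "b"]            # flagged: `l` is easily misread as `1` or `I`
lossy_items = ["a", "b"]  # fine
print(lossy_items)
```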