mirror of https://github.com/datahub-project/datahub.git (synced 2025-12-26 09:26:22 +00:00)

dev(ingest): use ruff instead of flake8 (#12359)

parent 4cde4aafa1
commit 2226820ad1
@@ -110,14 +110,16 @@ task lint(type: Exec, dependsOn: installDev) {
         "source ${venv_name}/bin/activate && set -x && " +
         "black --check --diff src/ tests/ examples/ && " +
         "isort --check --diff src/ tests/ examples/ && " +
-        "flake8 --count --statistics src/ tests/ examples/ && " +
+        "ruff check src/ tests/ examples/ && " +
         "mypy --show-traceback --show-error-codes src/ tests/ examples/"
 }
 
 task lintFix(type: Exec, dependsOn: installDev) {
     commandLine 'bash', '-c',
         "source ${venv_name}/bin/activate && set -x && " +
         "black src/ tests/ examples/ && " +
-        "isort src/ tests/ examples/"
+        "isort src/ tests/ examples/ && " +
+        "ruff check --fix src/ tests/ examples/"
 }
 
 def pytest_default_env = "PYTHONDEVMODE=1"
 
@@ -183,7 +183,7 @@ We use black, isort, flake8, and mypy to ensure consistent code style and quality
 # Assumes: pip install -e '.[dev]' and venv is activated
 black src/ tests/
 isort src/ tests/
-flake8 src/ tests/
+ruff check src/ tests/
 mypy src/ tests/
 ```
 
@@ -9,7 +9,7 @@ extend-exclude = '''
 ^/tmp
 '''
 include = '\.pyi?$'
-target-version = ['py37', 'py38', 'py39', 'py310']
+target-version = ['py38', 'py39', 'py310', 'py311']
 
 [tool.isort]
 combine_as_imports = true
@@ -26,6 +26,52 @@ extraPaths = ['tests']
 exclude = ["src/datahub/metadata/"]
 ignore_decorators = ["@click.*", "@validator", "@root_validator", "@pydantic.validator", "@pydantic.root_validator", "@pytest.fixture"]
 ignore_names = ["*Source", "*Sink", "*Report"]
 # min_confidence = 80
 paths = ["src"]
 sort_by_size = true
+
+[tool.ruff]
+# Same as Black.
+line-length = 88
+# Exclude directories matching these patterns.
+exclude = [
+    ".git",
+    "src/datahub/metadata",
+    "venv",
+    ".tox",
+    "__pycache__",
+]
+
+[tool.ruff.lint]
+select = [
+    "B",
+    "C90",
+    "E",
+    "F",
+    "TID",
+]
+ignore = [
+    # Ignore line length violations (handled by Black)
+    "E501",
+    # Ignore whitespace before ':' (matches Black)
+    "E203",
+    # Allow usages of functools.lru_cache
+    "B019",
+    # Allow function call in argument defaults
+    "B008",
+    # TODO: Enable these later
+    "B006",  # Mutable args
+    "B007",  # Unused loop control variable
+    "B017",  # Do not assert blind exception
+    "B904",  # Checks for raise statements in exception handlers that lack a from clause
+]
+
+[tool.ruff.lint.mccabe]
+max-complexity = 20
+
+[tool.ruff.lint.flake8-tidy-imports]
+# Disallow all relative imports.
+ban-relative-imports = "all"
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["F401"]
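To make the new configuration concrete, the following is a minimal sketch (not part of the commit; the file contents are invented) of code patterns the selected rule families would flag under this [tool.ruff] setup:

```python
# Scratch example, not from the repository.
from os import path  # F401: imported but unused (only "__init__.py" is exempted above)


def append_item(item, bucket=[]):  # B006: mutable default argument (currently ignored above)
    for i in range(3):  # B007: unused loop control variable (currently ignored above)
        bucket.append(item)
    return bucket
```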
@@ -1,39 +1,3 @@
-[flake8]
-max-complexity = 20
-ignore =
-    # Ignore: line length issues, since black's formatter will take care of them.
-    E501,
-    # Ignore compound statements, since they're used for ellipsis by black
-    # See https://github.com/psf/black/issues/3887
-    E704,
-    # Ignore: 1 blank line required before class docstring.
-    D203,
-    # See https://stackoverflow.com/a/57074416.
-    W503,
-    # See https://github.com/psf/black/issues/315.
-    E203,
-    # Allow usages of functools.lru_cache.
-    B019,
-    # This rule flags the use of function calls in argument defaults.
-    # There's some good reasons to do this, so we're ok with it.
-    B008,
-    # TODO: However, we should enable B006 to catch issues with mutable args.
-    B006,
-    # TODO: Enable B007 - unused loop control variable.
-    B007
-    # TODO: Enable B902 - require self/cls naming.
-    # TODO: Enable B904 - use raise from in except clauses.
-exclude =
-    .git,
-    src/datahub/metadata,
-    venv,
-    .tox,
-    __pycache__
-per-file-ignores =
-    # imported but unused
-    __init__.py: F401, I250
-ban-relative-imports = true
-
 [mypy]
 plugins =
     ./tests/test_helpers/sqlalchemy_mypy_plugin.py,
@@ -593,9 +593,7 @@ lint_requirements = {
     # This is pinned only to avoid spurious errors in CI.
     # We should make an effort to keep it up to date.
     "black==23.3.0",
-    "flake8>=6.0.0",
-    "flake8-tidy-imports>=4.3.0",
-    "flake8-bugbear==23.3.12",
+    "ruff==0.9.1",
     "isort>=5.7.0",
     "mypy==1.10.1",
 }
@@ -267,7 +267,6 @@ def _is_single_row_query_method(query: Any) -> bool:
     "get_column_max",
     "get_column_mean",
     "get_column_stdev",
-    "get_column_stdev",
     "get_column_nonnull_count",
     "get_column_unique_count",
 }
@@ -893,11 +893,11 @@ class ModeSource(StatefulIngestionSourceBase):
             jinja_params[key] = parameters[key].get("default", "")
 
         normalized_query = re.sub(
-            r"{% form %}(.*){% endform %}",
-            "",
-            query,
-            0,
-            re.MULTILINE | re.DOTALL,
+            pattern=r"{% form %}(.*){% endform %}",
+            repl="",
+            string=query,
+            count=0,
+            flags=re.MULTILINE | re.DOTALL,
         )
 
         # Wherever we don't resolve the jinja params, we replace it with NULL
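For reference, the keyword-argument form of re.sub above behaves the same as the positional form it replaces; here is a self-contained sketch (the sample query string is invented):

```python
import re

query = "SELECT 1 {% form %} ignored {% endform %} FROM t"  # invented sample input
normalized_query = re.sub(
    pattern=r"{% form %}(.*){% endform %}",
    repl="",
    string=query,
    count=0,
    flags=re.MULTILINE | re.DOTALL,
)
print(normalized_query)  # -> "SELECT 1  FROM t"
```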
@@ -96,7 +96,7 @@ class PowerBiAPI:
             url: str = e.request.url if e.request else "URL not available"
             self.reporter.warning(
                 title="Metadata API Timeout",
-                message=f"Metadata endpoints are not reachable. Check network connectivity to PowerBI Service.",
+                message="Metadata endpoints are not reachable. Check network connectivity to PowerBI Service.",
                 context=f"url={url}",
             )
 
@@ -173,7 +173,7 @@ class PowerBiAPI:
                 entity=entity_name,
                 entity_id=entity_id,
             )
-        except:  # It will catch all type of exception
+        except Exception:
             e = self.log_http_error(
                 message=f"Unable to fetch users for {entity_name}({entity_id})."
             )
@@ -210,7 +210,7 @@ class PowerBiAPI:
                     message="A cross-workspace reference that failed to be resolved. Please ensure that no global workspace is being filtered out due to the workspace_id_pattern.",
                     context=f"report-name: {report.name} and dataset-id: {report.dataset_id}",
                 )
-        except:
+        except Exception:
             self.log_http_error(
                 message=f"Unable to fetch reports for workspace {workspace.name}"
             )
@@ -260,7 +260,7 @@ class PowerBiAPI:
 
             groups = self._get_resolver().get_groups(filter_=filter_)
 
-        except:
+        except Exception:
             self.log_http_error(message="Unable to fetch list of workspaces")
             # raise # we want this exception to bubble up
 
@@ -292,7 +292,7 @@ class PowerBiAPI:
             modified_workspace_ids = self.__admin_api_resolver.get_modified_workspaces(
                 self.__config.modified_since
             )
-        except:
+        except Exception:
             self.log_http_error(message="Unable to fetch list of modified workspaces.")
 
         return modified_workspace_ids
@@ -303,8 +303,8 @@ class PowerBiAPI:
             scan_id = self.__admin_api_resolver.create_scan_job(
                 workspace_ids=workspace_ids
             )
-        except:
-            e = self.log_http_error(message=f"Unable to fetch get scan result.")
+        except Exception:
+            e = self.log_http_error(message="Unable to fetch get scan result.")
             if data_resolver.is_permission_error(cast(Exception, e)):
                 logger.warning(
                     "Dataset lineage can not be ingestion because this user does not have access to the PowerBI Admin "
 
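The bare except: handlers replaced in the hunks above are the kind of pattern flagged by pycodestyle's E722, which is part of the selected E family. A minimal, self-contained sketch (the failing call is a hypothetical stand-in, not the PowerBI client):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def fetch_scan_result() -> None:
    # Hypothetical stand-in for an admin API call that may fail.
    raise ValueError("simulated API failure")


try:
    fetch_scan_result()
except Exception:  # unlike a bare `except:`, KeyboardInterrupt and SystemExit still propagate
    logger.warning("Unable to fetch scan result.", exc_info=True)
```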
@@ -384,7 +384,6 @@ TRINO_SQL_TYPES_MAP: Dict[str, Any] = {
     "varchar": StringType,
     "char": StringType,
     "varbinary": BytesType,
-    "json": RecordType,
     "date": DateType,
     "time": TimeType,
     "timestamp": TimeType,
@@ -1,7 +1,7 @@
 from collections import deque
 from itertools import chain
 from sys import getsizeof
-from typing import Any, Callable
+from typing import Any, Iterator
 
 
 def total_size(o: Any, handlers: Any = {}) -> int:
@@ -15,7 +15,8 @@ def total_size(o: Any, handlers: Any = {}) -> int:
     Based on https://github.com/ActiveState/recipe-577504-compute-mem-footprint/blob/master/recipe.py
     """
 
-    dict_handler: Callable[[Any], chain[Any]] = lambda d: chain.from_iterable(d.items())
+    def dict_handler(d: dict) -> Iterator[Any]:
+        return chain.from_iterable(d.items())
 
     all_handlers = {
         tuple: iter,
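Replacing the assigned lambda with a def matches E731 ("do not assign a lambda expression, use a def"), presumably one of the checks picked up by selecting the E family. A short standalone usage sketch of the new helper shape (not the repository's total_size function):

```python
from itertools import chain
from typing import Any, Iterator


def dict_handler(d: dict) -> Iterator[Any]:
    # Flattens a dict into an alternating key/value stream, as in the handler above.
    return chain.from_iterable(d.items())


print(list(dict_handler({"a": 1, "b": 2})))  # -> ['a', 1, 'b', 2]
```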
@@ -36,7 +36,7 @@ def my_logging_fn():
     logger.warning("This is a warning message")
     logger.error("this is an error with no stack trace")
     try:
-        1 / 0
+        _ = 1 / 0
     except ZeroDivisionError:
         logger.exception("failed to divide by zero")
 
 
@@ -9,36 +9,38 @@ from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
 
 @pytest.mark.parametrize("length, sampling", [(10, False), (100, True)])
 def test_lossylist_sampling(length, sampling):
-    l: LossyList[str] = LossyList()
+    l_dict: LossyList[str] = LossyList()
     for i in range(0, length):
-        l.append(f"{i} Hello World")
+        l_dict.append(f"{i} Hello World")
 
-    assert len(l) == length
-    assert l.sampled is sampling
+    assert len(l_dict) == length
+    assert l_dict.sampled is sampling
     if sampling:
-        assert f"... sampled of {length} total elements" in str(l)
+        assert f"... sampled of {length} total elements" in str(l_dict)
     else:
-        assert "sampled" not in str(l)
+        assert "sampled" not in str(l_dict)
 
-    list_version = [int(i.split(" ")[0]) for i in l]
+    list_version = [int(i.split(" ")[0]) for i in l_dict]
     print(list_version)
     assert sorted(list_version) == list_version
 
 
 @pytest.mark.parametrize("length, sampling", [(10, False), (100, True)])
 def test_lossyset_sampling(length, sampling):
-    l: LossySet[str] = LossySet()
+    lossy_set: LossySet[str] = LossySet()
     for i in range(0, length):
-        l.add(f"{i} Hello World")
+        lossy_set.add(f"{i} Hello World")
 
-    assert len(l) == min(10, length)
-    assert l.sampled is sampling
+    assert len(lossy_set) == min(10, length)
+    assert lossy_set.sampled is sampling
     if sampling:
-        assert f"... sampled with at most {length-10} elements missing" in str(l)
+        assert f"... sampled with at most {length-10} elements missing" in str(
+            lossy_set
+        )
     else:
-        assert "sampled" not in str(l)
+        assert "sampled" not in str(lossy_set)
 
-    list_version = [int(i.split(" ")[0]) for i in l]
+    list_version = [int(i.split(" ")[0]) for i in lossy_set]
     set_version = set(list_version)
 
     assert len(list_version) == len(set_version)
@@ -49,35 +51,36 @@ def test_lossyset_sampling(length, sampling):
     "length, sampling, sub_length", [(4, False, 4), (10, False, 14), (100, True, 1000)]
 )
 def test_lossydict_sampling(length, sampling, sub_length):
-    l: LossyDict[int, LossyList[str]] = LossyDict()
+    lossy_dict: LossyDict[int, LossyList[str]] = LossyDict()
     elements_added = 0
     element_length_map = {}
     for i in range(0, length):
         list_length = random.choice(range(1, sub_length))
         element_length_map[i] = 0
         for _num_elements in range(0, list_length):
-            if not l.get(i):
+            if not lossy_dict.get(i):
                 elements_added += 1
                 # reset to 0 until we get it back
                 element_length_map[i] = 0
             else:
-                element_length_map[i] = len(l[i])
+                element_length_map[i] = len(lossy_dict[i])
 
-            current_list = l.get(i, LossyList())
+            current_list = lossy_dict.get(i, LossyList())
             current_list.append(f"{i}:{round(time.time(),2)} Hello World")
-            l[i] = current_list
+            lossy_dict[i] = current_list
             element_length_map[i] += 1
 
-    assert len(l) == min(l.max_elements, length)
-    assert l.sampled is sampling
+    assert len(lossy_dict) == min(lossy_dict.max_elements, length)
+    assert lossy_dict.sampled is sampling
     if sampling:
-        assert re.search("sampled of at most .* entries.", str(l))
-        assert f"{l.max_elements} sampled of at most {elements_added} entries." in str(
-            l
+        assert re.search("sampled of at most .* entries.", str(lossy_dict))
+        assert (
+            f"{lossy_dict.max_elements} sampled of at most {elements_added} entries."
+            in str(lossy_dict)
         )
     else:
         # cheap way to determine that the dict isn't reporting sampled keys
-        assert not re.search("sampled of at most .* entries.", str(l))
+        assert not re.search("sampled of at most .* entries.", str(lossy_dict))
 
-    for k, v in l.items():
+    for k, v in lossy_dict.items():
         assert len(v) == element_length_map[k]
 
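The variable renames in the two test hunks above (l to l_dict, lossy_set, and lossy_dict) line up with E741, which flags ambiguous single-character names such as l; this attribution is an inference, not stated in the commit. A minimal sketch:

```python
# Scratch example, not from the repository.
items = ["a", "b", "c"]

# l = list(items)        # E741: ambiguous variable name `l`
item_list = list(items)  # a descriptive name passes
print(len(item_list))    # -> 3
```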