ci: add job to check imports (#8594)

* try checking imports

* clarify error message

* better fmt

* do not show complete list of successfully imported packages

* refinements

* relnote

* add missing forward references

* better function name

* linting

* fix linting

* Update .github/utils/check_imports.py

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Stefano Fiorucci 2024-11-29 15:00:59 +01:00 committed by GitHub
parent 163c06f3d6
commit de7099e560
6 changed files with 129 additions and 73 deletions

.github/utils/check_imports.py (new file, 74 lines)

@@ -0,0 +1,74 @@
import os
import sys
import importlib
import traceback

from haystack import logging  # pylint: disable=unused-import # this is needed to avoid circular imports


def validate_package_imports(directory: str):
"""
Recursively search for directories with __init__.py and import them.
"""
imported = []
failed = []
sys.path.insert(0, directory)
for root, _, files in os.walk(directory):
# skip directories without __init__.py
if '__init__.py' not in files:
continue
init_path = os.path.join(root, '__init__.py')
# skip haystack/__init__.py to avoid circular imports
if init_path.endswith(f'{directory}/__init__.py'):
continue
# convert filesystem path to Python module name
relative_path = os.path.relpath(root, directory)
module_name = relative_path.replace(os.path.sep, '.')
if module_name == '.':
module_name = os.path.basename(directory)
try:
importlib.import_module(module_name)
imported.append(module_name)
except Exception:
failed.append({
'module': module_name,
'traceback': traceback.format_exc()
})
return imported, failed


def main():
"""
This script checks that all Haystack packages can be imported successfully.
This can detect several issues.
One example is forgetting to use a forward reference for a type hint coming
from a lazy import.
"""
print("Checking imports from Haystack packages...")
imported, failed = validate_package_imports(directory="haystack")
if not imported:
print("\nNO PACKAGES WERE IMPORTED")
sys.exit(1)
print(f"\nSUCCESSFULLY IMPORTED {len(imported)} PACKAGES")
if failed:
print(f"\nFAILED TO IMPORT {len(failed)} PACKAGES:")
for fail in failed:
print(f" - {fail['module']}")
print("\nERRORS:")
for fail in failed:
print(f" - {fail['module']}\n")
print(f" {fail['traceback']}\n\n")
sys.exit(1)


if __name__ == '__main__':
main()
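
The failure mode that the docstring of main() alludes to is worth spelling out. Below is a minimal sketch with hypothetical names (some_optional_sdk and ServiceClient are made up, nothing here is taken from the Haystack codebase): when a type is only available through a lazy or TYPE_CHECKING-guarded import, an unquoted annotation is evaluated as soon as the module is imported and raises NameError, while quoting it as a forward reference defers evaluation so a plain import succeeds. The OpenAPIServiceConnector hunk further down applies exactly this fix.

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # hypothetical optional dependency, only visible to type checkers
    from some_optional_sdk import ServiceClient

# Broken: the annotation is evaluated at definition time, so a bare
# "import mymodule" raises NameError because ServiceClient is not bound at runtime.
#     def call_service(client: ServiceClient) -> None: ...

# Fixed: the forward reference is just a string, so the module imports cleanly.
def call_service(client: "ServiceClient") -> None:
    ...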

GitHub Actions workflow: new check-imports job

@@ -85,6 +85,53 @@ jobs:
- "branch:${{ github.ref_name }}"
- "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
check-imports:
needs: format
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "${{ env.PYTHON_VERSION }}"
- name: Install Hatch
run: pip install hatch==${{ env.HATCH_VERSION }}
- name: Check imports
run: hatch run python .github/utils/check_imports.py
- name: Calculate alert data
id: calculator
shell: bash
if: (success() || failure()) && github.ref_name == 'main'
run: |
if [ "${{ job.status }}" = "success" ]; then
echo "alert_type=success" >> "$GITHUB_OUTPUT";
else
echo "alert_type=error" >> "$GITHUB_OUTPUT";
fi
- name: Send event to Datadog
if: (success() || failure()) && github.ref_name == 'main'
uses: masci/datadog@v1
with:
api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
api-url: https://api.datadoghq.eu
events: |
- title: "${{ github.workflow }} workflow"
text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
alert_type: "${{ steps.calculator.outputs.alert_type }}"
source_type_name: "Github"
host: ${{ github.repository_owner }}
tags:
- "project:${{ github.repository }}"
- "job:${{ github.job }}"
- "run_id:${{ github.run_id }}"
- "workflow:${{ github.workflow }}"
- "branch:${{ github.ref_name }}"
- "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
unit-tests:
name: Unit / ${{ matrix.os }}
needs: format
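
For local debugging, the same check that the "Check imports" step performs (hatch run python .github/utils/check_imports.py) can also be driven from a Python session. A minimal sketch, assuming it is run from the repository root in an environment where Haystack's dependencies are importable; the sys.path tweak is only needed because the script is a standalone file rather than a package:

import sys

sys.path.insert(0, ".github/utils")  # make check_imports.py importable from the repo root
from check_imports import validate_package_imports

imported, failed = validate_package_imports(directory="haystack")
print(f"imported {len(imported)} packages, {len(failed)} failures")
for failure in failed:
    print(failure["module"])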

OpenAPIServiceConnector (haystack component)

@@ -181,7 +181,7 @@ class OpenAPIServiceConnector:
)
return function_payloads
- def _authenticate_service(self, openapi_service: OpenAPI, credentials: Optional[Union[dict, str]] = None):
+ def _authenticate_service(self, openapi_service: "OpenAPI", credentials: Optional[Union[dict, str]] = None):
"""
Authentication with an OpenAPI service.
@@ -236,7 +236,7 @@
f"for it. Check the service configuration and credentials."
)
- def _invoke_method(self, openapi_service: OpenAPI, method_invocation_descriptor: Dict[str, Any]) -> Any:
+ def _invoke_method(self, openapi_service: "OpenAPI", method_invocation_descriptor: Dict[str, Any]) -> Any:
"""
Invokes the specified method on the OpenAPI service.
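
The two signature changes above are the "missing forward references" from the commit message: per the script's docstring, OpenAPI comes from a lazy import, so the name is not guaranteed to be bound when this module is imported, and the unquoted annotations made a plain import of the connector fail. Quoting them turns the annotations into strings that are never evaluated at import time, which is exactly the class of problem the new check_imports.py job is meant to catch.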

pyproject.toml

@@ -123,10 +123,6 @@ extra-dependencies = [
# Structured logging
"structlog",
# Looking for missing imports
"isort",
"pyproject-parser",
# Test
"pytest",
"pytest-bdd",

Release note

@@ -0,0 +1,6 @@
---
enhancements:
- |
Add a testing job to check that all packages can be imported successfully.
This should help detect several issues, such as forgetting to use a
forward reference for a type hint coming from a lazy import.

Deleted test: test_for_missing_dependencies

@@ -1,67 +0,0 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import ast
import os
from _ast import Import, ImportFrom
from pathlib import Path
import isort
import toml
from pyproject_parser import PyProject
# Some libraries have different names in the import and in the dependency
# If the test below fails because of that, add the library name to the dictionary
LIBRARY_NAMES_TO_MODULE_NAMES = {"python-dateutil": "dateutil"}
# Some standard libraries are not detected by isort. If the test below fails because of that, add the library name to the set.
ADDITIONAL_STD_LIBS = {"yaml"}
def test_for_missing_dependencies() -> None:
# We implement this manual check because:
# - the tools out there are too powerful: they find all the imports in the haystack package
# - if we imported all modules to check the imports, we would not catch direct dependencies that are also
#   sub-dependencies of other dependencies
#### Collect imports
top_level_imports = set()
for path in Path("haystack").glob("**/*.py"):
content = path.read_text(encoding="utf-8")
tree = ast.parse(content)
for item in tree.body:
if isinstance(item, Import):
module = item.names[0].name
elif isinstance(item, ImportFrom) and item.level == 0:  # level >= 1 are relative imports
module = item.module
else:
# we only care about imports
break
top_level_imports.add(module.split(".")[0])
third_party_modules = {
module
for module in top_level_imports
if isort.place_module(module) == "THIRDPARTY" and module not in ADDITIONAL_STD_LIBS
}
#### Load specified dependencies
parsed = toml.load("pyproject.toml")
# Pyproject complains about our pyproject.toml file, so we need to parse only the dependencies
# We still need `PyProject` to parse the dependencies (think of markers and stuff)
only_dependencies = {"project": {"name": "test", "dependencies": parsed["project"]["dependencies"]}}
project_dependencies = PyProject.project_table_parser.parse(only_dependencies["project"], set_defaults=True)[
"dependencies"
]
project_dependency_modules = set()
for dep in project_dependencies:
if dep.name in LIBRARY_NAMES_TO_MODULE_NAMES:
project_dependency_modules.add(LIBRARY_NAMES_TO_MODULE_NAMES[dep.name])
project_dependency_modules.add(dep.name.replace("-", "_"))
#### And now, finally: the check
for module in third_party_modules:
assert module in project_dependency_modules, f"Module {module} is not in the dependencies"
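
For reference, the heuristic in this deleted test leaned on isort's module classifier to decide which top-level imports were third-party. A minimal sketch of the behaviour it relied on (module names are illustrative, not output captured from the test run):

import isort

# isort classifies a top-level module name by its likely origin.
print(isort.place_module("os"))      # STDLIB
print(isort.place_module("pytest"))  # THIRDPARTY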