feat: add requires_dependencies decorator (#302)

* Add `requires_dependencies` decorator

* Use `requires_dependencies` on Reddit & S3

* Fix bug in `requires_dependencies`

To use named args, the decorator also needs to be wrapped

* Add `requires_dependencies` integration tests

* Add `requires_dependencies` in `Competition.md`

* Update `CHANGELOG.md`

* Bump version 0.4.16-dev5

* Ignore `F401` unused imports in `requires_dependencies` tests

* Apply suggestions from code review

* Add `functools.wraps` to keep docs & annotations

* Use `requires_dependencies` in `GitHubConnector`
This commit is contained in:
Alvaro Bartolome 2023-02-28 15:50:39 +01:00 committed by GitHub
parent 54a6db1c2c
commit e52dd5c179
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 86 additions and 2 deletions

View File

@ -2,6 +2,9 @@
### Enhancements
* Add `requires_dependencies` Python decorator to check dependencies are installed before
instantiating a class or running a function
### Features
* Added Wikipedia connector for ingest cli.

View File

@ -67,6 +67,7 @@ In checklist form, the above steps are summarized as:
- [ ] Add them as an extra to [setup.py](unstructured/setup.py).
- [ ] Update the Makefile, adding a target for `install-ingest-<name>` and adding another `pip-compile` line to the `pip-compile` make target. See [this commit](https://github.com/Unstructured-IO/unstructured/commit/ab542ca3c6274f96b431142262d47d727f309e37) for a reference.
- [ ] The added dependencies should be imported at runtime when the new connector is invoked, rather than as top-level imports.
- [ ] Add the decorator `unstructured.utils.requires_dependencies` on top of each class or function that uses those connector-specific dependencies, e.g. for `S3Connector` it should look like `@requires_dependencies(dependencies=["boto3"], extras="s3")`
- [ ] Honors the conventions of `BaseConnectorConfig` defined in [unstructured/ingest/interfaces.py](unstructured/ingest/interfaces.py) which is passed through [the CLI](unstructured/ingest/main.py):
- [ ] If running with an `.output_dir` where structured outputs already exists for a given file, the file content is not re-downloaded from the data source nor is it reprocessed. This is made possible by implementing the call to `MyIngestDoc.has_output()` which is invoked in [MainProcess._filter_docs_with_outputs](ingest-prep-for-many/unstructured/ingest/main.py).
- [ ] Unless `.reprocess` is `True`, in which case documents are always reprocessed.
@ -74,4 +75,3 @@ In checklist form, the above steps are summarized as:
- [ ] Else if `.preserve_download` is `False`, documents downloaded to `.download_dir` are removed after they are **successfully** processed during the invocation of `MyIngestDoc.cleanup_file()` in [process_document](unstructured/ingest/doc_processor/generalized.py)
- [ ] Does not re-download documents to `.download_dir` if `.re_download` is False, enforced in `MyIngestDoc.get_file()`
- [ ] Prints more details if `.verbose` similar to [unstructured/ingest/connector/s3_connector.py](unstructured/ingest/connector/s3_connector.py).

View File

@ -37,3 +37,48 @@ def test_save_as_jsonl(input_data, output_jsonl_file):
def test_read_as_jsonl(input_jsonl_file, input_data):
    """Reading the JSONL fixture file yields exactly the fixture records."""
    loaded_records = utils.read_from_jsonl(input_jsonl_file)
    assert loaded_records == input_data
def test_requires_dependencies_decorator():
    """A function guarded by a single dependency given as a string runs without raising.

    Relies on `numpy` being importable in the test environment.
    """

    @utils.requires_dependencies(dependencies="numpy")
    def guarded():
        import numpy  # noqa: F401

    guarded()
def test_requires_dependencies_decorator_multiple():
    """A function guarded by a list of dependencies runs without raising.

    Relies on `numpy` and `pandas` being importable in the test environment.
    """

    @utils.requires_dependencies(dependencies=["numpy", "pandas"])
    def guarded():
        import numpy  # noqa: F401
        import pandas  # noqa: F401

    guarded()
def test_requires_dependencies_decorator_import_error():
    """Calling a function guarded by a non-existent package raises ImportError."""

    @utils.requires_dependencies(dependencies="not_a_package")
    def guarded():
        import not_a_package  # noqa: F401

    with pytest.raises(ImportError):
        guarded()
def test_requires_dependencies_decorator_import_error_multiple():
    """One non-existent package in the dependency list is enough to raise ImportError."""

    @utils.requires_dependencies(dependencies=["not_a_package", "numpy"])
    def guarded():
        import not_a_package  # noqa: F401
        import numpy  # noqa: F401

    with pytest.raises(ImportError):
        guarded()
def test_requires_dependencies_decorator_in_class():
    """The decorator can be applied to a class, and calling the decorated name still works.

    Relies on `numpy` being importable in the test environment.
    """

    @utils.requires_dependencies(dependencies="numpy")
    class Guarded:
        def __init__(self):
            import numpy  # noqa: F401

    Guarded()

View File

@ -13,6 +13,7 @@ from unstructured.ingest.interfaces import (
BaseConnectorConfig,
BaseIngestDoc,
)
from unstructured.utils import requires_dependencies
if TYPE_CHECKING:
from github.Repository import Repository
@ -124,6 +125,7 @@ class GitHubIngestDoc(BaseIngestDoc):
print(f"Wrote {output_filename}")
@requires_dependencies(["pygithub"], extras="github")
class GitHubConnector(BaseConnector):
def __init__(self, config: SimpleGitHubConfig):
from github import Github

View File

@ -9,6 +9,7 @@ from unstructured.ingest.interfaces import (
BaseConnectorConfig,
BaseIngestDoc,
)
from unstructured.utils import requires_dependencies
if TYPE_CHECKING:
from praw.models import Submission
@ -87,6 +88,7 @@ class RedditIngestDoc(BaseIngestDoc):
print(f"Wrote {output_filename}")
@requires_dependencies(["praw"], extras="reddit")
class RedditConnector(BaseConnector):
def __init__(self, config: SimpleRedditConfig):
from praw import Reddit

View File

@ -9,6 +9,7 @@ from unstructured.ingest.interfaces import (
BaseConnectorConfig,
BaseIngestDoc,
)
from unstructured.utils import requires_dependencies
@dataclass
@ -84,6 +85,7 @@ class S3IngestDoc(BaseIngestDoc):
"""includes "directories" in s3 object path"""
self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)
@requires_dependencies(["boto3"], extras="s3")
def get_file(self):
"""Actually fetches the file from s3 and stores it locally."""
import boto3
@ -130,6 +132,7 @@ class S3IngestDoc(BaseIngestDoc):
os.unlink(self._tmp_download_file())
@requires_dependencies(["boto3"], extras="s3")
class S3Connector(BaseConnector):
"""Objects of this class support fetching document(s) from"""

View File

@ -1,5 +1,7 @@
import importlib
import json
from typing import Dict, List
from functools import wraps
from typing import Dict, List, Optional, Union
def save_as_jsonl(data: List[Dict], filename: str) -> None:
@ -10,3 +12,30 @@ def save_as_jsonl(data: List[Dict], filename: str) -> None:
def read_from_jsonl(filename: str) -> List[Dict]:
    """Parse every line of `filename` as JSON and return the results in file order."""
    records = []
    with open(filename) as input_file:
        for raw_line in input_file:
            records.append(json.loads(raw_line))
    return records
def requires_dependencies(dependencies: Union[str, List[str]], extras: Optional[str] = None):
    """Build a decorator that checks modules are importable before each call.

    Args:
        dependencies: A module name, or a list of module names, passed to
            ``importlib.import_module`` every time the decorated callable is invoked.
        extras: Optional name of the ``unstructured`` extra that provides the
            dependencies. When given, the error suggests
            ``pip install unstructured[<extras>]``; otherwise it suggests installing
            the missing modules by name.

    Returns:
        A decorator. Its wrapper forwards all arguments to the decorated callable and
        returns its result. The wrapper is a plain function, so decorating a class
        rebinds the class name to a function that instantiates it.

    Raises:
        ImportError: Raised by the wrapper at call time (not at decoration time) when
            at least one dependency cannot be imported.
    """
    if isinstance(dependencies, str):
        dependencies = [dependencies]

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            missing_deps = []
            for dep in dependencies:
                try:
                    importlib.import_module(dep)
                except ImportError:
                    missing_deps.append(dep)
            if missing_deps:
                # Choose the install hint up front: a conditional expression binds
                # looser than `+`, so inlining it in the message would drop the list
                # of missing dependencies whenever `extras` is not set.
                if extras:
                    install_hint = f"pip install unstructured[{extras}]"
                else:
                    install_hint = f"pip install {' '.join(missing_deps)}"
                raise ImportError(
                    f"Following dependencies are missing: {', '.join(missing_deps)}. "
                    f"Please install them using `{install_hint}`.",
                )
            return func(*args, **kwargs)

        return wrapper

    return decorator