mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-05 11:32:35 +00:00
feat: add requires_dependencies decorator (#302)
* Add `requires_dependencies` decorator * Use `required_dependencies` on Reddit & S3 * Fix bug in `requires_dependencies` To use named args the decorator needs to be also wrapped * Add `requires_dependencies` integration tests * Add `requires_dependencies` in `Competition.md` * Update `CHANGELOG.md` * Bump version 0.4.16-dev5 * Ignore `F401` unused imports in `requires_dependencies` tests * Apply suggestions from code review * Add `functools.wrap` to keep docs, & annotations * Use `requires_dependencies` in `GitHubConnector`
This commit is contained in:
parent
54a6db1c2c
commit
e52dd5c179
@ -2,6 +2,9 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Add `requires_dependencies` Python decorator to check dependencies are installed before
|
||||
instantiating a class or running a function
|
||||
|
||||
### Features
|
||||
|
||||
* Added Wikipedia connector for ingest cli.
|
||||
|
||||
@ -67,6 +67,7 @@ In checklist form, the above steps are summarized as:
|
||||
- [ ] Add them as an extra to [setup.py](unstructured/setup.py).
|
||||
- [ ] Update the Makefile, adding a target for `install-ingest-<name>` and adding another `pip-compile` line to the `pip-compile` make target. See [this commit](https://github.com/Unstructured-IO/unstructured/commit/ab542ca3c6274f96b431142262d47d727f309e37) for a reference.
|
||||
- [ ] The added dependencies should be imported at runtime when the new connector is invoked, rather than as top-level imports.
|
||||
- [ ] Add the decorator `unstructured.utils.requires_dependencies` on top of each class instance or function that uses those connector-specific dependencies, e.g. for `S3Connector` it should look like `@requires_dependencies(dependencies=["boto3"], extras="s3")`
|
||||
- [ ] Honors the conventions of `BaseConnectorConfig` defined in [unstructured/ingest/interfaces.py](unstructured/ingest/interfaces.py) which is passed through [the CLI](unstructured/ingest/main.py):
|
||||
- [ ] If running with an `.output_dir` where structured outputs already exists for a given file, the file content is not re-downloaded from the data source nor is it reprocessed. This is made possible by implementing the call to `MyIngestDoc.has_output()` which is invoked in [MainProcess._filter_docs_with_outputs](ingest-prep-for-many/unstructured/ingest/main.py).
|
||||
- [ ] Unless `.reprocess` is `True`, then documents are always reprocessed.
|
||||
@ -74,4 +75,3 @@ In checklist form, the above steps are summarized as:
|
||||
- [ ] Else if `.preserve_download` is `False`, documents downloaded to `.download_dir` are removed after they are **successfully** processed during the invocation of `MyIngestDoc.cleanup_file()` in [process_document](unstructured/ingest/doc_processor/generalized.py)
|
||||
- [ ] Does not re-download documents to `.download_dir` if `.re_download` is False, enforced in `MyIngestDoc.get_file()`
|
||||
- [ ] Prints more details if `.verbose` similar to [unstructured/ingest/connector/s3_connector.py](unstructured/ingest/connector/s3_connector.py).
|
||||
|
||||
|
||||
@ -37,3 +37,48 @@ def test_save_as_jsonl(input_data, output_jsonl_file):
|
||||
def test_read_as_jsonl(input_jsonl_file, input_data):
    """Round-trip check: reading the fixture JSONL file yields the fixture data."""
    loaded = utils.read_from_jsonl(input_jsonl_file)
    assert loaded == input_data
|
||||
|
||||
|
||||
def test_requires_dependencies_decorator():
    """A decorated function whose single (installed) dependency exists runs normally."""

    @utils.requires_dependencies(dependencies="numpy")
    def wrapped():
        import numpy  # noqa: F401

    wrapped()
|
||||
|
||||
|
||||
def test_requires_dependencies_decorator_multiple():
    """Several installed dependencies passed as a list are all accepted."""

    @utils.requires_dependencies(dependencies=["numpy", "pandas"])
    def wrapped():
        import numpy  # noqa: F401
        import pandas  # noqa: F401

    wrapped()
|
||||
|
||||
|
||||
def test_requires_dependencies_decorator_import_error():
    """A missing dependency raises ImportError at call time, not at decoration time."""

    @utils.requires_dependencies(dependencies="not_a_package")
    def wrapped():
        import not_a_package  # noqa: F401

    with pytest.raises(ImportError):
        wrapped()
|
||||
|
||||
|
||||
def test_requires_dependencies_decorator_import_error_multiple():
    """One missing dependency in a mixed list is enough to raise ImportError."""

    @utils.requires_dependencies(dependencies=["not_a_package", "numpy"])
    def wrapped():
        import not_a_package  # noqa: F401
        import numpy  # noqa: F401

    with pytest.raises(ImportError):
        wrapped()
|
||||
|
||||
|
||||
def test_requires_dependencies_decorator_in_class():
    """The decorator also guards class instantiation, not only plain functions."""

    @utils.requires_dependencies(dependencies="numpy")
    class Wrapped:
        def __init__(self):
            import numpy  # noqa: F401

    Wrapped()
|
||||
|
||||
@ -13,6 +13,7 @@ from unstructured.ingest.interfaces import (
|
||||
BaseConnectorConfig,
|
||||
BaseIngestDoc,
|
||||
)
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from github.Repository import Repository
|
||||
@ -124,6 +125,7 @@ class GitHubIngestDoc(BaseIngestDoc):
|
||||
print(f"Wrote {output_filename}")
|
||||
|
||||
|
||||
@requires_dependencies(["pygithub"], extras="github")
|
||||
class GitHubConnector(BaseConnector):
|
||||
def __init__(self, config: SimpleGitHubConfig):
|
||||
from github import Github
|
||||
|
||||
@ -9,6 +9,7 @@ from unstructured.ingest.interfaces import (
|
||||
BaseConnectorConfig,
|
||||
BaseIngestDoc,
|
||||
)
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from praw.models import Submission
|
||||
@ -87,6 +88,7 @@ class RedditIngestDoc(BaseIngestDoc):
|
||||
print(f"Wrote {output_filename}")
|
||||
|
||||
|
||||
@requires_dependencies(["praw"], extras="reddit")
|
||||
class RedditConnector(BaseConnector):
|
||||
def __init__(self, config: SimpleRedditConfig):
|
||||
from praw import Reddit
|
||||
|
||||
@ -9,6 +9,7 @@ from unstructured.ingest.interfaces import (
|
||||
BaseConnectorConfig,
|
||||
BaseIngestDoc,
|
||||
)
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -84,6 +85,7 @@ class S3IngestDoc(BaseIngestDoc):
|
||||
"""includes "directories" in s3 object path"""
|
||||
self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@requires_dependencies(["boto3"], extras="s3")
|
||||
def get_file(self):
|
||||
"""Actually fetches the file from s3 and stores it locally."""
|
||||
import boto3
|
||||
@ -130,6 +132,7 @@ class S3IngestDoc(BaseIngestDoc):
|
||||
os.unlink(self._tmp_download_file())
|
||||
|
||||
|
||||
@requires_dependencies(["boto3"], extras="s3")
|
||||
class S3Connector(BaseConnector):
|
||||
"""Objects of this class support fetching document(s) from"""
|
||||
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
import importlib
|
||||
import json
|
||||
from typing import Dict, List
|
||||
from functools import wraps
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
|
||||
def save_as_jsonl(data: List[Dict], filename: str) -> None:
|
||||
@ -10,3 +12,30 @@ def save_as_jsonl(data: List[Dict], filename: str) -> None:
|
||||
def read_from_jsonl(filename: str) -> List[Dict]:
    """Read a JSON Lines file and return its records as a list of dicts.

    Each line of *filename* must hold one JSON document.
    """
    records = []
    with open(filename) as jsonl_file:
        for raw_line in jsonl_file:
            records.append(json.loads(raw_line))
    return records
|
||||
|
||||
|
||||
def requires_dependencies(dependencies: Union[str, List[str]], extras: Optional[str] = None):
    """Decorator factory that checks *dependencies* are importable before each call.

    Args:
        dependencies: A module name, or list of module names, that must be
            importable for the wrapped callable to run.
        extras: Optional name of the ``unstructured`` extras group that installs
            the dependencies; used to build a more helpful install hint.

    Returns:
        A decorator that wraps a function (or class) and raises ``ImportError``
        listing every missing dependency when the wrapped callable is invoked.
    """
    if isinstance(dependencies, str):
        dependencies = [dependencies]

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            missing_deps: List[str] = []
            for dep in dependencies:
                try:
                    importlib.import_module(dep)
                except ImportError:
                    missing_deps.append(dep)
            if missing_deps:
                if extras:
                    install_hint = f"Please install them using `pip install unstructured[{extras}]`."
                else:
                    install_hint = f"Please install them using `pip install {' '.join(missing_deps)}`."
                # BUGFIX: the original built this message with
                # `a + b if extras else c`, which parses as `(a + b) if extras
                # else c` — dropping the "Following dependencies are missing"
                # prefix when no extras was given — and omitted the space
                # before the install hint ("...numpy.Please install...").
                raise ImportError(
                    f"Following dependencies are missing: {', '.join(missing_deps)}. "
                    + install_hint
                )
            return func(*args, **kwargs)

        return wrapper

    return decorator
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user