mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-26 22:55:07 +00:00
Fix(ingest): Deprecate --s3-url in favor of --remote-url (#616)
* deprecation s3-url * changelog and version * download dir not now
This commit is contained in:
parent
7942bc9d5b
commit
ab5f92dd79
@ -1,7 +1,8 @@
|
||||
## 0.6.7-dev7
|
||||
## 0.6.7-dev8
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Deprecate `--s3-url` in favor of `--remote-url` in CLI
|
||||
* Refactor out non-connector-specific config variables
|
||||
* Add `file_directory` to metadata
|
||||
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.
|
||||
|
||||
@ -6,10 +6,10 @@ The unstructured library includes a CLI to batch ingest documents from (soon to
|
||||
various) sources, storing structured outputs locally on the filesystem.
|
||||
|
||||
For example, the following command processes all the documents in S3 in the
|
||||
`utic-dev-tech-fixtures` bucket with a prefix of `small-pdf-set/`.
|
||||
`utic-dev-tech-fixtures` bucket with a prefix of `small-pdf-set/`.
|
||||
|
||||
unstructured-ingest \
|
||||
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
|
||||
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
|
||||
--s3-anonymous \
|
||||
--structured-output-dir s3-small-batch-output \
|
||||
--num-processes 2
|
||||
@ -30,7 +30,7 @@ When testing from a local checkout rather than a pip-installed version of `unstr
|
||||
just execute `unstructured/ingest/main.py`, e.g.:
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
|
||||
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
|
||||
--s3-anonymous \
|
||||
--structured-output-dir s3-small-batch-output \
|
||||
--num-processes 2
|
||||
|
||||
@ -9,7 +9,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
cd "$SCRIPT_DIR"/../../.. || exit 1
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
|
||||
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
|
||||
--s3-anonymous \
|
||||
--structured-output-dir s3-small-batch-output \
|
||||
--num-processes 2
|
||||
|
||||
@ -13,7 +13,7 @@ fi
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--metadata-exclude filename,file_directory \
|
||||
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
|
||||
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
|
||||
--s3-anonymous \
|
||||
--structured-output-dir s3-small-batch-output \
|
||||
--preserve-downloads \
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.6.7-dev7" # pragma: no cover
|
||||
__version__ = "0.6.7-dev8" # pragma: no cover
|
||||
|
||||
@ -172,12 +172,6 @@ class MainProcess:
|
||||
help="Remote fsspec URL formatted as `protocol://dir/path`, it can contain both "
|
||||
"a directory or a single file. Supported protocols are: `s3`, `s3a`, `abfs`, and `az`.",
|
||||
)
|
||||
@click.option(
|
||||
"--s3-url",
|
||||
default=None,
|
||||
help="Prefix of s3 objects (files) to download. E.g. s3://bucket1/path/. This value may "
|
||||
"also be a single file. To be deprecated in favor of --remote-url.",
|
||||
)
|
||||
@click.option(
|
||||
"--s3-anonymous",
|
||||
is_flag=True,
|
||||
@ -399,7 +393,6 @@ class MainProcess:
|
||||
@click.option("-v", "--verbose", is_flag=True, default=False)
|
||||
def main(
|
||||
remote_url,
|
||||
s3_url, # TODO: deprecate this in the next minor release
|
||||
s3_anonymous,
|
||||
azure_account_name,
|
||||
azure_account_key,
|
||||
@ -491,13 +484,6 @@ def main(
|
||||
cache_path = Path.home() / ".cache" / "unstructured" / "ingest"
|
||||
if not cache_path.exists():
|
||||
cache_path.mkdir(parents=True, exist_ok=True)
|
||||
if s3_url:
|
||||
warnings.warn(
|
||||
"The `--s3-url` option will be deprecated in favor of `--remote-url`"
|
||||
" in the next minor release.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
remote_url = s3_url
|
||||
if remote_url:
|
||||
hashed_dir_name = hashlib.sha256(remote_url.encode("utf-8"))
|
||||
elif github_url:
|
||||
@ -561,7 +547,7 @@ def main(
|
||||
doc_connector = S3Connector( # type: ignore
|
||||
standard_config=standard_config,
|
||||
config=SimpleS3Config(
|
||||
path=s3_url,
|
||||
path=remote_url,
|
||||
access_kwargs={"anon": s3_anonymous},
|
||||
),
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user