Fix(ingest): Deprecate --s3-url in favor of --remote-url (#616)

* deprecate --s3-url

* changelog and version

* download dir not now
This commit is contained in:
Yuming Long 2023-05-19 12:11:40 -04:00 committed by GitHub
parent 7942bc9d5b
commit ab5f92dd79
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 9 additions and 22 deletions

View File

@ -1,7 +1,8 @@
## 0.6.7-dev7
## 0.6.7-dev8
### Enhancements
* Deprecate `--s3-url` in favor of `--remote-url` in CLI
* Refactor out non-connector-specific config variables
* Add `file_directory` to metadata
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.

View File

@ -6,10 +6,10 @@ The unstructured library includes a CLI to batch ingest documents from (soon to
various) sources, storing structured outputs locally on the filesystem.
For example, the following command processes all the documents in S3 in the
`utic-dev-tech-fixtures` bucket with a prefix of `small-pdf-set/`.
`utic-dev-tech-fixtures` bucket with a prefix of `small-pdf-set/`.
unstructured-ingest \
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
--s3-anonymous \
--structured-output-dir s3-small-batch-output \
--num-processes 2
@ -30,7 +30,7 @@ When testing from a local checkout rather than a pip-installed version of `unstr
just execute `unstructured/ingest/main.py`, e.g.:
PYTHONPATH=. ./unstructured/ingest/main.py \
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
--s3-anonymous \
--structured-output-dir s3-small-batch-output \
--num-processes 2

View File

@ -9,7 +9,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/../../.. || exit 1
PYTHONPATH=. ./unstructured/ingest/main.py \
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
--s3-anonymous \
--structured-output-dir s3-small-batch-output \
--num-processes 2

View File

@ -13,7 +13,7 @@ fi
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename,file_directory \
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
--s3-anonymous \
--structured-output-dir s3-small-batch-output \
--preserve-downloads \

View File

@ -1 +1 @@
__version__ = "0.6.7-dev7" # pragma: no cover
__version__ = "0.6.7-dev8" # pragma: no cover

View File

@ -172,12 +172,6 @@ class MainProcess:
help="Remote fsspec URL formatted as `protocol://dir/path`, it can contain both "
"a directory or a single file. Supported protocols are: `s3`, `s3a`, `abfs`, and `az`.",
)
@click.option(
"--s3-url",
default=None,
help="Prefix of s3 objects (files) to download. E.g. s3://bucket1/path/. This value may "
"also be a single file. To be deprecated in favor of --remote-url.",
)
@click.option(
"--s3-anonymous",
is_flag=True,
@ -399,7 +393,6 @@ class MainProcess:
@click.option("-v", "--verbose", is_flag=True, default=False)
def main(
remote_url,
s3_url, # TODO: deprecate this in the next minor release
s3_anonymous,
azure_account_name,
azure_account_key,
@ -491,13 +484,6 @@ def main(
cache_path = Path.home() / ".cache" / "unstructured" / "ingest"
if not cache_path.exists():
cache_path.mkdir(parents=True, exist_ok=True)
if s3_url:
warnings.warn(
"The `--s3-url` option will be deprecated in favor of `--remote-url`"
" in the next minor release.",
DeprecationWarning,
)
remote_url = s3_url
if remote_url:
hashed_dir_name = hashlib.sha256(remote_url.encode("utf-8"))
elif github_url:
@ -561,7 +547,7 @@ def main(
doc_connector = S3Connector( # type: ignore
standard_config=standard_config,
config=SimpleS3Config(
path=s3_url,
path=remote_url,
access_kwargs={"anon": s3_anonymous},
),
)