Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-06-27 02:30:08 +00:00
feat: add google cloud storage connector (#746)

parent 21c346dab8
commit 3b472cb7df

.github/workflows/ci.yml (vendored)

@@ -195,6 +195,7 @@ jobs:
     make install-ingest-s3
     make install-ingest-azure
     make install-ingest-discord
+    make install-ingest-gcs
     make install-ingest-google-drive
     make install-ingest-github
     make install-ingest-gitlab
@@ -73,6 +73,7 @@ jobs:
     make install-ingest-s3
     make install-ingest-azure
     make install-ingest-discord
+    make install-ingest-gcs
     make install-ingest-google-drive
     make install-ingest-github
     make install-ingest-gitlab

CHANGELOG.md (13 lines changed)

@@ -1,3 +1,16 @@
+## 0.7.8-dev0
+
+### Enhancements
+
+* Adds recursive functionality to all fsspec connectors
+* Adds generic --recursive ingest flag
+
+### Features
+
+* Adds Google Cloud Storage connector
+
+### Fixes
+
 ## 0.7.7
 
 ### Enhancements

MANIFEST.in

@@ -2,6 +2,7 @@ include requirements/base.in
 include requirements/huggingface.in
 include requirements/local-inference.in
 include requirements/ingest-s3.in
+include requirements/ingest-gcs.in
 include requirements/ingest-azure.in
 include requirements/ingest-discord.in
 include requirements/ingest-github.in

Makefile (5 lines changed)

@@ -62,6 +62,10 @@ install-ingest-google-drive:
 install-ingest-s3:
 	python3 -m pip install -r requirements/ingest-s3.txt
 
+.PHONY: install-ingest-gcs
+install-ingest-gcs:
+	python3 -m pip install -r requirements/ingest-gcs.txt
+
 .PHONY: install-ingest-azure
 install-ingest-azure:
 	python3 -m pip install -r requirements/ingest-azure.txt
@@ -117,6 +121,7 @@ pip-compile:
 	# sphinx docs looks for additional requirements
 	cp requirements/build.txt docs/requirements.txt
 	pip-compile --upgrade requirements/ingest-s3.in
+	pip-compile --upgrade requirements/ingest-gcs.in
 	pip-compile --upgrade requirements/ingest-azure.in
 	pip-compile --upgrade requirements/ingest-discord.in
 	pip-compile --upgrade requirements/ingest-reddit.in

docs/requirements.txt

@@ -26,7 +26,7 @@ idna==3.4
     # via requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.6.0
+importlib-metadata==6.7.0
     # via sphinx
 jinja2==3.1.2
     # via sphinx

examples/ingest/azure/ingest.sh (2 lines changed; normal file → executable file)

@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 
 # Processes all the files from abfs://container1/ in azureunstructured1 account,
 # using the `unstructured` library.
 
 # Structured outputs are stored in azure-ingest-output/
 

|
16
examples/ingest/google_cloud_storage/ingest.sh
Executable file
16
examples/ingest/google_cloud_storage/ingest.sh
Executable file
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# Processes several files in a nested folder structure from gs://utic-test-ingest-fixtures-public/
+# through Unstructured's library in 2 processes.
+
+# Structured outputs are stored in gcs-output/
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd "$SCRIPT_DIR"/../../.. || exit 1
+
+PYTHONPATH=. ./unstructured/ingest/main.py \
+  --remote-url gs://utic-test-ingest-fixtures-public/ \
+  --structured-output-dir gcs-output \
+  --num-processes 2 \
+  --recursive \
+  --verbose
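
For orientation, the download step this script drives can be reproduced with gcsfs alone. A minimal sketch, assuming anonymous read access to the public fixtures bucket and an arbitrary local target directory:

    import gcsfs

    # Anonymous, read-only access to the public fixtures bucket.
    fs = gcsfs.GCSFileSystem(token="anon")

    # Recursively copy every object under the prefix to a local directory --
    # roughly the fetch that --remote-url plus --recursive arranges inside
    # the connector before partitioning.
    fs.get("utic-test-ingest-fixtures-public/", "gcs-download/", recursive=True)
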
examples/ingest/google_drive/ingest.sh

@@ -28,7 +28,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
   --drive-service-account-key "<path to drive service account key>" \
   --structured-output-dir google-drive-ingest-output \
   --num-processes 2 \
-  --drive-recursive \
+  --recursive \
   --verbose \
   # --extension ".docx" # Ensures only .docx files are processed.
 

examples/ingest/local/ingest.sh (2 lines changed; normal file → executable file)

@@ -20,7 +20,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
   --local-input-path example-docs \
   --structured-output-dir local-ingest-output \
   --num-processes 2 \
-  --local-recursive \
+  --recursive \
   --verbose \
 
 # Alternatively, you can call it using:

requirements/base.txt

@@ -6,7 +6,7 @@
 #
 anyio==3.7.0
     # via httpcore
-argilla==1.9.0
+argilla==1.10.0
     # via -r requirements/base.in
 backoff==2.2.1
     # via argilla
@@ -51,7 +51,7 @@ idna==3.4
     #   anyio
     #   requests
     #   rfc3986
-importlib-metadata==6.6.0
+importlib-metadata==6.7.0
     # via markdown
 joblib==1.2.0
     # via nltk

requirements/build.txt

@@ -26,7 +26,7 @@ idna==3.4
     # via requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.6.0
+importlib-metadata==6.7.0
     # via sphinx
 jinja2==3.1.2
     # via sphinx

requirements/dev.txt

@@ -75,7 +75,7 @@ idna==3.4
     #   -c requirements/test.txt
     #   anyio
     #   jsonschema
-importlib-metadata==6.6.0
+importlib-metadata==6.7.0
     # via
     #   -c requirements/base.txt
     #   jupyter-client
@@ -113,7 +113,7 @@ jinja2==3.1.2
     #   nbclassic
     #   nbconvert
     #   notebook
-jsonpointer==2.3
+jsonpointer==2.4
     # via jsonschema
 jsonschema[format-nongpl]==4.17.3
     # via
@@ -132,7 +132,7 @@ jupyter-client==8.2.0
     #   qtconsole
 jupyter-console==6.6.3
     # via jupyter
-jupyter-core==5.3.0
+jupyter-core==5.3.1
     # via
     #   -c requirements/constraints.in
     #   ipykernel
@@ -165,13 +165,13 @@ matplotlib-inline==0.1.6
     # via
     #   ipykernel
     #   ipython
-mistune==2.0.5
+mistune==3.0.1
     # via nbconvert
 nbclassic==1.0.0
     # via notebook
 nbclient==0.8.0
     # via nbconvert
-nbconvert==7.5.0
+nbconvert==7.6.0
     # via
     #   jupyter
     #   jupyter-server
@@ -219,7 +219,7 @@ pip-tools==6.13.0
     # via -r requirements/dev.in
 pkgutil-resolve-name==1.3.10
     # via jsonschema
-platformdirs==3.5.3
+platformdirs==3.6.0
     # via
     #   -c requirements/test.txt
     #   jupyter-core
@@ -359,7 +359,7 @@ typing-extensions==4.6.3
     #   ipython
 uri-template==1.2.0
     # via jsonschema
-virtualenv==20.23.0
+virtualenv==20.23.1
     # via pre-commit
 wcwidth==0.2.6
     # via prompt-toolkit
@@ -369,7 +369,7 @@ webencodings==0.5.1
     # via
     #   bleach
     #   tinycss2
-websocket-client==1.5.3
+websocket-client==1.6.0
     # via jupyter-server
 wheel==0.40.0
     # via

requirements/ingest-gcs.in (new file, 4 lines)

@@ -0,0 +1,4 @@
+-c constraints.in
+-c base.txt
+gcsfs
+fsspec

requirements/ingest-gcs.txt (new file, 105 lines)

@@ -0,0 +1,105 @@
+#
+# This file is autogenerated by pip-compile with Python 3.8
+# by the following command:
+#
+#    pip-compile requirements/ingest-gcs.in
+#
+aiohttp==3.8.4
+    # via gcsfs
+aiosignal==1.3.1
+    # via aiohttp
+async-timeout==4.0.2
+    # via aiohttp
+attrs==23.1.0
+    # via aiohttp
+cachetools==5.3.1
+    # via google-auth
+certifi==2023.5.7
+    # via
+    #   -c requirements/base.txt
+    #   -c requirements/constraints.in
+    #   requests
+charset-normalizer==3.1.0
+    # via
+    #   -c requirements/base.txt
+    #   aiohttp
+    #   requests
+decorator==5.1.1
+    # via gcsfs
+frozenlist==1.3.3
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2023.6.0
+    # via
+    #   -r requirements/ingest-gcs.in
+    #   gcsfs
+gcsfs==2023.6.0
+    # via -r requirements/ingest-gcs.in
+google-api-core==2.11.1
+    # via
+    #   google-cloud-core
+    #   google-cloud-storage
+google-auth==2.20.0
+    # via
+    #   gcsfs
+    #   google-api-core
+    #   google-auth-oauthlib
+    #   google-cloud-core
+    #   google-cloud-storage
+google-auth-oauthlib==1.0.0
+    # via gcsfs
+google-cloud-core==2.3.2
+    # via google-cloud-storage
+google-cloud-storage==2.9.0
+    # via gcsfs
+google-crc32c==1.5.0
+    # via google-resumable-media
+google-resumable-media==2.5.0
+    # via google-cloud-storage
+googleapis-common-protos==1.59.1
+    # via google-api-core
+idna==3.4
+    # via
+    #   -c requirements/base.txt
+    #   requests
+    #   yarl
+multidict==6.0.4
+    # via
+    #   aiohttp
+    #   yarl
+oauthlib==3.2.2
+    # via requests-oauthlib
+protobuf==3.20.3
+    # via
+    #   -c requirements/constraints.in
+    #   google-api-core
+pyasn1==0.5.0
+    # via
+    #   pyasn1-modules
+    #   rsa
+pyasn1-modules==0.3.0
+    # via google-auth
+requests==2.31.0
+    # via
+    #   -c requirements/base.txt
+    #   gcsfs
+    #   google-api-core
+    #   google-cloud-storage
+    #   requests-oauthlib
+requests-oauthlib==1.3.1
+    # via google-auth-oauthlib
+rsa==4.9
+    # via google-auth
+six==1.16.0
+    # via
+    #   -c requirements/base.txt
+    #   google-auth
+urllib3==1.26.16
+    # via
+    #   -c requirements/base.txt
+    #   -c requirements/constraints.in
+    #   google-auth
+    #   requests
+yarl==1.9.2
+    # via aiohttp

requirements/ingest-google-drive.txt

@@ -15,9 +15,9 @@ charset-normalizer==3.1.0
     # via
     #   -c requirements/base.txt
     #   requests
-google-api-core==2.11.0
+google-api-core==2.11.1
     # via google-api-python-client
-google-api-python-client==2.89.0
+google-api-python-client==2.90.0
     # via -r requirements/ingest-google-drive.in
 google-auth==2.20.0
     # via
@@ -47,7 +47,7 @@ pyasn1==0.5.0
     #   rsa
 pyasn1-modules==0.3.0
     # via google-auth
-pyparsing==3.0.9
+pyparsing==3.1.0
     # via httplib2
 requests==2.31.0
     # via

requirements/ingest-reddit.txt

@@ -33,5 +33,5 @@ urllib3==1.26.16
     #   -c requirements/base.txt
     #   -c requirements/constraints.in
     #   requests
-websocket-client==1.5.3
+websocket-client==1.6.0
     # via praw

requirements/local-inference.txt

@@ -87,7 +87,7 @@ numpy==1.23.5
     #   transformers
 omegaconf==2.3.0
     # via effdet
-onnxruntime==1.15.0
+onnxruntime==1.15.1
     # via unstructured-inference
 opencv-python==4.7.0.72
     # via
@@ -136,7 +136,7 @@ pycparser==2.21
     # via
     #   -c requirements/base.txt
     #   cffi
-pyparsing==3.0.9
+pyparsing==3.1.0
     # via matplotlib
 pytesseract==0.3.10
     # via layoutparser

requirements/test.txt

@@ -56,7 +56,7 @@ mccabe==0.7.0
     # via flake8
 multidict==6.0.4
     # via yarl
-mypy==1.3.0
+mypy==1.4.0
     # via -r requirements/test.in
 mypy-extensions==1.0.0
     # via
@@ -69,7 +69,7 @@ packaging==23.1
     #   pytest
 pathspec==0.11.1
     # via black
-platformdirs==3.5.3
+platformdirs==3.6.0
     # via black
 pluggy==1.0.0
     # via pytest
@@ -87,7 +87,7 @@ pytest==7.3.2
     #   pytest-mock
 pytest-cov==4.1.0
     # via -r requirements/test.in
-pytest-mock==3.10.0
+pytest-mock==3.11.1
     # via -r requirements/test.in
 python-dateutil==2.8.2
     # via
@@ -99,7 +99,7 @@ requests==2.31.0
     # via
     #   -c requirements/base.txt
     #   label-studio-sdk
-ruff==0.0.272
+ruff==0.0.273
     # via -r requirements/test.in
 six==1.16.0
     # via

setup.py (1 line changed)

@@ -81,6 +81,7 @@ setup(
         "slack": load_requirements("requirements/ingest-slack.in"),
         "wikipedia": load_requirements("requirements/ingest-wikipedia.in"),
         "google-drive": load_requirements("requirements/ingest-google-drive.in"),
+        "gcs": load_requirements("requirements/ingest-gcs.in"),
     },
     package_dir={"unstructured": "unstructured"},
     package_data={"unstructured": ["nlp/*.txt"]},

New test fixtures under test_unstructured_ingest/expected-structured-output/gcs/:

@@ -0,0 +1,12 @@
+[
+  {
+    "type": "NarrativeText",
+    "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 1
+    },
+    "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
+  }
+]
@@ -0,0 +1,56 @@
+[
+  {
+    "type": "NarrativeText",
+    "element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/plain"
+    },
+    "text": "This is a test document to use for unit tests."
+  },
+  {
+    "type": "Address",
+    "element_id": "a9d4657034aa3fdb5177f1325e912362",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/plain"
+    },
+    "text": "Doylestown, PA 18901"
+  },
+  {
+    "type": "Title",
+    "element_id": "9c218520320f238595f1fde74bdd137d",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/plain"
+    },
+    "text": "Important points:"
+  },
+  {
+    "type": "ListItem",
+    "element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/plain"
+    },
+    "text": "Hamburgers are delicious"
+  },
+  {
+    "type": "ListItem",
+    "element_id": "fc1adcb8eaceac694e500a103f9f698f",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/plain"
+    },
+    "text": "Dogs are the best"
+  },
+  {
+    "type": "ListItem",
+    "element_id": "0b61e826b1c4ab05750184da72b89f83",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/plain"
+    },
+    "text": "I love fuzzy blankets"
+  }
+]
@@ -0,0 +1,12 @@
+[
+  {
+    "type": "NarrativeText",
+    "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 1
+    },
+    "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
+  }
+]
@@ -0,0 +1,56 @@
+[
+  {
+    "type": "NarrativeText",
+    "element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/plain"
+    },
+    "text": "This is a test document to use for unit tests."
+  },
+  {
+    "type": "Address",
+    "element_id": "a9d4657034aa3fdb5177f1325e912362",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/plain"
+    },
+    "text": "Doylestown, PA 18901"
+  },
+  {
+    "type": "Title",
+    "element_id": "9c218520320f238595f1fde74bdd137d",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/plain"
+    },
+    "text": "Important points:"
+  },
+  {
+    "type": "ListItem",
+    "element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/plain"
+    },
+    "text": "Hamburgers are delicious"
+  },
+  {
+    "type": "ListItem",
+    "element_id": "fc1adcb8eaceac694e500a103f9f698f",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/plain"
+    },
+    "text": "Dogs are the best"
+  },
+  {
+    "type": "ListItem",
+    "element_id": "0b61e826b1c4ab05750184da72b89f83",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/plain"
+    },
+    "text": "I love fuzzy blankets"
+  }
+]
@@ -0,0 +1,12 @@
+[
+  {
+    "type": "NarrativeText",
+    "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
+    "metadata": {
+      "data_source": {},
+      "filetype": "text/html",
+      "page_number": 1
+    },
+    "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
+  }
+]
@@ -0,0 +1,26 @@
+[
+  {
+    "type": "Table",
+    "element_id": "f8db6c6e535705336195aa2c1d23d414",
+    "metadata": {
+      "data_source": {},
+      "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+      "page_number": 1,
+      "page_name": "Stanley Cups",
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
+    },
+    "text": "\n  \n    \n      Team\n      Location\n      Stanley Cups\n    \n    \n      Blues\n      STL\n      1\n    \n    \n      Flyers\n      PHI\n      2\n    \n    \n      Maple Leafs\n      TOR\n      13\n    \n  \n"
+  },
+  {
+    "type": "Table",
+    "element_id": "20f5163a43ac6eb04a40d269d3ad0663",
+    "metadata": {
+      "data_source": {},
+      "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+      "page_number": 2,
+      "page_name": "Stanley Cups Since 67",
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
+    },
+    "text": "\n  \n    \n      Team\n      Location\n      Stanley Cups\n    \n    \n      Blues\n      STL\n      1\n    \n    \n      Flyers\n      PHI\n      2\n    \n    \n      Maple Leafs\n      TOR\n      0\n    \n  \n"
+  }
+]

test_unstructured_ingest/test-ingest-gcs.sh (new executable file, 58 lines)

@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+
+set -e
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+cd "$SCRIPT_DIR"/.. || exit 1
+
+if [[ "$(find test_unstructured_ingest/expected-structured-output/gcs/ -type f -size +1 | wc -l)" -ne 6 ]]; then
+    echo "The test fixtures in test_unstructured_ingest/expected-structured-output/ look suspicious. At least one of the files is too small."
+    echo "Did you overwrite test fixtures with bad outputs?"
+    exit 1
+fi
+
+if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
+    echo "Skipping GCS ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
+    exit 0
+fi
+
+# Create a temporary file
+GCP_INGEST_SERVICE_KEY_FILE=$(mktemp)
+echo "$GCP_INGEST_SERVICE_KEY" > "$GCP_INGEST_SERVICE_KEY_FILE"
+
+
+PYTHONPATH=. ./unstructured/ingest/main.py \
+    --metadata-exclude filename,file_directory,metadata.data_source.date_processed \
+    --remote-url gs://utic-test-ingest-fixtures/ \
+    --structured-output-dir gcs-output \
+    --gcs-token "$GCP_INGEST_SERVICE_KEY_FILE" \
+    --recursive \
+    --preserve-downloads \
+    --reprocess
+
+OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
+
+set +e
+
+# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
+if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
+    EXPECTED_DIR=test_unstructured_ingest/expected-structured-output/gcs
+    [ -d "$EXPECTED_DIR" ] && rm -rf "$EXPECTED_DIR"
+    cp -R gcs-output "$EXPECTED_DIR"
+
+elif ! diff -ru test_unstructured_ingest/expected-structured-output/gcs gcs-output ; then
+    echo
+    echo "There are differences from the previously checked-in structured outputs."
+    echo
+    echo "If these differences are acceptable, overwrite the fixtures by setting the env var:"
+    echo
+    echo "  export OVERWRITE_FIXTURES=true"
+    echo
+    echo "and then rerun this script."
+    echo
+    echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
+    echo "to update fixtures for CI."
+    echo
+    exit 1
+
+fi

test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh

@@ -10,7 +10,7 @@ cd "$SCRIPT_DIR"/.. || exit 1
 PYTHONPATH=. ./unstructured/ingest/main.py \
     --metadata-exclude filename,file_directory,metadata.data_source.date_processed \
     --local-input-path files-ingest-download \
-    --local-recursive \
+    --recursive \
     --local-file-glob "*.pdf" \
     --structured-output-dir pdf-fast-reprocess-ingest-output \
     --partition-strategy fast \

test_unstructured_ingest/test-ingest.sh

@@ -20,5 +20,6 @@ export OMP_THREAD_LIMIT=1
 ./test_unstructured_ingest/test-ingest-local.sh
 ./test_unstructured_ingest/test-ingest-slack.sh
 ./test_unstructured_ingest/test-ingest-against-api.sh
+./test_unstructured_ingest/test-ingest-gcs.sh
 # NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
 ./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh

unstructured/__version__.py

@@ -1 +1 @@
-__version__ = "0.7.7"  # pragma: no cover
+__version__ = "0.7.8-dev0"  # pragma: no cover

unstructured/ingest/connector/fsspec.py

@@ -15,8 +15,11 @@ from unstructured.ingest.logger import logger
 
 SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
     "s3",
+    "s3a",
     "abfs",
     "az",
+    "gs",
+    "gcs",
 ]
 
 
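
To make the role of this list concrete, here is a toy illustration of how a --remote-url maps onto the protocol check; this is simplified string handling for exposition, not the CLI's actual resolution logic:

    SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = ["s3", "s3a", "abfs", "az", "gs", "gcs"]

    remote_url = "gs://utic-test-ingest-fixtures-public/nested-1/"
    protocol, _, path_without_protocol = remote_url.partition("://")

    assert protocol in SUPPORTED_REMOTE_FSSPEC_PROTOCOLS
    print(protocol)               # gs
    print(path_without_protocol)  # utic-test-ingest-fixtures-public/nested-1/
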
@@ -24,6 +27,7 @@ SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
 class SimpleFsspecConfig(BaseConnectorConfig):
     # fsspec specific options
     path: str
+    recursive: bool
     access_kwargs: dict = field(default_factory=dict)
     protocol: str = field(init=False)
     path_without_protocol: str = field(init=False)
@@ -81,7 +85,9 @@ class FsspecIngestDoc(BaseIngestDoc):
 
     def has_output(self):
         """Determine if structured output for this doc already exists."""
-        return self._output_filename().is_file() and os.path.getsize(self._output_filename())
+        return self._output_filename().is_file() and os.path.getsize(
+            self._output_filename(),
+        )
 
     def _create_full_tmp_dir_path(self):
         """Includes "directories" in the object path"""
@@ -114,19 +120,24 @@ class FsspecIngestDoc(BaseIngestDoc):
         output_filename = self._output_filename()
         output_filename.parent.mkdir(parents=True, exist_ok=True)
         with open(output_filename, "w") as output_f:
-            output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2))
+            output_f.write(
+                json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2),
+            )
         logger.info(f"Wrote {output_filename}")
 
     @property
     def filename(self):
-        """The filename of the file after downloading from s3"""
+        """The filename of the file after downloading from cloud"""
         return self._tmp_download_file()
 
     def cleanup_file(self):
         """Removes the local copy of the file after successful processing."""
         if not self.standard_config.preserve_downloads and not self.standard_config.download_only:
             logger.debug(f"Cleaning up {self}")
-            os.unlink(self._tmp_download_file())
+            try:
+                os.unlink(self._tmp_download_file())
+            except OSError as e:  # Don't think we need to raise an exception
+                logger.debug(f"Failed to remove {self._tmp_download_file()} due to {e}")
 
 
 class FsspecConnector(BaseConnector):
@@ -151,7 +162,7 @@ class FsspecConnector(BaseConnector):
         )
 
     def cleanup(self, cur_dir=None):
-        """cleanup lingering empty sub-dirs from s3 paths, but leave remaining files
+        """cleanup lingering empty sub-dirs from cloud paths, but leave remaining files
         (and their paths) intact as that indicates they were not processed"""
         if not self.cleanup_files:
             return
@@ -177,7 +188,26 @@ class FsspecConnector(BaseConnector):
         )
 
     def _list_files(self):
-        return self.fs.ls(self.config.path_without_protocol)
+        if not self.config.recursive:
+            # fs.ls does not walk directories
+            # directories that are listed in cloud storage can cause problems
+            # because they are seen as 0 byte files
+            return [
+                x.get("name")
+                for x in self.fs.ls(self.config.path_without_protocol, detail=True)
+                if x.get("size") > 0
+            ]
+        else:
+            # fs.find will recursively walk directories
+            # "size" is a common key for all the cloud protocols with fs
+            return [
+                k
+                for k, v in self.fs.find(
+                    self.config.path_without_protocol,
+                    detail=True,
+                ).items()
+                if v.get("size") > 0
+            ]
 
     def get_ingest_docs(self):
         return [
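
The branch above is the heart of the new recursive support. As a standalone sketch of the same listing behavior, using fsspec directly against a hypothetical bucket path:

    import fsspec

    fs = fsspec.filesystem("gcs", token="anon")  # anonymous access, for illustration

    # Non-recursive: ls() returns only the top level, and zero-byte entries
    # (directory placeholders) are filtered out, as in the connector.
    top_level = [
        entry["name"]
        for entry in fs.ls("some-bucket/some-prefix", detail=True)  # hypothetical path
        if entry["size"] > 0
    ]

    # Recursive: find() walks the whole tree; with detail=True it returns a
    # mapping of path -> info dict, and "size" is a common key across protocols.
    all_files = [
        path
        for path, info in fs.find("some-bucket/some-prefix", detail=True).items()
        if info["size"] > 0
    ]
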

unstructured/ingest/connector/gcs.py (new file, 33 lines)

@@ -0,0 +1,33 @@
+from dataclasses import dataclass
+from typing import Type
+
+from unstructured.ingest.connector.fsspec import (
+    FsspecConnector,
+    FsspecIngestDoc,
+    SimpleFsspecConfig,
+)
+from unstructured.ingest.interfaces import StandardConnectorConfig
+from unstructured.utils import requires_dependencies
+
+
+@dataclass
+class SimpleGcsConfig(SimpleFsspecConfig):
+    pass
+
+
+class GcsIngestDoc(FsspecIngestDoc):
+    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
+    def get_file(self):
+        super().get_file()
+
+
+@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
+class GcsConnector(FsspecConnector):
+    ingest_doc_cls: Type[GcsIngestDoc] = GcsIngestDoc
+
+    def __init__(
+        self,
+        config: SimpleGcsConfig,
+        standard_config: StandardConnectorConfig,
+    ) -> None:
+        super().__init__(standard_config, config)
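
This file is essentially the entire cost of adding an fsspec-backed source: subclass the three fsspec base classes and declare the packages the protocol needs. Purely as an illustration of the pattern (not part of this commit), a connector for some other fsspec protocol would look the same with names and extras swapped; the Dropbox names and the dropboxdrivefs dependency below are hypothetical:

    from dataclasses import dataclass
    from typing import Type

    from unstructured.ingest.connector.fsspec import (
        FsspecConnector,
        FsspecIngestDoc,
        SimpleFsspecConfig,
    )
    from unstructured.utils import requires_dependencies


    @dataclass
    class SimpleDropboxConfig(SimpleFsspecConfig):  # hypothetical config
        pass


    class DropboxIngestDoc(FsspecIngestDoc):  # hypothetical ingest doc
        @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
        def get_file(self):
            super().get_file()


    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    class DropboxConnector(FsspecConnector):  # hypothetical connector
        ingest_doc_cls: Type[DropboxIngestDoc] = DropboxIngestDoc
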
unstructured/ingest/main.py

@@ -161,12 +161,6 @@ class MainProcess:
     default=None,
     help="Path to the location in the local file system that will be processed.",
 )
-@click.option(
-    "--local-recursive",
-    is_flag=True,
-    default=False,
-    help="Support recursive local file processing.",
-)
 @click.option(
     "--local-file-glob",
     default=None,
@@ -177,7 +171,14 @@ class MainProcess:
     "--remote-url",
     default=None,
     help="Remote fsspec URL formatted as `protocol://dir/path`, it can contain both "
-    "a directory or a single file. Supported protocols are: `s3`, `s3a`, `abfs`, and `az`.",
+    "a directory or a single file. Supported protocols are: `gcs`, `gs`, `s3`, `s3a`, `abfs` "
+    "and `az`.",
+)
+@click.option(
+    "--gcs-token",
+    default=None,
+    help="Token used to access Google Cloud. GCSFS will attempt to use your default gcloud creds "
+    "or get creds from the google metadata service or fall back to anonymous access.",
 )
 @click.option(
     "--s3-anonymous",
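
The token semantics in this help text are gcsfs's own; a small sketch of the two most common values (the key-file path below is a placeholder):

    import gcsfs

    # token=None: gcsfs falls back through your default gcloud credentials,
    # the Google metadata service, and finally anonymous access.
    fs_default = gcsfs.GCSFileSystem(token=None)

    # A path to a service-account JSON key file also works; this is what
    # test-ingest-gcs.sh above passes via --gcs-token from its temporary key file.
    fs_keyed = gcsfs.GCSFileSystem(token="/tmp/service-account-key.json")  # placeholder path
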
@@ -211,13 +212,6 @@ class MainProcess:
     default=None,
     help="Path to the Google Drive service account json file.",
 )
-@click.option(
-    "--drive-recursive",
-    is_flag=True,
-    default=False,
-    help="Recursively download files in folders from the Google Drive ID, "
-    "otherwise stop at the files in provided folder level.",
-)
 @click.option(
     "--drive-extension",
     default=None,
@@ -412,17 +406,26 @@ class MainProcess:
     show_default=True,
     help="Number of parallel processes to process docs in.",
 )
+@click.option(
+    "--recursive",
+    is_flag=True,
+    default=False,
+    help="Recursively download files in their respective folders, "
+    "otherwise stop at the files in the provided folder level."
+    " Supported protocols are: `gcs`, `gs`, `s3`, `s3a`, `abfs`, "
+    "`az`, `google drive` and `local`.",
+)
 @click.option("-v", "--verbose", is_flag=True, default=False)
 def main(
     ctx,
     remote_url,
     s3_anonymous,
+    gcs_token,
     azure_account_name,
     azure_account_key,
     azure_connection_string,
     drive_id,
     drive_service_account_key,
-    drive_recursive,
     drive_extension,
     biomed_path,
     biomed_api_id,
@@ -457,6 +460,7 @@ def main(
     structured_output_dir,
     reprocess,
     num_processes,
+    recursive,
     verbose,
     metadata_include,
     metadata_exclude,
@@ -468,7 +472,6 @@ def main(
     partition_strategy,
     api_key,
     local_input_path,
-    local_recursive,
     local_file_glob,
     download_only,
 ):
@@ -580,9 +583,21 @@ def main(
             standard_config=standard_config,
             config=SimpleS3Config(
                 path=remote_url,
+                recursive=recursive,
                 access_kwargs={"anon": s3_anonymous},
             ),
         )
+    elif protocol in ("gs", "gcs"):
+        from unstructured.ingest.connector.gcs import GcsConnector, SimpleGcsConfig
+
+        doc_connector = GcsConnector(  # type: ignore
+            standard_config=standard_config,
+            config=SimpleGcsConfig(
+                path=remote_url,
+                recursive=recursive,
+                access_kwargs={"token": gcs_token},
+            ),
+        )
     elif protocol in ("abfs", "az"):
         from unstructured.ingest.connector.azure import (
             AzureBlobStorageConnector,
@@ -602,14 +617,15 @@ def main(
             standard_config=standard_config,
             config=SimpleAzureBlobStorageConfig(
                 path=remote_url,
+                recursive=recursive,
                 access_kwargs=access_kwargs,
             ),
         )
     else:
         warnings.warn(
             f"`fsspec` protocol {protocol} is not directly supported by `unstructured`,"
-            " so use it at your own risk. Supported protocols are `s3`, `s3a`, `abfs`,"
-            " and `az`.",
+            " so use it at your own risk. Supported protocols are `gcs`, `gs`, `s3`, `s3a`,"
+            " `abfs` and `az`.",
             UserWarning,
         )
 
@@ -622,6 +638,7 @@ def main(
             standard_config=standard_config,
             config=SimpleFsspecConfig(
                 path=remote_url,
+                recursive=recursive,
             ),
         )
     elif github_url:
@@ -721,7 +738,7 @@ def main(
             config=SimpleGoogleDriveConfig(
                 drive_id=drive_id,
                 service_account_key=drive_service_account_key,
-                recursive=drive_recursive,
+                recursive=recursive,
                 extension=drive_extension,
             ),
         )
@@ -753,7 +770,7 @@ def main(
             standard_config=standard_config,
             config=SimpleLocalConfig(
                 input_path=local_input_path,
-                recursive=local_recursive,
+                recursive=recursive,
                 file_glob=local_file_glob,
             ),
         )

unstructured/utils.py

@@ -35,7 +35,7 @@ def requires_dependencies(
         raise ImportError(
             f"Following dependencies are missing: {', '.join(missing_deps)}. "
             + (
-                f"Please install them using `pip install unstructured[{extras}]`."
+                f"""Please install them using `pip install "unstructured[{extras}]"`."""
                 if extras
                 else f"Please install them using `pip install {' '.join(missing_deps)}`."
             ),
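
The quoting fix matters because square brackets are glob characters in some shells (zsh in particular), so an unquoted pip install unstructured[gcs] can fail while the quoted form works everywhere. For context, a minimal function-only sketch of a decorator with this behavior; it is not the library's actual implementation, which also supports decorating classes, as the GCS connector above does:

    import functools
    import importlib.util
    from typing import Callable, List, Optional


    def requires_dependencies(dependencies: List[str], extras: Optional[str] = None) -> Callable:
        def decorator(func: Callable) -> Callable:
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                # A dependency is "missing" if its import spec cannot be found.
                missing = [d for d in dependencies if importlib.util.find_spec(d) is None]
                if missing:
                    hint = (
                        f'Please install them using `pip install "unstructured[{extras}]"`.'
                        if extras
                        else f"Please install them using `pip install {' '.join(missing)}`."
                    )
                    raise ImportError(
                        f"Following dependencies are missing: {', '.join(missing)}. " + hint
                    )
                return func(*args, **kwargs)

            return wrapper

        return decorator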