feat: add google cloud storage connector (#746)

David Potter 2023-06-21 15:14:50 -07:00 committed by GitHub
parent 21c346dab8
commit 3b472cb7df
34 changed files with 514 additions and 54 deletions


@@ -195,6 +195,7 @@ jobs:
 make install-ingest-s3
 make install-ingest-azure
 make install-ingest-discord
+make install-ingest-gcs
 make install-ingest-google-drive
 make install-ingest-github
 make install-ingest-gitlab


@@ -73,6 +73,7 @@ jobs:
 make install-ingest-s3
 make install-ingest-azure
 make install-ingest-discord
+make install-ingest-gcs
 make install-ingest-google-drive
 make install-ingest-github
 make install-ingest-gitlab


@@ -1,3 +1,16 @@
+## 0.7.8-dev0
+
+### Enhancements
+
+* Adds recursive functionality to all fsspec connectors
+* Adds generic --recursive ingest flag
+
+### Features
+
+* Adds Google Cloud Storage connector
+
+### Fixes
+
 ## 0.7.7
 
 ### Enhancements


@@ -2,6 +2,7 @@ include requirements/base.in
 include requirements/huggingface.in
 include requirements/local-inference.in
 include requirements/ingest-s3.in
+include requirements/ingest-gcs.in
 include requirements/ingest-azure.in
 include requirements/ingest-discord.in
 include requirements/ingest-github.in


@@ -62,6 +62,10 @@ install-ingest-google-drive:
 install-ingest-s3:
     python3 -m pip install -r requirements/ingest-s3.txt
 
+.PHONY: install-ingest-gcs
+install-ingest-gcs:
+    python3 -m pip install -r requirements/ingest-gcs.txt
+
 .PHONY: install-ingest-azure
 install-ingest-azure:
     python3 -m pip install -r requirements/ingest-azure.txt
@@ -117,6 +121,7 @@ pip-compile:
     # sphinx docs looks for additional requirements
     cp requirements/build.txt docs/requirements.txt
     pip-compile --upgrade requirements/ingest-s3.in
+    pip-compile --upgrade requirements/ingest-gcs.in
     pip-compile --upgrade requirements/ingest-azure.in
     pip-compile --upgrade requirements/ingest-discord.in
     pip-compile --upgrade requirements/ingest-reddit.in


@@ -26,7 +26,7 @@ idna==3.4
     # via requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.6.0
+importlib-metadata==6.7.0
     # via sphinx
 jinja2==3.1.2
     # via sphinx

examples/ingest/azure/ingest.sh Normal file → Executable file

@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 # Processes all the files from abfs://container1/ in azureunstructured1 account,
 # using the `unstructured` library.
 # Structured outputs are stored in azure-ingest-output/


@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Processes several files in a nested folder structure from gs://utic-test-ingest-fixtures-public/
# through Unstructured's library in 2 processes.
# Structured outputs are stored in gcs-output/
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/../../.. || exit 1
PYTHONPATH=. ./unstructured/ingest/main.py \
--remote-url gs://utic-test-ingest-fixtures-public/ \
--structured-output-dir gcs-output \
--num-processes 2 \
--recursive \
--verbose
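
For readers who prefer to stay in Python, the same public fixtures can be mirrored and partitioned without going through the CLI. This is a sketch, not part of the commit; it assumes `unstructured` is installed with the `gcs` extra, and the local directory name is arbitrary:

    import gcsfs
    from pathlib import Path
    from unstructured.partition.auto import partition

    # The fixtures bucket is public, so gcsfs's anonymous token suffices.
    fs = gcsfs.GCSFileSystem(token="anon")

    # Mirror the bucket locally, descending into nested "folders" the way
    # the --recursive flag does.
    fs.get("utic-test-ingest-fixtures-public/", "gcs-download/", recursive=True)

    # Partition each downloaded file into structured elements.
    for path in Path("gcs-download").rglob("*"):
        if path.is_file():
            elements = partition(filename=str(path))
            print(path, "->", len(elements), "elements")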


@@ -28,7 +28,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
     --drive-service-account-key "<path to drive service account key>" \
     --structured-output-dir google-drive-ingest-output \
     --num-processes 2 \
-    --drive-recursive \
+    --recursive \
     --verbose \
 # --extension ".docx" # Ensures only .docx files are processed.

examples/ingest/local/ingest.sh Normal file → Executable file

@@ -20,7 +20,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
     --local-input-path example-docs \
     --structured-output-dir local-ingest-output \
     --num-processes 2 \
-    --local-recursive \
+    --recursive \
     --verbose \
 # Alternatively, you can call it using:


@@ -6,7 +6,7 @@
 #
 anyio==3.7.0
     # via httpcore
-argilla==1.9.0
+argilla==1.10.0
     # via -r requirements/base.in
 backoff==2.2.1
     # via argilla
@@ -51,7 +51,7 @@ idna==3.4
     #   anyio
     #   requests
     #   rfc3986
-importlib-metadata==6.6.0
+importlib-metadata==6.7.0
     # via markdown
 joblib==1.2.0
     # via nltk


@@ -26,7 +26,7 @@ idna==3.4
     # via requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.6.0
+importlib-metadata==6.7.0
     # via sphinx
 jinja2==3.1.2
     # via sphinx


@@ -75,7 +75,7 @@ idna==3.4
     #   -c requirements/test.txt
     #   anyio
     #   jsonschema
-importlib-metadata==6.6.0
+importlib-metadata==6.7.0
     # via
     #   -c requirements/base.txt
     #   jupyter-client
@@ -113,7 +113,7 @@ jinja2==3.1.2
     #   nbclassic
     #   nbconvert
     #   notebook
-jsonpointer==2.3
+jsonpointer==2.4
     # via jsonschema
 jsonschema[format-nongpl]==4.17.3
     # via
@@ -132,7 +132,7 @@ jupyter-client==8.2.0
     #   qtconsole
 jupyter-console==6.6.3
     # via jupyter
-jupyter-core==5.3.0
+jupyter-core==5.3.1
     # via
     #   -c requirements/constraints.in
     #   ipykernel
@@ -165,13 +165,13 @@ matplotlib-inline==0.1.6
     # via
     #   ipykernel
     #   ipython
-mistune==2.0.5
+mistune==3.0.1
     # via nbconvert
 nbclassic==1.0.0
     # via notebook
 nbclient==0.8.0
     # via nbconvert
-nbconvert==7.5.0
+nbconvert==7.6.0
     # via
     #   jupyter
     #   jupyter-server
@@ -219,7 +219,7 @@ pip-tools==6.13.0
     # via -r requirements/dev.in
 pkgutil-resolve-name==1.3.10
     # via jsonschema
-platformdirs==3.5.3
+platformdirs==3.6.0
     # via
     #   -c requirements/test.txt
     #   jupyter-core
@@ -359,7 +359,7 @@ typing-extensions==4.6.3
     #   ipython
 uri-template==1.2.0
     # via jsonschema
-virtualenv==20.23.0
+virtualenv==20.23.1
     # via pre-commit
 wcwidth==0.2.6
     # via prompt-toolkit
@@ -369,7 +369,7 @@ webencodings==0.5.1
     # via
     #   bleach
     #   tinycss2
-websocket-client==1.5.3
+websocket-client==1.6.0
     # via jupyter-server
 wheel==0.40.0
     # via


@@ -0,0 +1,4 @@
-c constraints.in
-c base.txt
gcsfs
fsspec

requirements/ingest-gcs.txt Normal file

@@ -0,0 +1,105 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-gcs.in
#
aiohttp==3.8.4
# via gcsfs
aiosignal==1.3.1
# via aiohttp
async-timeout==4.0.2
# via aiohttp
attrs==23.1.0
# via aiohttp
cachetools==5.3.1
# via google-auth
certifi==2023.5.7
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.1.0
# via
# -c requirements/base.txt
# aiohttp
# requests
decorator==5.1.1
# via gcsfs
frozenlist==1.3.3
# via
# aiohttp
# aiosignal
fsspec==2023.6.0
# via
# -r requirements/ingest-gcs.in
# gcsfs
gcsfs==2023.6.0
# via -r requirements/ingest-gcs.in
google-api-core==2.11.1
# via
# google-cloud-core
# google-cloud-storage
google-auth==2.20.0
# via
# gcsfs
# google-api-core
# google-auth-oauthlib
# google-cloud-core
# google-cloud-storage
google-auth-oauthlib==1.0.0
# via gcsfs
google-cloud-core==2.3.2
# via google-cloud-storage
google-cloud-storage==2.9.0
# via gcsfs
google-crc32c==1.5.0
# via google-resumable-media
google-resumable-media==2.5.0
# via google-cloud-storage
googleapis-common-protos==1.59.1
# via google-api-core
idna==3.4
# via
# -c requirements/base.txt
# requests
# yarl
multidict==6.0.4
# via
# aiohttp
# yarl
oauthlib==3.2.2
# via requests-oauthlib
protobuf==3.20.3
# via
# -c requirements/constraints.in
# google-api-core
pyasn1==0.5.0
# via
# pyasn1-modules
# rsa
pyasn1-modules==0.3.0
# via google-auth
requests==2.31.0
# via
# -c requirements/base.txt
# gcsfs
# google-api-core
# google-cloud-storage
# requests-oauthlib
requests-oauthlib==1.3.1
# via google-auth-oauthlib
rsa==4.9
# via google-auth
six==1.16.0
# via
# -c requirements/base.txt
# google-auth
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# google-auth
# requests
yarl==1.9.2
# via aiohttp


@@ -15,9 +15,9 @@ charset-normalizer==3.1.0
     # via
     #   -c requirements/base.txt
     #   requests
-google-api-core==2.11.0
+google-api-core==2.11.1
     # via google-api-python-client
-google-api-python-client==2.89.0
+google-api-python-client==2.90.0
     # via -r requirements/ingest-google-drive.in
 google-auth==2.20.0
     # via
@@ -47,7 +47,7 @@ pyasn1==0.5.0
     #   rsa
 pyasn1-modules==0.3.0
     # via google-auth
-pyparsing==3.0.9
+pyparsing==3.1.0
     # via httplib2
 requests==2.31.0
     # via


@@ -33,5 +33,5 @@ urllib3==1.26.16
     #   -c requirements/base.txt
     #   -c requirements/constraints.in
     #   requests
-websocket-client==1.5.3
+websocket-client==1.6.0
     # via praw


@@ -87,7 +87,7 @@ numpy==1.23.5
     #   transformers
 omegaconf==2.3.0
     # via effdet
-onnxruntime==1.15.0
+onnxruntime==1.15.1
     # via unstructured-inference
 opencv-python==4.7.0.72
     # via
@@ -136,7 +136,7 @@ pycparser==2.21
     # via
     #   -c requirements/base.txt
     #   cffi
-pyparsing==3.0.9
+pyparsing==3.1.0
     # via matplotlib
 pytesseract==0.3.10
     # via layoutparser


@@ -56,7 +56,7 @@ mccabe==0.7.0
     # via flake8
 multidict==6.0.4
     # via yarl
-mypy==1.3.0
+mypy==1.4.0
     # via -r requirements/test.in
 mypy-extensions==1.0.0
     # via
@@ -69,7 +69,7 @@ packaging==23.1
     #   pytest
 pathspec==0.11.1
     # via black
-platformdirs==3.5.3
+platformdirs==3.6.0
     # via black
 pluggy==1.0.0
     # via pytest
@@ -87,7 +87,7 @@ pytest==7.3.2
     #   pytest-mock
 pytest-cov==4.1.0
     # via -r requirements/test.in
-pytest-mock==3.10.0
+pytest-mock==3.11.1
     # via -r requirements/test.in
 python-dateutil==2.8.2
     # via
@@ -99,7 +99,7 @@ requests==2.31.0
     # via
     #   -c requirements/base.txt
     #   label-studio-sdk
-ruff==0.0.272
+ruff==0.0.273
     # via -r requirements/test.in
 six==1.16.0
     # via


@@ -81,6 +81,7 @@ setup(
         "slack": load_requirements("requirements/ingest-slack.in"),
         "wikipedia": load_requirements("requirements/ingest-wikipedia.in"),
         "google-drive": load_requirements("requirements/ingest-google-drive.in"),
+        "gcs": load_requirements("requirements/ingest-gcs.in"),
     },
     package_dir={"unstructured": "unstructured"},
     package_data={"unstructured": ["nlp/*.txt"]},
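
With the extra registered here, the connector's dependencies can also be installed straight from pip via `pip install "unstructured[gcs]"`; that is the exact command the ImportError raised by `requires_dependencies` (see the utils.py hunk at the end of this diff) points users to, while `make install-ingest-gcs` is the equivalent for a source checkout.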


@@ -0,0 +1,12 @@
[
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]


@@ -0,0 +1,56 @@
[
{
"type": "NarrativeText",
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "This is a test document to use for unit tests."
},
{
"type": "Address",
"element_id": "a9d4657034aa3fdb5177f1325e912362",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Doylestown, PA 18901"
},
{
"type": "Title",
"element_id": "9c218520320f238595f1fde74bdd137d",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Important points:"
},
{
"type": "ListItem",
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Hamburgers are delicious"
},
{
"type": "ListItem",
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Dogs are the best"
},
{
"type": "ListItem",
"element_id": "0b61e826b1c4ab05750184da72b89f83",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "I love fuzzy blankets"
}
]


@@ -0,0 +1,12 @@
[
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]


@@ -0,0 +1,56 @@
[
{
"type": "NarrativeText",
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "This is a test document to use for unit tests."
},
{
"type": "Address",
"element_id": "a9d4657034aa3fdb5177f1325e912362",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Doylestown, PA 18901"
},
{
"type": "Title",
"element_id": "9c218520320f238595f1fde74bdd137d",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Important points:"
},
{
"type": "ListItem",
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Hamburgers are delicious"
},
{
"type": "ListItem",
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Dogs are the best"
},
{
"type": "ListItem",
"element_id": "0b61e826b1c4ab05750184da72b89f83",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "I love fuzzy blankets"
}
]


@@ -0,0 +1,12 @@
[
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]


@@ -0,0 +1,26 @@
[
{
"type": "Table",
"element_id": "f8db6c6e535705336195aa2c1d23d414",
"metadata": {
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n \n \n Team\n Location\n Stanley Cups\n \n \n Blues\n STL\n 1\n \n \n Flyers\n PHI\n 2\n \n \n Maple Leafs\n TOR\n 13\n \n \n"
},
{
"type": "Table",
"element_id": "20f5163a43ac6eb04a40d269d3ad0663",
"metadata": {
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n \n \n Team\n Location\n Stanley Cups\n \n \n Blues\n STL\n 1\n \n \n Flyers\n PHI\n 2\n \n \n Maple Leafs\n TOR\n 0\n \n \n"
}
]


@@ -0,0 +1,58 @@
#!/usr/bin/env bash
set -e
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/.. || exit 1
if [[ "$(find test_unstructured_ingest/expected-structured-output/gcs/ -type f -size +1 | wc -l)" -ne 6 ]]; then
echo "The test fixtures in test_unstructured_ingest/expected-structured-output/ look suspicious. At least one of the files is too small."
echo "Did you overwrite test fixtures with bad outputs?"
exit 1
fi
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
exit 0
fi
# Create a temporary file
GCP_INGEST_SERVICE_KEY_FILE=$(mktemp)
echo "$GCP_INGEST_SERVICE_KEY" > "$GCP_INGEST_SERVICE_KEY_FILE"
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
--remote-url gs://utic-test-ingest-fixtures/ \
--structured-output-dir gcs-output \
--gcs-token "$GCP_INGEST_SERVICE_KEY_FILE" \
--recursive \
--preserve-downloads \
--reprocess
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
set +e
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
EXPECTED_DIR=test_unstructured_ingest/expected-structured-output/gcs
[ -d "$EXPECTED_DIR" ] && rm -rf "$EXPECTED_DIR"
cp -R gcs-output $EXPECTED_DIR
elif ! diff -ru test_unstructured_ingest/expected-structured-output/gcs gcs-output ; then
echo
echo "There are differences from the previously checked-in structured outputs."
echo
echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
echo
echo " export OVERWRITE_FIXTURES=true"
echo
echo "and then rerun this script."
echo
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
echo "to update fixtures for CI,"
echo
exit 1
fi


@@ -10,7 +10,7 @@ cd "$SCRIPT_DIR"/.. || exit 1
 PYTHONPATH=. ./unstructured/ingest/main.py \
     --metadata-exclude filename,file_directory,metadata.data_source.date_processed \
     --local-input-path files-ingest-download \
-    --local-recursive \
+    --recursive \
     --local-file-glob "*.pdf" \
     --structured-output-dir pdf-fast-reprocess-ingest-output \
     --partition-strategy fast \


@@ -20,5 +20,6 @@ export OMP_THREAD_LIMIT=1
 ./test_unstructured_ingest/test-ingest-local.sh
 ./test_unstructured_ingest/test-ingest-slack.sh
 ./test_unstructured_ingest/test-ingest-against-api.sh
+./test_unstructured_ingest/test-ingest-gcs.sh
 # NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
 ./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh


@@ -1 +1 @@
-__version__ = "0.7.7"  # pragma: no cover
+__version__ = "0.7.8-dev0"  # pragma: no cover


@@ -15,8 +15,11 @@ from unstructured.ingest.logger import logger
 SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
     "s3",
+    "s3a",
     "abfs",
     "az",
+    "gs",
+    "gcs",
 ]
@@ -24,6 +27,7 @@ SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
 class SimpleFsspecConfig(BaseConnectorConfig):
     # fsspec specific options
     path: str
+    recursive: bool
     access_kwargs: dict = field(default_factory=dict)
     protocol: str = field(init=False)
     path_without_protocol: str = field(init=False)
@@ -81,7 +85,9 @@ class FsspecIngestDoc(BaseIngestDoc):
     def has_output(self):
         """Determine if structured output for this doc already exists."""
-        return self._output_filename().is_file() and os.path.getsize(self._output_filename())
+        return self._output_filename().is_file() and os.path.getsize(
+            self._output_filename(),
+        )
 
     def _create_full_tmp_dir_path(self):
         """Includes "directories" in the object path"""
@@ -114,19 +120,24 @@ class FsspecIngestDoc(BaseIngestDoc):
         output_filename = self._output_filename()
         output_filename.parent.mkdir(parents=True, exist_ok=True)
         with open(output_filename, "w") as output_f:
-            output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2))
+            output_f.write(
+                json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2),
+            )
         logger.info(f"Wrote {output_filename}")
 
     @property
     def filename(self):
-        """The filename of the file after downloading from s3"""
+        """The filename of the file after downloading from cloud"""
         return self._tmp_download_file()
 
     def cleanup_file(self):
         """Removes the local copy of the file after successful processing."""
         if not self.standard_config.preserve_downloads and not self.standard_config.download_only:
             logger.debug(f"Cleaning up {self}")
-            os.unlink(self._tmp_download_file())
+            try:
+                os.unlink(self._tmp_download_file())
+            except OSError as e:  # Don't think we need to raise an exception
+                logger.debug(f"Failed to remove {self._tmp_download_file()} due to {e}")
 
 class FsspecConnector(BaseConnector):
@@ -151,7 +162,7 @@ class FsspecConnector(BaseConnector):
     )
 
     def cleanup(self, cur_dir=None):
-        """cleanup lingering empty sub-dirs from s3 paths, but leave remaining files
+        """cleanup lingering empty sub-dirs from cloud paths, but leave remaining files
         (and their paths) intact as that indicates they were not processed"""
         if not self.cleanup_files:
             return
@@ -177,7 +188,26 @@ class FsspecConnector(BaseConnector):
     )
 
     def _list_files(self):
-        return self.fs.ls(self.config.path_without_protocol)
+        if not self.config.recursive:
+            # fs.ls does not walk directories
+            # directories that are listed in cloud storage can cause problems
+            # because they are seen as 0 byte files
+            return [
+                x.get("name")
+                for x in self.fs.ls(self.config.path_without_protocol, detail=True)
+                if x.get("size") > 0
+            ]
+        else:
+            # fs.find will recursively walk directories
+            # "size" is a common key for all the cloud protocols with fs
+            return [
+                k
+                for k, v in self.fs.find(
+                    self.config.path_without_protocol,
+                    detail=True,
+                ).items()
+                if v.get("size") > 0
+            ]
 
     def get_ingest_docs(self):
         return [
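
The ls/find split above is easy to check against any fsspec backend, since both methods share their semantics across implementations. A small sketch using the local filesystem (the path is a placeholder):

    import fsspec

    fs = fsspec.filesystem("file")  # same API shape as gcsfs, s3fs, adlfs

    # Shallow listing: one info dict per entry. Cloud backends report
    # pseudo-directories as 0-byte entries, hence the size filter above.
    for entry in fs.ls("/tmp", detail=True):
        print(entry["name"], entry["type"], entry["size"])

    # Recursive listing: a mapping of path -> info dict for every file
    # under the prefix, which is what --recursive switches on.
    for path, info in fs.find("/tmp", detail=True).items():
        print(path, info["size"])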


@@ -0,0 +1,33 @@
from dataclasses import dataclass
from typing import Type

from unstructured.ingest.connector.fsspec import (
    FsspecConnector,
    FsspecIngestDoc,
    SimpleFsspecConfig,
)
from unstructured.ingest.interfaces import StandardConnectorConfig
from unstructured.utils import requires_dependencies


@dataclass
class SimpleGcsConfig(SimpleFsspecConfig):
    pass


class GcsIngestDoc(FsspecIngestDoc):
    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    def get_file(self):
        super().get_file()


@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
class GcsConnector(FsspecConnector):
    ingest_doc_cls: Type[GcsIngestDoc] = GcsIngestDoc

    def __init__(
        self,
        config: SimpleGcsConfig,
        standard_config: StandardConnectorConfig,
    ) -> None:
        super().__init__(standard_config, config)
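
For orientation, this is roughly how the CLI wires the class up (mirroring the `main.py` hunk further down; the construction of `standard_config` is elided in this diff, so treat it as given):

    from unstructured.ingest.connector.gcs import GcsConnector, SimpleGcsConfig

    connector = GcsConnector(
        standard_config=standard_config,  # a StandardConnectorConfig built by the CLI
        config=SimpleGcsConfig(
            path="gs://utic-test-ingest-fixtures-public/",
            recursive=True,
            access_kwargs={"token": None},  # None lets gcsfs fall back to default creds
        ),
    )
    docs = connector.get_ingest_docs()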


@@ -161,12 +161,6 @@ class MainProcess:
     default=None,
     help="Path to the location in the local file system that will be processed.",
 )
-@click.option(
-    "--local-recursive",
-    is_flag=True,
-    default=False,
-    help="Support recursive local file processing.",
-)
 @click.option(
     "--local-file-glob",
     default=None,
@@ -177,7 +171,14 @@ class MainProcess:
     "--remote-url",
     default=None,
     help="Remote fsspec URL formatted as `protocol://dir/path`, it can contain both "
-    "a directory or a single file. Supported protocols are: `s3`, `s3a`, `abfs`, and `az`.",
+    "a directory or a single file. Supported protocols are: `gcs`, `gs`, `s3`, `s3a`, `abfs` "
+    "and `az`.",
 )
+@click.option(
+    "--gcs-token",
+    default=None,
+    help="Token used to access Google Cloud. GCSFS will attempt to use your default gcloud creds, "
+    "or get creds from the google metadata service, or fall back to anonymous access.",
+)
 @click.option(
     "--s3-anonymous",
@@ -211,13 +212,6 @@ class MainProcess:
     default=None,
     help="Path to the Google Drive service account json file.",
 )
-@click.option(
-    "--drive-recursive",
-    is_flag=True,
-    default=False,
-    help="Recursively download files in folders from the Google Drive ID, "
-    "otherwise stop at the files in provided folder level.",
-)
 @click.option(
     "--drive-extension",
     default=None,
@@ -412,17 +406,26 @@ class MainProcess:
     show_default=True,
     help="Number of parallel processes to process docs in.",
 )
+@click.option(
+    "--recursive",
+    is_flag=True,
+    default=False,
+    help="Recursively download files in their respective folders, "
+    "otherwise stop at the files in the provided folder level. "
+    "Supported protocols are: `gcs`, `gs`, `s3`, `s3a`, `abfs`, "
+    "`az`, `google drive` and `local`.",
+)
 @click.option("-v", "--verbose", is_flag=True, default=False)
 def main(
     ctx,
     remote_url,
     s3_anonymous,
+    gcs_token,
     azure_account_name,
     azure_account_key,
     azure_connection_string,
     drive_id,
     drive_service_account_key,
-    drive_recursive,
     drive_extension,
     biomed_path,
     biomed_api_id,
@@ -457,6 +460,7 @@ def main(
     structured_output_dir,
     reprocess,
     num_processes,
+    recursive,
     verbose,
     metadata_include,
     metadata_exclude,
@@ -468,7 +472,6 @@ def main(
     partition_strategy,
     api_key,
     local_input_path,
-    local_recursive,
     local_file_glob,
     download_only,
 ):
@@ -580,9 +583,21 @@ def main(
             standard_config=standard_config,
             config=SimpleS3Config(
                 path=remote_url,
+                recursive=recursive,
                 access_kwargs={"anon": s3_anonymous},
             ),
         )
+    elif protocol in ("gs", "gcs"):
+        from unstructured.ingest.connector.gcs import GcsConnector, SimpleGcsConfig
+
+        doc_connector = GcsConnector(  # type: ignore
+            standard_config=standard_config,
+            config=SimpleGcsConfig(
+                path=remote_url,
+                recursive=recursive,
+                access_kwargs={"token": gcs_token},
+            ),
+        )
     elif protocol in ("abfs", "az"):
         from unstructured.ingest.connector.azure import (
             AzureBlobStorageConnector,
@@ -602,14 +617,15 @@ def main(
             standard_config=standard_config,
             config=SimpleAzureBlobStorageConfig(
                 path=remote_url,
+                recursive=recursive,
                 access_kwargs=access_kwargs,
             ),
         )
     else:
         warnings.warn(
             f"`fsspec` protocol {protocol} is not directly supported by `unstructured`,"
-            " so use it at your own risk. Supported protocols are `s3`, `s3a`, `abfs`,"
-            " and `az`.",
+            " so use it at your own risk. Supported protocols are `gcs`, `gs`, `s3`, `s3a`,"
+            " `abfs` and `az`.",
             UserWarning,
         )
@@ -622,6 +638,7 @@ def main(
             standard_config=standard_config,
             config=SimpleFsspecConfig(
                 path=remote_url,
+                recursive=recursive,
             ),
         )
     elif github_url:
@@ -721,7 +738,7 @@ def main(
         config=SimpleGoogleDriveConfig(
             drive_id=drive_id,
             service_account_key=drive_service_account_key,
-            recursive=drive_recursive,
+            recursive=recursive,
             extension=drive_extension,
         ),
     )
@@ -753,7 +770,7 @@ def main(
         standard_config=standard_config,
         config=SimpleLocalConfig(
             input_path=local_input_path,
-            recursive=local_recursive,
+            recursive=recursive,
             file_glob=local_file_glob,
         ),
     )


@@ -35,7 +35,7 @@ def requires_dependencies(
         raise ImportError(
             f"Following dependencies are missing: {', '.join(missing_deps)}. "
             + (
-                f"Please install them using `pip install unstructured[{extras}]`."
+                f"""Please install them using `pip install "unstructured[{extras}]"`."""
                 if extras
                 else f"Please install them using `pip install {' '.join(missing_deps)}`."
             ),
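
The decorator's behavior is easiest to see on a throwaway function (a sketch; `list_bucket` is made up, and the dependency check fires when the wrapped callable is invoked, matching how gcs.py uses it above):

    from unstructured.utils import requires_dependencies

    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    def list_bucket(path: str):
        import gcsfs  # safe here: the decorator verified the dependency first

        return gcsfs.GCSFileSystem().ls(path)

    # Without gcsfs installed, calling list_bucket(...) raises roughly:
    #   ImportError: Following dependencies are missing: gcsfs.
    #   Please install them using `pip install "unstructured[gcs]"`.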