feat: add google cloud storage connector (#746)

David Potter 2023-06-21 15:14:50 -07:00 committed by GitHub
parent 21c346dab8
commit 3b472cb7df
34 changed files with 514 additions and 54 deletions


@ -195,6 +195,7 @@ jobs:
make install-ingest-s3
make install-ingest-azure
make install-ingest-discord
make install-ingest-gcs
make install-ingest-google-drive
make install-ingest-github
make install-ingest-gitlab


@ -73,6 +73,7 @@ jobs:
make install-ingest-s3
make install-ingest-azure
make install-ingest-discord
make install-ingest-gcs
make install-ingest-google-drive
make install-ingest-github
make install-ingest-gitlab

CHANGELOG.md

@ -1,3 +1,16 @@
## 0.7.8-dev0
### Enhancements
* Adds recursive functionality to all fsspec connectors
* Adds generic --recursive ingest flag
### Features
* Adds Google Cloud Storage connector
### Fixes
## 0.7.7
### Enhancements

MANIFEST.in

@ -2,6 +2,7 @@ include requirements/base.in
include requirements/huggingface.in
include requirements/local-inference.in
include requirements/ingest-s3.in
include requirements/ingest-gcs.in
include requirements/ingest-azure.in
include requirements/ingest-discord.in
include requirements/ingest-github.in

Makefile

@ -62,6 +62,10 @@ install-ingest-google-drive:
install-ingest-s3:
python3 -m pip install -r requirements/ingest-s3.txt
.PHONY: install-ingest-gcs
install-ingest-gcs:
python3 -m pip install -r requirements/ingest-gcs.txt
.PHONY: install-ingest-azure
install-ingest-azure:
python3 -m pip install -r requirements/ingest-azure.txt
@ -117,6 +121,7 @@ pip-compile:
# sphinx docs looks for additional requirements
cp requirements/build.txt docs/requirements.txt
pip-compile --upgrade requirements/ingest-s3.in
pip-compile --upgrade requirements/ingest-gcs.in
pip-compile --upgrade requirements/ingest-azure.in
pip-compile --upgrade requirements/ingest-discord.in
pip-compile --upgrade requirements/ingest-reddit.in
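
For development checkouts, the new Makefile target mirrors the existing per-connector install targets; a usage sketch:

# Install the pinned GCS ingest dependencies into the active environment.
make install-ingest-gcs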

docs/requirements.txt

@ -26,7 +26,7 @@ idna==3.4
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.6.0
importlib-metadata==6.7.0
# via sphinx
jinja2==3.1.2
# via sphinx

examples/ingest/azure/ingest.sh (Normal file → Executable file)

@ -1,7 +1,7 @@
#!/usr/bin/env bash
# Processes all the files from abfs://container1/ in azureunstructured1 account,
# using the `unstructured` library.
# using the `unstructured` library.
# Structured outputs are stored in azure-ingest-output/

examples/ingest/gcs/ingest.sh (new file)

@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Processes several files in a nested folder structure from gs://utic-test-ingest-fixtures-public/
# through Unstructured's library in 2 processes.
# Structured outputs are stored in gcs-output/
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/../../.. || exit 1
PYTHONPATH=. ./unstructured/ingest/main.py \
--remote-url gs://utic-test-ingest-fixtures-public/ \
--structured-output-dir gcs-output \
--num-processes 2 \
--recursive \
--verbose
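
Since the new example targets a public bucket, it can be run without any Google credentials; a usage sketch from the repo root (the script path is inferred from the sibling azure and local examples):

# Run the public-bucket GCS example; no credentials required.
bash examples/ingest/gcs/ingest.sh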

examples/ingest/google-drive/ingest.sh

@ -28,7 +28,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--drive-service-account-key "<path to drive service account key>" \
--structured-output-dir google-drive-ingest-output \
--num-processes 2 \
--drive-recursive \
--recursive \
--verbose \
# --extension ".docx" # Ensures only .docx files are processed.

examples/ingest/local/ingest.sh (Normal file → Executable file)

@ -20,7 +20,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--local-input-path example-docs \
--structured-output-dir local-ingest-output \
--num-processes 2 \
--local-recursive \
--recursive \
--verbose \
# Alternatively, you can call it using:

requirements/base.txt

@ -6,7 +6,7 @@
#
anyio==3.7.0
# via httpcore
argilla==1.9.0
argilla==1.10.0
# via -r requirements/base.in
backoff==2.2.1
# via argilla
@ -51,7 +51,7 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.6.0
importlib-metadata==6.7.0
# via markdown
joblib==1.2.0
# via nltk

requirements/build.txt

@ -26,7 +26,7 @@ idna==3.4
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.6.0
importlib-metadata==6.7.0
# via sphinx
jinja2==3.1.2
# via sphinx

requirements/dev.txt

@ -75,7 +75,7 @@ idna==3.4
# -c requirements/test.txt
# anyio
# jsonschema
importlib-metadata==6.6.0
importlib-metadata==6.7.0
# via
# -c requirements/base.txt
# jupyter-client
@ -113,7 +113,7 @@ jinja2==3.1.2
# nbclassic
# nbconvert
# notebook
jsonpointer==2.3
jsonpointer==2.4
# via jsonschema
jsonschema[format-nongpl]==4.17.3
# via
@ -132,7 +132,7 @@ jupyter-client==8.2.0
# qtconsole
jupyter-console==6.6.3
# via jupyter
jupyter-core==5.3.0
jupyter-core==5.3.1
# via
# -c requirements/constraints.in
# ipykernel
@ -165,13 +165,13 @@ matplotlib-inline==0.1.6
# via
# ipykernel
# ipython
mistune==2.0.5
mistune==3.0.1
# via nbconvert
nbclassic==1.0.0
# via notebook
nbclient==0.8.0
# via nbconvert
nbconvert==7.5.0
nbconvert==7.6.0
# via
# jupyter
# jupyter-server
@ -219,7 +219,7 @@ pip-tools==6.13.0
# via -r requirements/dev.in
pkgutil-resolve-name==1.3.10
# via jsonschema
platformdirs==3.5.3
platformdirs==3.6.0
# via
# -c requirements/test.txt
# jupyter-core
@ -359,7 +359,7 @@ typing-extensions==4.6.3
# ipython
uri-template==1.2.0
# via jsonschema
virtualenv==20.23.0
virtualenv==20.23.1
# via pre-commit
wcwidth==0.2.6
# via prompt-toolkit
@ -369,7 +369,7 @@ webencodings==0.5.1
# via
# bleach
# tinycss2
websocket-client==1.5.3
websocket-client==1.6.0
# via jupyter-server
wheel==0.40.0
# via

requirements/ingest-gcs.in (new file)

@ -0,0 +1,4 @@
-c constraints.in
-c base.txt
gcsfs
fsspec

requirements/ingest-gcs.txt (new file)

@ -0,0 +1,105 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-gcs.in
#
aiohttp==3.8.4
# via gcsfs
aiosignal==1.3.1
# via aiohttp
async-timeout==4.0.2
# via aiohttp
attrs==23.1.0
# via aiohttp
cachetools==5.3.1
# via google-auth
certifi==2023.5.7
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.1.0
# via
# -c requirements/base.txt
# aiohttp
# requests
decorator==5.1.1
# via gcsfs
frozenlist==1.3.3
# via
# aiohttp
# aiosignal
fsspec==2023.6.0
# via
# -r requirements/ingest-gcs.in
# gcsfs
gcsfs==2023.6.0
# via -r requirements/ingest-gcs.in
google-api-core==2.11.1
# via
# google-cloud-core
# google-cloud-storage
google-auth==2.20.0
# via
# gcsfs
# google-api-core
# google-auth-oauthlib
# google-cloud-core
# google-cloud-storage
google-auth-oauthlib==1.0.0
# via gcsfs
google-cloud-core==2.3.2
# via google-cloud-storage
google-cloud-storage==2.9.0
# via gcsfs
google-crc32c==1.5.0
# via google-resumable-media
google-resumable-media==2.5.0
# via google-cloud-storage
googleapis-common-protos==1.59.1
# via google-api-core
idna==3.4
# via
# -c requirements/base.txt
# requests
# yarl
multidict==6.0.4
# via
# aiohttp
# yarl
oauthlib==3.2.2
# via requests-oauthlib
protobuf==3.20.3
# via
# -c requirements/constraints.in
# google-api-core
pyasn1==0.5.0
# via
# pyasn1-modules
# rsa
pyasn1-modules==0.3.0
# via google-auth
requests==2.31.0
# via
# -c requirements/base.txt
# gcsfs
# google-api-core
# google-cloud-storage
# requests-oauthlib
requests-oauthlib==1.3.1
# via google-auth-oauthlib
rsa==4.9
# via google-auth
six==1.16.0
# via
# -c requirements/base.txt
# google-auth
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# google-auth
# requests
yarl==1.9.2
# via aiohttp

requirements/ingest-google-drive.txt

@ -15,9 +15,9 @@ charset-normalizer==3.1.0
# via
# -c requirements/base.txt
# requests
google-api-core==2.11.0
google-api-core==2.11.1
# via google-api-python-client
google-api-python-client==2.89.0
google-api-python-client==2.90.0
# via -r requirements/ingest-google-drive.in
google-auth==2.20.0
# via
@ -47,7 +47,7 @@ pyasn1==0.5.0
# rsa
pyasn1-modules==0.3.0
# via google-auth
pyparsing==3.0.9
pyparsing==3.1.0
# via httplib2
requests==2.31.0
# via

requirements/ingest-reddit.txt

@ -33,5 +33,5 @@ urllib3==1.26.16
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
websocket-client==1.5.3
websocket-client==1.6.0
# via praw

requirements/local-inference.txt

@ -87,7 +87,7 @@ numpy==1.23.5
# transformers
omegaconf==2.3.0
# via effdet
onnxruntime==1.15.0
onnxruntime==1.15.1
# via unstructured-inference
opencv-python==4.7.0.72
# via
@ -136,7 +136,7 @@ pycparser==2.21
# via
# -c requirements/base.txt
# cffi
pyparsing==3.0.9
pyparsing==3.1.0
# via matplotlib
pytesseract==0.3.10
# via layoutparser

requirements/test.txt

@ -56,7 +56,7 @@ mccabe==0.7.0
# via flake8
multidict==6.0.4
# via yarl
mypy==1.3.0
mypy==1.4.0
# via -r requirements/test.in
mypy-extensions==1.0.0
# via
@ -69,7 +69,7 @@ packaging==23.1
# pytest
pathspec==0.11.1
# via black
platformdirs==3.5.3
platformdirs==3.6.0
# via black
pluggy==1.0.0
# via pytest
@ -87,7 +87,7 @@ pytest==7.3.2
# pytest-mock
pytest-cov==4.1.0
# via -r requirements/test.in
pytest-mock==3.10.0
pytest-mock==3.11.1
# via -r requirements/test.in
python-dateutil==2.8.2
# via
@ -99,7 +99,7 @@ requests==2.31.0
# via
# -c requirements/base.txt
# label-studio-sdk
ruff==0.0.272
ruff==0.0.273
# via -r requirements/test.in
six==1.16.0
# via

setup.py

@ -81,6 +81,7 @@ setup(
"slack": load_requirements("requirements/ingest-slack.in"),
"wikipedia": load_requirements("requirements/ingest-wikipedia.in"),
"google-drive": load_requirements("requirements/ingest-google-drive.in"),
"gcs": load_requirements("requirements/ingest-gcs.in"),
},
package_dir={"unstructured": "unstructured"},
package_data={"unstructured": ["nlp/*.txt"]},
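
With the extra registered, end users can install the connector's dependencies straight from pip; a minimal sketch (the quotes keep the brackets from being globbed by the shell, matching the install hint updated in unstructured/utils.py at the end of this diff):

# Install unstructured together with the gcsfs/fsspec extras.
pip install "unstructured[gcs]"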

test_unstructured_ingest/expected-structured-output/gcs/ (new fixture)

@ -0,0 +1,12 @@
[
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]

test_unstructured_ingest/expected-structured-output/gcs/ (new fixture)

@ -0,0 +1,56 @@
[
{
"type": "NarrativeText",
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "This is a test document to use for unit tests."
},
{
"type": "Address",
"element_id": "a9d4657034aa3fdb5177f1325e912362",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Doylestown, PA 18901"
},
{
"type": "Title",
"element_id": "9c218520320f238595f1fde74bdd137d",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Important points:"
},
{
"type": "ListItem",
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Hamburgers are delicious"
},
{
"type": "ListItem",
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Dogs are the best"
},
{
"type": "ListItem",
"element_id": "0b61e826b1c4ab05750184da72b89f83",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "I love fuzzy blankets"
}
]

test_unstructured_ingest/expected-structured-output/gcs/ (new fixture)

@ -0,0 +1,12 @@
[
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]

test_unstructured_ingest/expected-structured-output/gcs/ (new fixture)

@ -0,0 +1,56 @@
[
{
"type": "NarrativeText",
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "This is a test document to use for unit tests."
},
{
"type": "Address",
"element_id": "a9d4657034aa3fdb5177f1325e912362",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Doylestown, PA 18901"
},
{
"type": "Title",
"element_id": "9c218520320f238595f1fde74bdd137d",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Important points:"
},
{
"type": "ListItem",
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Hamburgers are delicious"
},
{
"type": "ListItem",
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Dogs are the best"
},
{
"type": "ListItem",
"element_id": "0b61e826b1c4ab05750184da72b89f83",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "I love fuzzy blankets"
}
]

test_unstructured_ingest/expected-structured-output/gcs/ (new fixture)

@ -0,0 +1,12 @@
[
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]

test_unstructured_ingest/expected-structured-output/gcs/ (new fixture)

@ -0,0 +1,26 @@
[
{
"type": "Table",
"element_id": "f8db6c6e535705336195aa2c1d23d414",
"metadata": {
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n \n \n Team\n Location\n Stanley Cups\n \n \n Blues\n STL\n 1\n \n \n Flyers\n PHI\n 2\n \n \n Maple Leafs\n TOR\n 13\n \n \n"
},
{
"type": "Table",
"element_id": "20f5163a43ac6eb04a40d269d3ad0663",
"metadata": {
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n \n \n Team\n Location\n Stanley Cups\n \n \n Blues\n STL\n 1\n \n \n Flyers\n PHI\n 2\n \n \n Maple Leafs\n TOR\n 0\n \n \n"
}
]

test_unstructured_ingest/test-ingest-gcs.sh (new file)

@ -0,0 +1,58 @@
#!/usr/bin/env bash
set -e
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/.. || exit 1
if [[ "$(find test_unstructured_ingest/expected-structured-output/gcs/ -type f -size +1 | wc -l)" -ne 6 ]]; then
echo "The test fixtures in test_unstructured_ingest/expected-structured-output/ look suspicious. At least one of the files is too small."
echo "Did you overwrite test fixtures with bad outputs?"
exit 1
fi
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
exit 0
fi
# Create a temporary file
GCP_INGEST_SERVICE_KEY_FILE=$(mktemp)
echo "$GCP_INGEST_SERVICE_KEY" > "$GCP_INGEST_SERVICE_KEY_FILE"
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
--remote-url gs://utic-test-ingest-fixtures/ \
--structured-output-dir gcs-output \
--gcs-token "$GCP_INGEST_SERVICE_KEY_FILE" \
--recursive \
--preserve-downloads \
--reprocess
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
set +e
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
EXPECTED_DIR=test_unstructured_ingest/expected-structured-output/gcs
[ -d "$EXPECTED_DIR" ] && rm -rf "$EXPECTED_DIR"
cp -R gcs-output "$EXPECTED_DIR"
elif ! diff -ru test_unstructured_ingest/expected-structured-output/gcs gcs-output ; then
echo
echo "There are differences from the previously checked-in structured outputs."
echo
echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
echo
echo " export OVERWRITE_FIXTURES=true"
echo
echo "and then rerun this script."
echo
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
echo "to update fixtures for CI,"
echo
exit 1
fi
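
A note for running the new test locally: the script expects the service-account key contents, not a file path, in GCP_INGEST_SERVICE_KEY, since it writes them to a temp file itself. A hedged example; the key location is a placeholder:

# Run the GCS ingest fixture test from the repo root.
export GCP_INGEST_SERVICE_KEY="$(cat /path/to/service-account-key.json)"
./test_unstructured_ingest/test-ingest-gcs.sh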

test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh

@ -10,7 +10,7 @@ cd "$SCRIPT_DIR"/.. || exit 1
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
--local-input-path files-ingest-download \
--local-recursive \
--recursive \
--local-file-glob "*.pdf" \
--structured-output-dir pdf-fast-reprocess-ingest-output \
--partition-strategy fast \

test_unstructured_ingest/test-ingest.sh

@ -20,5 +20,6 @@ export OMP_THREAD_LIMIT=1
./test_unstructured_ingest/test-ingest-local.sh
./test_unstructured_ingest/test-ingest-slack.sh
./test_unstructured_ingest/test-ingest-against-api.sh
./test_unstructured_ingest/test-ingest-gcs.sh
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh

unstructured/__version__.py

@ -1 +1 @@
__version__ = "0.7.7" # pragma: no cover
__version__ = "0.7.8-dev0" # pragma: no cover

unstructured/ingest/connector/fsspec.py

@ -15,8 +15,11 @@ from unstructured.ingest.logger import logger
SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
"s3",
"s3a",
"abfs",
"az",
"gs",
"gcs",
]
@ -24,6 +27,7 @@ SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
class SimpleFsspecConfig(BaseConnectorConfig):
# fsspec specific options
path: str
recursive: bool
access_kwargs: dict = field(default_factory=dict)
protocol: str = field(init=False)
path_without_protocol: str = field(init=False)
@ -81,7 +85,9 @@ class FsspecIngestDoc(BaseIngestDoc):
def has_output(self):
"""Determine if structured output for this doc already exists."""
return self._output_filename().is_file() and os.path.getsize(self._output_filename())
return self._output_filename().is_file() and os.path.getsize(
self._output_filename(),
)
def _create_full_tmp_dir_path(self):
"""Includes "directories" in the object path"""
@ -114,19 +120,24 @@ class FsspecIngestDoc(BaseIngestDoc):
output_filename = self._output_filename()
output_filename.parent.mkdir(parents=True, exist_ok=True)
with open(output_filename, "w") as output_f:
output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2))
output_f.write(
json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2),
)
logger.info(f"Wrote {output_filename}")
@property
def filename(self):
"""The filename of the file after downloading from s3"""
"""The filename of the file after downloading from cloud"""
return self._tmp_download_file()
def cleanup_file(self):
"""Removes the local copy of the file after successful processing."""
if not self.standard_config.preserve_downloads and not self.standard_config.download_only:
logger.debug(f"Cleaning up {self}")
os.unlink(self._tmp_download_file())
try:
os.unlink(self._tmp_download_file())
except OSError as e: # Don't think we need to raise an exception
logger.debug(f"Failed to remove {self._tmp_download_file()} due to {e}")
class FsspecConnector(BaseConnector):
@ -151,7 +162,7 @@ class FsspecConnector(BaseConnector):
)
def cleanup(self, cur_dir=None):
"""cleanup linginering empty sub-dirs from s3 paths, but leave remaining files
"""cleanup linginering empty sub-dirs from cloud paths, but leave remaining files
(and their paths) in tact as that indicates they were not processed"""
if not self.cleanup_files:
return
@ -177,7 +188,26 @@ class FsspecConnector(BaseConnector):
)
def _list_files(self):
return self.fs.ls(self.config.path_without_protocol)
if not self.config.recursive:
# fs.ls does not walk directories
# directories that are listed in cloud storage can cause problems
# because they are seen as 0 byte files
return [
x.get("name")
for x in self.fs.ls(self.config.path_without_protocol, detail=True)
if x.get("size") > 0
]
else:
# fs.find will recursively walk directories
# "size" is a common key for all the cloud protocols with fs
return [
k
for k, v in self.fs.find(
self.config.path_without_protocol,
detail=True,
).items()
if v.get("size") > 0
]
def get_ingest_docs(self):
return [

unstructured/ingest/connector/gcs.py (new file)

@ -0,0 +1,33 @@
from dataclasses import dataclass
from typing import Type
from unstructured.ingest.connector.fsspec import (
FsspecConnector,
FsspecIngestDoc,
SimpleFsspecConfig,
)
from unstructured.ingest.interfaces import StandardConnectorConfig
from unstructured.utils import requires_dependencies
@dataclass
class SimpleGcsConfig(SimpleFsspecConfig):
pass
class GcsIngestDoc(FsspecIngestDoc):
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
def get_file(self):
super().get_file()
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
class GcsConnector(FsspecConnector):
ingest_doc_cls: Type[GcsIngestDoc] = GcsIngestDoc
def __init__(
self,
config: SimpleGcsConfig,
standard_config: StandardConnectorConfig,
) -> None:
super().__init__(standard_config, config)

unstructured/ingest/main.py

@ -161,12 +161,6 @@ class MainProcess:
default=None,
help="Path to the location in the local file system that will be processed.",
)
@click.option(
"--local-recursive",
is_flag=True,
default=False,
help="Support recursive local file processing.",
)
@click.option(
"--local-file-glob",
default=None,
@ -177,7 +171,14 @@ class MainProcess:
"--remote-url",
default=None,
help="Remote fsspec URL formatted as `protocol://dir/path`, it can contain both "
"a directory or a single file. Supported protocols are: `s3`, `s3a`, `abfs`, and `az`.",
"a directory or a single file. Supported protocols are: `gcs`, `gs`, `s3`, `s3a`, `abfs` "
"and `az`.",
)
@click.option(
"--gcs-token",
default=None,
help="Token used to access Google Cloud. GCSFS will attempt to use your default gcloud creds"
"or get creds from the google metadata service or fall back to anonymous access.",
)
@click.option(
"--s3-anonymous",
@ -211,13 +212,6 @@ class MainProcess:
default=None,
help="Path to the Google Drive service account json file.",
)
@click.option(
"--drive-recursive",
is_flag=True,
default=False,
help="Recursively download files in folders from the Google Drive ID, "
"otherwise stop at the files in provided folder level.",
)
@click.option(
"--drive-extension",
default=None,
@ -412,17 +406,26 @@ class MainProcess:
show_default=True,
help="Number of parallel processes to process docs in.",
)
@click.option(
"--recursive",
is_flag=True,
default=False,
help="Recursively download files in their respective folders"
"otherwise stop at the files in provided folder level."
" Supported protocols are: `gcs`, `gs`, `s3`, `s3a`, `abfs` "
"`az`, `google drive` and `local`.",
)
@click.option("-v", "--verbose", is_flag=True, default=False)
def main(
ctx,
remote_url,
s3_anonymous,
gcs_token,
azure_account_name,
azure_account_key,
azure_connection_string,
drive_id,
drive_service_account_key,
drive_recursive,
drive_extension,
biomed_path,
biomed_api_id,
@ -457,6 +460,7 @@ def main(
structured_output_dir,
reprocess,
num_processes,
recursive,
verbose,
metadata_include,
metadata_exclude,
@ -468,7 +472,6 @@ def main(
partition_strategy,
api_key,
local_input_path,
local_recursive,
local_file_glob,
download_only,
):
@ -580,9 +583,21 @@ def main(
standard_config=standard_config,
config=SimpleS3Config(
path=remote_url,
recursive=recursive,
access_kwargs={"anon": s3_anonymous},
),
)
elif protocol in ("gs", "gcs"):
from unstructured.ingest.connector.gcs import GcsConnector, SimpleGcsConfig
doc_connector = GcsConnector( # type: ignore
standard_config=standard_config,
config=SimpleGcsConfig(
path=remote_url,
recursive=recursive,
access_kwargs={"token": gcs_token},
),
)
elif protocol in ("abfs", "az"):
from unstructured.ingest.connector.azure import (
AzureBlobStorageConnector,
@ -602,14 +617,15 @@ def main(
standard_config=standard_config,
config=SimpleAzureBlobStorageConfig(
path=remote_url,
recursive=recursive,
access_kwargs=access_kwargs,
),
)
else:
warnings.warn(
f"`fsspec` protocol {protocol} is not directly supported by `unstructured`,"
" so use it at your own risk. Supported protocols are `s3`, `s3a`, `abfs`,"
" and `az`.",
" so use it at your own risk. Supported protocols are `gcs`, `gs`, `s3`, `s3a`,"
"`abfs` and `az`.",
UserWarning,
)
@ -622,6 +638,7 @@ def main(
standard_config=standard_config,
config=SimpleFsspecConfig(
path=remote_url,
recursive=recursive,
),
)
elif github_url:
@ -721,7 +738,7 @@ def main(
config=SimpleGoogleDriveConfig(
drive_id=drive_id,
service_account_key=drive_service_account_key,
recursive=drive_recursive,
recursive=recursive,
extension=drive_extension,
),
)
@ -753,7 +770,7 @@ def main(
standard_config=standard_config,
config=SimpleLocalConfig(
input_path=local_input_path,
recursive=local_recursive,
recursive=recursive,
file_glob=local_file_glob,
),
)
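
Putting the new options together, an authenticated recursive ingest from a private bucket might look like the sketch below; the bucket name and key path are hypothetical placeholders:

# Recursive GCS ingest using an explicit service-account token file.
PYTHONPATH=. ./unstructured/ingest/main.py \
  --remote-url gs://my-private-bucket/docs/ \
  --gcs-token "/path/to/service-account-key.json" \
  --structured-output-dir gcs-output \
  --num-processes 2 \
  --recursive \
  --verbose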

unstructured/utils.py

@ -35,7 +35,7 @@ def requires_dependencies(
raise ImportError(
f"Following dependencies are missing: {', '.join(missing_deps)}. "
+ (
f"Please install them using `pip install unstructured[{extras}]`."
f"""Please install them using `pip install "unstructured[{extras}]"`."""
if extras
else f"Please install them using `pip install {' '.join(missing_deps)}`."
),