From 3b472cb7dfbc6926cf9511a9845f3a615ccb0269 Mon Sep 17 00:00:00 2001 From: David Potter Date: Wed, 21 Jun 2023 15:14:50 -0700 Subject: [PATCH] feat: add google cloud storage connector (#746) --- .github/workflows/ci.yml | 1 + .../ingest-test-fixtures-update-pr.yml | 1 + CHANGELOG.md | 13 +++ MANIFEST.in | 1 + Makefile | 5 + docs/requirements.txt | 2 +- examples/ingest/azure/ingest.sh | 2 +- .../ingest/google_cloud_storage/ingest.sh | 16 +++ examples/ingest/google_drive/ingest.sh | 2 +- examples/ingest/local/ingest.sh | 2 +- requirements/base.txt | 4 +- requirements/build.txt | 2 +- requirements/dev.txt | 16 +-- requirements/ingest-gcs.in | 4 + requirements/ingest-gcs.txt | 105 ++++++++++++++++++ requirements/ingest-google-drive.txt | 6 +- requirements/ingest-reddit.txt | 2 +- requirements/local-inference.txt | 4 +- requirements/test.txt | 8 +- setup.py | 1 + .../gcs/ideas-page.html.json | 12 ++ .../gcs/nested-1/fake-text.txt.json | 56 ++++++++++ .../gcs/nested-1/nested/ideas-page.html.json | 12 ++ .../gcs/nested-2/fake-text.txt.json | 56 ++++++++++ .../gcs/nested-2/nested/ideas-page.html.json | 12 ++ .../gcs/nested-2/stanley-cups.xlsx.json | 26 +++++ test_unstructured_ingest/test-ingest-gcs.sh | 58 ++++++++++ .../test-ingest-pdf-fast-reprocess.sh | 2 +- test_unstructured_ingest/test-ingest.sh | 1 + unstructured/__version__.py | 2 +- unstructured/ingest/connector/fsspec.py | 42 ++++++- unstructured/ingest/connector/gcs.py | 33 ++++++ unstructured/ingest/main.py | 57 ++++++---- unstructured/utils.py | 2 +- 34 files changed, 514 insertions(+), 54 deletions(-) mode change 100644 => 100755 examples/ingest/azure/ingest.sh create mode 100755 examples/ingest/google_cloud_storage/ingest.sh mode change 100644 => 100755 examples/ingest/local/ingest.sh create mode 100644 requirements/ingest-gcs.in create mode 100644 requirements/ingest-gcs.txt create mode 100644 test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json create mode 100644 test_unstructured_ingest/expected-structured-output/gcs/nested-1/fake-text.txt.json create mode 100644 test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json create mode 100644 test_unstructured_ingest/expected-structured-output/gcs/nested-2/fake-text.txt.json create mode 100644 test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json create mode 100644 test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json create mode 100755 test_unstructured_ingest/test-ingest-gcs.sh create mode 100644 unstructured/ingest/connector/gcs.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 710ef8a7b..430f3a528 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -195,6 +195,7 @@ jobs: make install-ingest-s3 make install-ingest-azure make install-ingest-discord + make install-ingest-gcs make install-ingest-google-drive make install-ingest-github make install-ingest-gitlab diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 295b257b6..9b5d4464f 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -73,6 +73,7 @@ jobs: make install-ingest-s3 make install-ingest-azure make install-ingest-discord + make install-ingest-gcs make install-ingest-google-drive make install-ingest-github make install-ingest-gitlab diff --git a/CHANGELOG.md b/CHANGELOG.md index af8a57a90..5d9ec4f3b 
100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,16 @@ +## 0.7.8-dev0 + +### Enhancements + +* Adds recursive functionality to all fsspec connectors +* Adds generic --recursive ingest flag + +### Features + +* Adds Google Cloud Storage connector + +### Fixes + ## 0.7.7 ### Enhancements diff --git a/MANIFEST.in b/MANIFEST.in index 9e1b6a71f..d8403bab2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,6 +2,7 @@ include requirements/base.in include requirements/huggingface.in include requirements/local-inference.in include requirements/ingest-s3.in +include requirements/ingest-gcs.in include requirements/ingest-azure.in include requirements/ingest-discord.in include requirements/ingest-github.in diff --git a/Makefile b/Makefile index b0448be11..72936a256 100644 --- a/Makefile +++ b/Makefile @@ -62,6 +62,10 @@ install-ingest-google-drive: install-ingest-s3: python3 -m pip install -r requirements/ingest-s3.txt +.PHONY: install-ingest-gcs +install-ingest-gcs: + python3 -m pip install -r requirements/ingest-gcs.txt + .PHONY: install-ingest-azure install-ingest-azure: python3 -m pip install -r requirements/ingest-azure.txt @@ -117,6 +121,7 @@ pip-compile: # sphinx docs looks for additional requirements cp requirements/build.txt docs/requirements.txt pip-compile --upgrade requirements/ingest-s3.in + pip-compile --upgrade requirements/ingest-gcs.in pip-compile --upgrade requirements/ingest-azure.in pip-compile --upgrade requirements/ingest-discord.in pip-compile --upgrade requirements/ingest-reddit.in diff --git a/docs/requirements.txt b/docs/requirements.txt index 26a41f3e3..eff9ce3e6 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -26,7 +26,7 @@ idna==3.4 # via requests imagesize==1.4.1 # via sphinx -importlib-metadata==6.6.0 +importlib-metadata==6.7.0 # via sphinx jinja2==3.1.2 # via sphinx diff --git a/examples/ingest/azure/ingest.sh b/examples/ingest/azure/ingest.sh old mode 100644 new mode 100755 index b75c216f0..37ff75224 --- a/examples/ingest/azure/ingest.sh +++ b/examples/ingest/azure/ingest.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # Processes all the files from abfs://container1/ in azureunstructured1 account, -# using the `unstructured` library. +# using the `unstructured` library. # Structured outputs are stored in azure-ingest-output/ diff --git a/examples/ingest/google_cloud_storage/ingest.sh b/examples/ingest/google_cloud_storage/ingest.sh new file mode 100755 index 000000000..b16b9578e --- /dev/null +++ b/examples/ingest/google_cloud_storage/ingest.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +# Processes several files in a nested folder structure from gs://utic-test-ingest-fixtures-public/ +# through Unstructured's library in 2 processes. + +# Structured outputs are stored in gcs-output/ + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd "$SCRIPT_DIR"/../../.. || exit 1 + +PYTHONPATH=. ./unstructured/ingest/main.py \ --remote-url gs://utic-test-ingest-fixtures-public/ \ --structured-output-dir gcs-output \ --num-processes 2 \ --recursive \ --verbose \ No newline at end of file diff --git a/examples/ingest/google_drive/ingest.sh b/examples/ingest/google_drive/ingest.sh index d1f63f5f9..f5fe9cccc 100644 --- a/examples/ingest/google_drive/ingest.sh +++ b/examples/ingest/google_drive/ingest.sh @@ -28,7 +28,7 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --drive-service-account-key "" \ --structured-output-dir google-drive-ingest-output \ --num-processes 2 \ - --drive-recursive \ + --recursive \ --verbose \ # --extension ".docx" # Ensures only .docx files are processed. diff --git a/examples/ingest/local/ingest.sh b/examples/ingest/local/ingest.sh old mode 100644 new mode 100755 index a46ace010..d5d47b93e --- a/examples/ingest/local/ingest.sh +++ b/examples/ingest/local/ingest.sh @@ -20,7 +20,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --local-input-path example-docs \ --structured-output-dir local-ingest-output \ --num-processes 2 \ - --local-recursive \ + --recursive \ --verbose \ # Alternatively, you can call it using: diff --git a/requirements/base.txt b/requirements/base.txt index 5532377b2..50e1240be 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -6,7 +6,7 @@ # anyio==3.7.0 # via httpcore -argilla==1.9.0 +argilla==1.10.0 # via -r requirements/base.in backoff==2.2.1 # via argilla @@ -51,7 +51,7 @@ idna==3.4 # anyio # requests # rfc3986 -importlib-metadata==6.6.0 +importlib-metadata==6.7.0 # via markdown joblib==1.2.0 # via nltk diff --git a/requirements/build.txt b/requirements/build.txt index 26a41f3e3..eff9ce3e6 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -26,7 +26,7 @@ idna==3.4 # via requests imagesize==1.4.1 # via sphinx -importlib-metadata==6.6.0 +importlib-metadata==6.7.0 # via sphinx jinja2==3.1.2 # via sphinx diff --git a/requirements/dev.txt b/requirements/dev.txt index f60d75626..7d61f0365 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -75,7 +75,7 @@ idna==3.4 # -c requirements/test.txt # anyio # jsonschema -importlib-metadata==6.6.0 +importlib-metadata==6.7.0 # via # -c requirements/base.txt # jupyter-client @@ -113,7 +113,7 @@ jinja2==3.1.2 # nbclassic # nbconvert # notebook -jsonpointer==2.3 +jsonpointer==2.4 # via jsonschema jsonschema[format-nongpl]==4.17.3 # via @@ -132,7 +132,7 @@ jupyter-client==8.2.0 # qtconsole jupyter-console==6.6.3 # via jupyter -jupyter-core==5.3.0 +jupyter-core==5.3.1 # via # -c requirements/constraints.in # ipykernel @@ -165,13 +165,13 @@ matplotlib-inline==0.1.6 # via # ipykernel # ipython -mistune==2.0.5 +mistune==3.0.1 # via nbconvert nbclassic==1.0.0 # via notebook nbclient==0.8.0 # via nbconvert -nbconvert==7.5.0 +nbconvert==7.6.0 # via # jupyter # jupyter-server @@ -219,7 +219,7 @@ pip-tools==6.13.0 # via -r requirements/dev.in pkgutil-resolve-name==1.3.10 # via jsonschema -platformdirs==3.5.3 +platformdirs==3.6.0 # via # -c requirements/test.txt # jupyter-core @@ -359,7 +359,7 @@ typing-extensions==4.6.3 # ipython uri-template==1.2.0 # via jsonschema -virtualenv==20.23.0 +virtualenv==20.23.1 # via pre-commit wcwidth==0.2.6 # via prompt-toolkit @@ -369,7 +369,7 @@ webencodings==0.5.1 # via # bleach # tinycss2 -websocket-client==1.5.3 +websocket-client==1.6.0 # via jupyter-server wheel==0.40.0 # via diff --git a/requirements/ingest-gcs.in b/requirements/ingest-gcs.in new file mode 100644 index 000000000..8c527ae9f --- /dev/null +++ b/requirements/ingest-gcs.in @@ -0,0 +1,4 @@ +-c constraints.in +-c base.txt +gcsfs +fsspec diff --git a/requirements/ingest-gcs.txt b/requirements/ingest-gcs.txt new file mode 100644 index 000000000..c83f4968f --- /dev/null +++ b/requirements/ingest-gcs.txt @@ -0,0 +1,105 @@ +# +# This file is autogenerated by pip-compile with Python 3.8 +# by the following command: +# +# pip-compile requirements/ingest-gcs.in +# +aiohttp==3.8.4 + # via gcsfs +aiosignal==1.3.1 + 
# via aiohttp +async-timeout==4.0.2 + # via aiohttp +attrs==23.1.0 + # via aiohttp +cachetools==5.3.1 + # via google-auth +certifi==2023.5.7 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # requests +charset-normalizer==3.1.0 + # via + # -c requirements/base.txt + # aiohttp + # requests +decorator==5.1.1 + # via gcsfs +frozenlist==1.3.3 + # via + # aiohttp + # aiosignal +fsspec==2023.6.0 + # via + # -r requirements/ingest-gcs.in + # gcsfs +gcsfs==2023.6.0 + # via -r requirements/ingest-gcs.in +google-api-core==2.11.1 + # via + # google-cloud-core + # google-cloud-storage +google-auth==2.20.0 + # via + # gcsfs + # google-api-core + # google-auth-oauthlib + # google-cloud-core + # google-cloud-storage +google-auth-oauthlib==1.0.0 + # via gcsfs +google-cloud-core==2.3.2 + # via google-cloud-storage +google-cloud-storage==2.9.0 + # via gcsfs +google-crc32c==1.5.0 + # via google-resumable-media +google-resumable-media==2.5.0 + # via google-cloud-storage +googleapis-common-protos==1.59.1 + # via google-api-core +idna==3.4 + # via + # -c requirements/base.txt + # requests + # yarl +multidict==6.0.4 + # via + # aiohttp + # yarl +oauthlib==3.2.2 + # via requests-oauthlib +protobuf==3.20.3 + # via + # -c requirements/constraints.in + # google-api-core +pyasn1==0.5.0 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.3.0 + # via google-auth +requests==2.31.0 + # via + # -c requirements/base.txt + # gcsfs + # google-api-core + # google-cloud-storage + # requests-oauthlib +requests-oauthlib==1.3.1 + # via google-auth-oauthlib +rsa==4.9 + # via google-auth +six==1.16.0 + # via + # -c requirements/base.txt + # google-auth +urllib3==1.26.16 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # google-auth + # requests +yarl==1.9.2 + # via aiohttp diff --git a/requirements/ingest-google-drive.txt b/requirements/ingest-google-drive.txt index ceb2a16dc..783c50726 100644 --- a/requirements/ingest-google-drive.txt +++ b/requirements/ingest-google-drive.txt @@ -15,9 +15,9 @@ charset-normalizer==3.1.0 # via # -c requirements/base.txt # requests -google-api-core==2.11.0 +google-api-core==2.11.1 # via google-api-python-client -google-api-python-client==2.89.0 +google-api-python-client==2.90.0 # via -r requirements/ingest-google-drive.in google-auth==2.20.0 # via @@ -47,7 +47,7 @@ pyasn1==0.5.0 # rsa pyasn1-modules==0.3.0 # via google-auth -pyparsing==3.0.9 +pyparsing==3.1.0 # via httplib2 requests==2.31.0 # via diff --git a/requirements/ingest-reddit.txt b/requirements/ingest-reddit.txt index ae4bfadfa..06003c883 100644 --- a/requirements/ingest-reddit.txt +++ b/requirements/ingest-reddit.txt @@ -33,5 +33,5 @@ urllib3==1.26.16 # -c requirements/base.txt # -c requirements/constraints.in # requests -websocket-client==1.5.3 +websocket-client==1.6.0 # via praw diff --git a/requirements/local-inference.txt b/requirements/local-inference.txt index 638b6f2b5..18273fbca 100644 --- a/requirements/local-inference.txt +++ b/requirements/local-inference.txt @@ -87,7 +87,7 @@ numpy==1.23.5 # transformers omegaconf==2.3.0 # via effdet -onnxruntime==1.15.0 +onnxruntime==1.15.1 # via unstructured-inference opencv-python==4.7.0.72 # via @@ -136,7 +136,7 @@ pycparser==2.21 # via # -c requirements/base.txt # cffi -pyparsing==3.0.9 +pyparsing==3.1.0 # via matplotlib pytesseract==0.3.10 # via layoutparser diff --git a/requirements/test.txt b/requirements/test.txt index 390db3130..d5467cf02 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -56,7 +56,7 @@ mccabe==0.7.0 
# via flake8 multidict==6.0.4 # via yarl -mypy==1.3.0 +mypy==1.4.0 # via -r requirements/test.in mypy-extensions==1.0.0 # via @@ -69,7 +69,7 @@ packaging==23.1 # pytest pathspec==0.11.1 # via black -platformdirs==3.5.3 +platformdirs==3.6.0 # via black pluggy==1.0.0 # via pytest @@ -87,7 +87,7 @@ pytest==7.3.2 # pytest-mock pytest-cov==4.1.0 # via -r requirements/test.in -pytest-mock==3.10.0 +pytest-mock==3.11.1 # via -r requirements/test.in python-dateutil==2.8.2 # via @@ -99,7 +99,7 @@ requests==2.31.0 # via # -c requirements/base.txt # label-studio-sdk -ruff==0.0.272 +ruff==0.0.273 # via -r requirements/test.in six==1.16.0 # via diff --git a/setup.py b/setup.py index d01619940..e56ee3df6 100644 --- a/setup.py +++ b/setup.py @@ -81,6 +81,7 @@ setup( "slack": load_requirements("requirements/ingest-slack.in"), "wikipedia": load_requirements("requirements/ingest-wikipedia.in"), "google-drive": load_requirements("requirements/ingest-google-drive.in"), + "gcs": load_requirements("requirements/ingest-gcs.in"), }, package_dir={"unstructured": "unstructured"}, package_data={"unstructured": ["nlp/*.txt"]}, diff --git a/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json new file mode 100644 index 000000000..695e7dc7f --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json @@ -0,0 +1,12 @@ +[ + { + "type": "NarrativeText", + "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/fake-text.txt.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/fake-text.txt.json new file mode 100644 index 000000000..b9d4291b7 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/fake-text.txt.json @@ -0,0 +1,56 @@ +[ + { + "type": "NarrativeText", + "element_id": "1df8eeb8be847c3a1a7411e3be3e0396", + "metadata": { + "data_source": {}, + "filetype": "text/plain" + }, + "text": "This is a test document to use for unit tests." 
+ }, + { + "type": "Address", + "element_id": "a9d4657034aa3fdb5177f1325e912362", + "metadata": { + "data_source": {}, + "filetype": "text/plain" + }, + "text": "Doylestown, PA 18901" + }, + { + "type": "Title", + "element_id": "9c218520320f238595f1fde74bdd137d", + "metadata": { + "data_source": {}, + "filetype": "text/plain" + }, + "text": "Important points:" + }, + { + "type": "ListItem", + "element_id": "39a3ae572581d0f1fe7511fd7b3aa414", + "metadata": { + "data_source": {}, + "filetype": "text/plain" + }, + "text": "Hamburgers are delicious" + }, + { + "type": "ListItem", + "element_id": "fc1adcb8eaceac694e500a103f9f698f", + "metadata": { + "data_source": {}, + "filetype": "text/plain" + }, + "text": "Dogs are the best" + }, + { + "type": "ListItem", + "element_id": "0b61e826b1c4ab05750184da72b89f83", + "metadata": { + "data_source": {}, + "filetype": "text/plain" + }, + "text": "I love fuzzy blankets" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json new file mode 100644 index 000000000..695e7dc7f --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json @@ -0,0 +1,12 @@ +[ + { + "type": "NarrativeText", + "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/fake-text.txt.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/fake-text.txt.json new file mode 100644 index 000000000..b9d4291b7 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/fake-text.txt.json @@ -0,0 +1,56 @@ +[ + { + "type": "NarrativeText", + "element_id": "1df8eeb8be847c3a1a7411e3be3e0396", + "metadata": { + "data_source": {}, + "filetype": "text/plain" + }, + "text": "This is a test document to use for unit tests." 
+ }, + { + "type": "Address", + "element_id": "a9d4657034aa3fdb5177f1325e912362", + "metadata": { + "data_source": {}, + "filetype": "text/plain" + }, + "text": "Doylestown, PA 18901" + }, + { + "type": "Title", + "element_id": "9c218520320f238595f1fde74bdd137d", + "metadata": { + "data_source": {}, + "filetype": "text/plain" + }, + "text": "Important points:" + }, + { + "type": "ListItem", + "element_id": "39a3ae572581d0f1fe7511fd7b3aa414", + "metadata": { + "data_source": {}, + "filetype": "text/plain" + }, + "text": "Hamburgers are delicious" + }, + { + "type": "ListItem", + "element_id": "fc1adcb8eaceac694e500a103f9f698f", + "metadata": { + "data_source": {}, + "filetype": "text/plain" + }, + "text": "Dogs are the best" + }, + { + "type": "ListItem", + "element_id": "0b61e826b1c4ab05750184da72b89f83", + "metadata": { + "data_source": {}, + "filetype": "text/plain" + }, + "text": "I love fuzzy blankets" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json new file mode 100644 index 000000000..695e7dc7f --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json @@ -0,0 +1,12 @@ +[ + { + "type": "NarrativeText", + "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json new file mode 100644 index 000000000..bc1a88f3b --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json @@ -0,0 +1,26 @@ +[ + { + "type": "Table", + "element_id": "f8db6c6e535705336195aa2c1d23d414", + "metadata": { + "data_source": {}, + "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "page_number": 1, + "page_name": "Stanley Cups", + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
" + }, + "text": "\n \n \n Team\n Location\n Stanley Cups\n \n \n Blues\n STL\n 1\n \n \n Flyers\n PHI\n 2\n \n \n Maple Leafs\n TOR\n 13\n \n \n" + }, + { + "type": "Table", + "element_id": "20f5163a43ac6eb04a40d269d3ad0663", + "metadata": { + "data_source": {}, + "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "page_number": 2, + "page_name": "Stanley Cups Since 67", + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
" + }, + "text": "\n \n \n Team\n Location\n Stanley Cups\n \n \n Blues\n STL\n 1\n \n \n Flyers\n PHI\n 2\n \n \n Maple Leafs\n TOR\n 0\n \n \n" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/test-ingest-gcs.sh b/test_unstructured_ingest/test-ingest-gcs.sh new file mode 100755 index 000000000..1049d45c2 --- /dev/null +++ b/test_unstructured_ingest/test-ingest-gcs.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash + +set -e + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +cd "$SCRIPT_DIR"/.. || exit 1 + +if [[ "$(find test_unstructured_ingest/expected-structured-output/gcs/ -type f -size +1 | wc -l)" -ne 6 ]]; then + echo "The test fixtures in test_unstructured_ingest/expected-structured-output/ look suspicious. At least one of the files is too small." + echo "Did you overwrite test fixtures with bad outputs?" + exit 1 +fi + +if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then + echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." + exit 0 +fi + +# Create a temporary file +GCP_INGEST_SERVICE_KEY_FILE=$(mktemp) +echo "$GCP_INGEST_SERVICE_KEY" > "$GCP_INGEST_SERVICE_KEY_FILE" + + +PYTHONPATH=. ./unstructured/ingest/main.py \ + --metadata-exclude filename,file_directory,metadata.data_source.date_processed \ + --remote-url gs://utic-test-ingest-fixtures/ \ + --structured-output-dir gcs-output \ + --gcs-token "$GCP_INGEST_SERVICE_KEY_FILE" \ + --recursive \ + --preserve-downloads \ + --reprocess + +OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false} + +set +e + +# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64 +if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then + EXPECTED_DIR=test_unstructured_ingest/expected-structured-output/gcs + [ -d "$EXPECTED_DIR" ] && rm -rf "$EXPECTED_DIR" + cp -R gcs-output $EXPECTED_DIR + +elif ! diff -ru test_unstructured_ingest/expected-structured-output/gcs gcs-output ; then + echo + echo "There are differences from the previously checked-in structured outputs." + echo + echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:" + echo + echo " export OVERWRITE_FIXTURES=true" + echo + echo "and then rerun this script." + echo + echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware" + echo "to update fixtures for CI," + echo + exit 1 + +fi diff --git a/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh b/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh index 41845e8f1..dd13750e7 100755 --- a/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh @@ -10,7 +10,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 PYTHONPATH=. 
./unstructured/ingest/main.py \ --metadata-exclude filename,file_directory,metadata.data_source.date_processed \ --local-input-path files-ingest-download \ - --local-recursive \ + --recursive \ --local-file-glob "*.pdf" \ --structured-output-dir pdf-fast-reprocess-ingest-output \ --partition-strategy fast \ diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh index ff0569d93..8c41cd97e 100755 --- a/test_unstructured_ingest/test-ingest.sh +++ b/test_unstructured_ingest/test-ingest.sh @@ -20,5 +20,6 @@ export OMP_THREAD_LIMIT=1 ./test_unstructured_ingest/test-ingest-local.sh ./test_unstructured_ingest/test-ingest-slack.sh ./test_unstructured_ingest/test-ingest-against-api.sh +./test_unstructured_ingest/test-ingest-gcs.sh # NOTE(yuming): The following test should be put after any tests with --preserve-downloads option ./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh diff --git a/unstructured/__version__.py b/unstructured/__version__.py index bd4e32c66..02115ecae 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.7.7" # pragma: no cover +__version__ = "0.7.8-dev0" # pragma: no cover diff --git a/unstructured/ingest/connector/fsspec.py b/unstructured/ingest/connector/fsspec.py index 07630efc3..9bd03d776 100644 --- a/unstructured/ingest/connector/fsspec.py +++ b/unstructured/ingest/connector/fsspec.py @@ -15,8 +15,11 @@ from unstructured.ingest.logger import logger SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [ "s3", + "s3a", "abfs", "az", + "gs", + "gcs", ] @@ -24,6 +27,7 @@ SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [ class SimpleFsspecConfig(BaseConnectorConfig): # fsspec specific options path: str + recursive: bool access_kwargs: dict = field(default_factory=dict) protocol: str = field(init=False) path_without_protocol: str = field(init=False) @@ -81,7 +85,9 @@ class FsspecIngestDoc(BaseIngestDoc): def has_output(self): """Determine if structured output for this doc already exists.""" - return self._output_filename().is_file() and os.path.getsize(self._output_filename()) + return self._output_filename().is_file() and os.path.getsize( + self._output_filename(), + ) def _create_full_tmp_dir_path(self): """Includes "directories" in the object path""" @@ -114,19 +120,24 @@ class FsspecIngestDoc(BaseIngestDoc): output_filename = self._output_filename() output_filename.parent.mkdir(parents=True, exist_ok=True) with open(output_filename, "w") as output_f: - output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2)) + output_f.write( + json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2), + ) logger.info(f"Wrote {output_filename}") @property def filename(self): - """The filename of the file after downloading from s3""" + """The filename of the file after downloading from cloud""" return self._tmp_download_file() def cleanup_file(self): """Removes the local copy of the file after successful processing.""" if not self.standard_config.preserve_downloads and not self.standard_config.download_only: logger.debug(f"Cleaning up {self}") - os.unlink(self._tmp_download_file()) + try: + os.unlink(self._tmp_download_file()) + except OSError as e: # Don't think we need to raise an exception + logger.debug(f"Failed to remove {self._tmp_download_file()} due to {e}") class FsspecConnector(BaseConnector): @@ -151,7 +162,7 @@ class FsspecConnector(BaseConnector): ) def cleanup(self, cur_dir=None): - """cleanup linginering empty sub-dirs from s3 paths, but leave remaining files + 
"""cleanup linginering empty sub-dirs from cloud paths, but leave remaining files (and their paths) in tact as that indicates they were not processed""" if not self.cleanup_files: return @@ -177,7 +188,26 @@ class FsspecConnector(BaseConnector): ) def _list_files(self): - return self.fs.ls(self.config.path_without_protocol) + if not self.config.recursive: + # fs.ls does not walk directories + # directories that are listed in cloud storage can cause problems + # because they are seen as 0 byte files + return [ + x.get("name") + for x in self.fs.ls(self.config.path_without_protocol, detail=True) + if x.get("size") > 0 + ] + else: + # fs.find will recursively walk directories + # "size" is a common key for all the cloud protocols with fs + return [ + k + for k, v in self.fs.find( + self.config.path_without_protocol, + detail=True, + ).items() + if v.get("size") > 0 + ] def get_ingest_docs(self): return [ diff --git a/unstructured/ingest/connector/gcs.py b/unstructured/ingest/connector/gcs.py new file mode 100644 index 000000000..256934b07 --- /dev/null +++ b/unstructured/ingest/connector/gcs.py @@ -0,0 +1,33 @@ +from dataclasses import dataclass +from typing import Type + +from unstructured.ingest.connector.fsspec import ( + FsspecConnector, + FsspecIngestDoc, + SimpleFsspecConfig, +) +from unstructured.ingest.interfaces import StandardConnectorConfig +from unstructured.utils import requires_dependencies + + +@dataclass +class SimpleGcsConfig(SimpleFsspecConfig): + pass + + +class GcsIngestDoc(FsspecIngestDoc): + @requires_dependencies(["gcsfs", "fsspec"], extras="gcs") + def get_file(self): + super().get_file() + + +@requires_dependencies(["gcsfs", "fsspec"], extras="gcs") +class GcsConnector(FsspecConnector): + ingest_doc_cls: Type[GcsIngestDoc] = GcsIngestDoc + + def __init__( + self, + config: SimpleGcsConfig, + standard_config: StandardConnectorConfig, + ) -> None: + super().__init__(standard_config, config) diff --git a/unstructured/ingest/main.py b/unstructured/ingest/main.py index 7f612b2a0..ba37f2ea4 100755 --- a/unstructured/ingest/main.py +++ b/unstructured/ingest/main.py @@ -161,12 +161,6 @@ class MainProcess: default=None, help="Path to the location in the local file system that will be processed.", ) -@click.option( - "--local-recursive", - is_flag=True, - default=False, - help="Support recursive local file processing.", -) @click.option( "--local-file-glob", default=None, @@ -177,7 +171,14 @@ class MainProcess: "--remote-url", default=None, help="Remote fsspec URL formatted as `protocol://dir/path`, it can contain both " - "a directory or a single file. Supported protocols are: `s3`, `s3a`, `abfs`, and `az`.", + "a directory or a single file. Supported protocols are: `gcs`, `gs`, `s3`, `s3a`, `abfs` " + "and `az`.", +) +@click.option( + "--gcs-token", + default=None, + help="Token used to access Google Cloud. 
GCSFS will attempt to use your default gcloud creds " + "or get creds from the google metadata service or fall back to anonymous access.", ) @click.option( "--s3-anonymous", @@ -211,13 +212,6 @@ class MainProcess: default=None, help="Path to the Google Drive service account json file.", ) -@click.option( - "--drive-recursive", - is_flag=True, - default=False, - help="Recursively download files in folders from the Google Drive ID, " - "otherwise stop at the files in provided folder level.", -) @click.option( "--drive-extension", default=None, @@ -412,17 +406,26 @@ class MainProcess: show_default=True, help="Number of parallel processes to process docs in.", ) +@click.option( + "--recursive", + is_flag=True, + default=False, + help="Recursively download files in their respective folders, " + "otherwise stop at the files in the provided folder level." + " Supported protocols are: `gcs`, `gs`, `s3`, `s3a`, `abfs`, " + "`az`, `google drive` and `local`.", +) @click.option("-v", "--verbose", is_flag=True, default=False) def main( ctx, remote_url, s3_anonymous, + gcs_token, azure_account_name, azure_account_key, azure_connection_string, drive_id, drive_service_account_key, - drive_recursive, drive_extension, biomed_path, biomed_api_id, @@ -457,6 +460,7 @@ def main( structured_output_dir, reprocess, num_processes, + recursive, verbose, metadata_include, metadata_exclude, @@ -468,7 +472,6 @@ def main( partition_strategy, api_key, local_input_path, - local_recursive, local_file_glob, download_only, ): @@ -580,9 +583,21 @@ def main( standard_config=standard_config, config=SimpleS3Config( path=remote_url, + recursive=recursive, access_kwargs={"anon": s3_anonymous}, ), ) + elif protocol in ("gs", "gcs"): + from unstructured.ingest.connector.gcs import GcsConnector, SimpleGcsConfig + + doc_connector = GcsConnector( # type: ignore + standard_config=standard_config, + config=SimpleGcsConfig( + path=remote_url, + recursive=recursive, + access_kwargs={"token": gcs_token}, + ), + ) elif protocol in ("abfs", "az"): from unstructured.ingest.connector.azure import ( AzureBlobStorageConnector, @@ -602,14 +617,15 @@ def main( standard_config=standard_config, config=SimpleAzureBlobStorageConfig( path=remote_url, + recursive=recursive, access_kwargs=access_kwargs, ), ) else: warnings.warn( f"`fsspec` protocol {protocol} is not directly supported by `unstructured`," - " so use it at your own risk. Supported protocols are `s3`, `s3a`, `abfs`," - " and `az`.", + " so use it at your own risk. Supported protocols are `gcs`, `gs`, `s3`, `s3a`, " + "`abfs` and `az`.", UserWarning, ) @@ -622,6 +638,7 @@ def main( standard_config=standard_config, config=SimpleFsspecConfig( path=remote_url, + recursive=recursive, ), ) elif github_url: @@ -721,7 +738,7 @@ def main( config=SimpleGoogleDriveConfig( drive_id=drive_id, service_account_key=drive_service_account_key, - recursive=drive_recursive, + recursive=recursive, extension=drive_extension, ), ) @@ -753,7 +770,7 @@ def main( standard_config=standard_config, config=SimpleLocalConfig( input_path=local_input_path, - recursive=local_recursive, + recursive=recursive, file_glob=local_file_glob, ), ) diff --git a/unstructured/utils.py b/unstructured/utils.py index 5d1d86e79..d5a8de434 100644 --- a/unstructured/utils.py +++ b/unstructured/utils.py @@ -35,7 +35,7 @@ def requires_dependencies( raise ImportError( f"Following dependencies are missing: {', '.join(missing_deps)}. " + ( - f"Please install them using `pip install unstructured[{extras}]`." 
+ f"""Please install them using `pip install "unstructured[{extras}]"`.""" if extras else f"Please install them using `pip install {' '.join(missing_deps)}`." ),