feat: add google cloud storage connector (#746)

David Potter 2023-06-21 15:14:50 -07:00 committed by GitHub
parent 21c346dab8
commit 3b472cb7df
34 changed files with 514 additions and 54 deletions


@@ -195,6 +195,7 @@ jobs:
 make install-ingest-s3
 make install-ingest-azure
 make install-ingest-discord
+make install-ingest-gcs
 make install-ingest-google-drive
 make install-ingest-github
 make install-ingest-gitlab


@@ -73,6 +73,7 @@ jobs:
 make install-ingest-s3
 make install-ingest-azure
 make install-ingest-discord
+make install-ingest-gcs
 make install-ingest-google-drive
 make install-ingest-github
 make install-ingest-gitlab


@@ -1,3 +1,16 @@
+## 0.7.8-dev0
+
+### Enhancements
+
+* Adds recursive functionality to all fsspec connectors
+* Adds generic --recursive ingest flag
+
+### Features
+
+* Adds Google Cloud Storage connector
+
+### Fixes
+
 ## 0.7.7
 
 ### Enhancements


@@ -2,6 +2,7 @@ include requirements/base.in
 include requirements/huggingface.in
 include requirements/local-inference.in
 include requirements/ingest-s3.in
+include requirements/ingest-gcs.in
 include requirements/ingest-azure.in
 include requirements/ingest-discord.in
 include requirements/ingest-github.in


@@ -62,6 +62,10 @@ install-ingest-google-drive:
 install-ingest-s3:
     python3 -m pip install -r requirements/ingest-s3.txt
 
+.PHONY: install-ingest-gcs
+install-ingest-gcs:
+    python3 -m pip install -r requirements/ingest-gcs.txt
+
 .PHONY: install-ingest-azure
 install-ingest-azure:
     python3 -m pip install -r requirements/ingest-azure.txt
@@ -117,6 +121,7 @@ pip-compile:
     # sphinx docs looks for additional requirements
     cp requirements/build.txt docs/requirements.txt
     pip-compile --upgrade requirements/ingest-s3.in
+    pip-compile --upgrade requirements/ingest-gcs.in
     pip-compile --upgrade requirements/ingest-azure.in
     pip-compile --upgrade requirements/ingest-discord.in
     pip-compile --upgrade requirements/ingest-reddit.in


@@ -26,7 +26,7 @@ idna==3.4
     # via requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.6.0
+importlib-metadata==6.7.0
     # via sphinx
 jinja2==3.1.2
     # via sphinx

examples/ingest/azure/ingest.sh Normal file → Executable file

@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 # Processes all the files from abfs://container1/ in azureunstructured1 account,
 # using the `unstructured` library.
 # Structured outputs are stored in azure-ingest-output/


@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Processes several files in a nested folder structure from gs://utic-test-ingest-fixtures-public/
# through Unstructured's library in 2 processes.
# Structured outputs are stored in gcs-output/
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/../../.. || exit 1
PYTHONPATH=. ./unstructured/ingest/main.py \
--remote-url gs://utic-test-ingest-fixtures-public/ \
--structured-output-dir gcs-output \
--num-processes 2 \
--recursive \
--verbose
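
For readers who prefer to stay in Python, the same public fixtures can be mirrored and partitioned without going through the CLI. This is a sketch, not part of the commit; it assumes `unstructured` is installed with the `gcs` extra, and the local directory name is arbitrary:

    import gcsfs
    from pathlib import Path
    from unstructured.partition.auto import partition

    # The fixtures bucket is public, so gcsfs's anonymous token suffices.
    fs = gcsfs.GCSFileSystem(token="anon")

    # Mirror the bucket locally, descending into nested "folders" the way
    # the --recursive flag does.
    fs.get("utic-test-ingest-fixtures-public/", "gcs-download/", recursive=True)

    # Partition each downloaded file into structured elements.
    for path in Path("gcs-download").rglob("*"):
        if path.is_file():
            elements = partition(filename=str(path))
            print(path, "->", len(elements), "elements")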


@@ -28,7 +28,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
     --drive-service-account-key "<path to drive service account key>" \
     --structured-output-dir google-drive-ingest-output \
     --num-processes 2 \
-    --drive-recursive \
+    --recursive \
     --verbose \
 # --extension ".docx" # Ensures only .docx files are processed.

examples/ingest/local/ingest.sh Normal file → Executable file

@@ -20,7 +20,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
     --local-input-path example-docs \
     --structured-output-dir local-ingest-output \
     --num-processes 2 \
-    --local-recursive \
+    --recursive \
     --verbose \
 # Alternatively, you can call it using:


@@ -6,7 +6,7 @@
 #
 anyio==3.7.0
     # via httpcore
-argilla==1.9.0
+argilla==1.10.0
     # via -r requirements/base.in
 backoff==2.2.1
     # via argilla
@@ -51,7 +51,7 @@ idna==3.4
     #   anyio
     #   requests
     #   rfc3986
-importlib-metadata==6.6.0
+importlib-metadata==6.7.0
     # via markdown
 joblib==1.2.0
     # via nltk


@@ -26,7 +26,7 @@ idna==3.4
     # via requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.6.0
+importlib-metadata==6.7.0
     # via sphinx
 jinja2==3.1.2
     # via sphinx


@@ -75,7 +75,7 @@ idna==3.4
     #   -c requirements/test.txt
     #   anyio
     #   jsonschema
-importlib-metadata==6.6.0
+importlib-metadata==6.7.0
     # via
     #   -c requirements/base.txt
     #   jupyter-client
@@ -113,7 +113,7 @@ jinja2==3.1.2
     #   nbclassic
     #   nbconvert
     #   notebook
-jsonpointer==2.3
+jsonpointer==2.4
     # via jsonschema
 jsonschema[format-nongpl]==4.17.3
     # via
@@ -132,7 +132,7 @@ jupyter-client==8.2.0
     #   qtconsole
 jupyter-console==6.6.3
     # via jupyter
-jupyter-core==5.3.0
+jupyter-core==5.3.1
     # via
     #   -c requirements/constraints.in
     #   ipykernel
@@ -165,13 +165,13 @@ matplotlib-inline==0.1.6
     # via
     #   ipykernel
     #   ipython
-mistune==2.0.5
+mistune==3.0.1
     # via nbconvert
 nbclassic==1.0.0
     # via notebook
 nbclient==0.8.0
     # via nbconvert
-nbconvert==7.5.0
+nbconvert==7.6.0
     # via
     #   jupyter
     #   jupyter-server
@@ -219,7 +219,7 @@ pip-tools==6.13.0
     # via -r requirements/dev.in
 pkgutil-resolve-name==1.3.10
     # via jsonschema
-platformdirs==3.5.3
+platformdirs==3.6.0
     # via
     #   -c requirements/test.txt
     #   jupyter-core
@@ -359,7 +359,7 @@ typing-extensions==4.6.3
     #   ipython
 uri-template==1.2.0
     # via jsonschema
-virtualenv==20.23.0
+virtualenv==20.23.1
     # via pre-commit
 wcwidth==0.2.6
     # via prompt-toolkit
@@ -369,7 +369,7 @@ webencodings==0.5.1
     # via
     #   bleach
     #   tinycss2
-websocket-client==1.5.3
+websocket-client==1.6.0
     # via jupyter-server
 wheel==0.40.0
     # via


@@ -0,0 +1,4 @@
-c constraints.in
-c base.txt
gcsfs
fsspec

requirements/ingest-gcs.txt Normal file

@@ -0,0 +1,105 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-gcs.in
#
aiohttp==3.8.4
# via gcsfs
aiosignal==1.3.1
# via aiohttp
async-timeout==4.0.2
# via aiohttp
attrs==23.1.0
# via aiohttp
cachetools==5.3.1
# via google-auth
certifi==2023.5.7
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.1.0
# via
# -c requirements/base.txt
# aiohttp
# requests
decorator==5.1.1
# via gcsfs
frozenlist==1.3.3
# via
# aiohttp
# aiosignal
fsspec==2023.6.0
# via
# -r requirements/ingest-gcs.in
# gcsfs
gcsfs==2023.6.0
# via -r requirements/ingest-gcs.in
google-api-core==2.11.1
# via
# google-cloud-core
# google-cloud-storage
google-auth==2.20.0
# via
# gcsfs
# google-api-core
# google-auth-oauthlib
# google-cloud-core
# google-cloud-storage
google-auth-oauthlib==1.0.0
# via gcsfs
google-cloud-core==2.3.2
# via google-cloud-storage
google-cloud-storage==2.9.0
# via gcsfs
google-crc32c==1.5.0
# via google-resumable-media
google-resumable-media==2.5.0
# via google-cloud-storage
googleapis-common-protos==1.59.1
# via google-api-core
idna==3.4
# via
# -c requirements/base.txt
# requests
# yarl
multidict==6.0.4
# via
# aiohttp
# yarl
oauthlib==3.2.2
# via requests-oauthlib
protobuf==3.20.3
# via
# -c requirements/constraints.in
# google-api-core
pyasn1==0.5.0
# via
# pyasn1-modules
# rsa
pyasn1-modules==0.3.0
# via google-auth
requests==2.31.0
# via
# -c requirements/base.txt
# gcsfs
# google-api-core
# google-cloud-storage
# requests-oauthlib
requests-oauthlib==1.3.1
# via google-auth-oauthlib
rsa==4.9
# via google-auth
six==1.16.0
# via
# -c requirements/base.txt
# google-auth
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# google-auth
# requests
yarl==1.9.2
# via aiohttp


@@ -15,9 +15,9 @@ charset-normalizer==3.1.0
     # via
     #   -c requirements/base.txt
     #   requests
-google-api-core==2.11.0
+google-api-core==2.11.1
     # via google-api-python-client
-google-api-python-client==2.89.0
+google-api-python-client==2.90.0
     # via -r requirements/ingest-google-drive.in
 google-auth==2.20.0
     # via
@@ -47,7 +47,7 @@ pyasn1==0.5.0
     #   rsa
 pyasn1-modules==0.3.0
     # via google-auth
-pyparsing==3.0.9
+pyparsing==3.1.0
     # via httplib2
 requests==2.31.0
     # via


@@ -33,5 +33,5 @@ urllib3==1.26.16
     #   -c requirements/base.txt
     #   -c requirements/constraints.in
     #   requests
-websocket-client==1.5.3
+websocket-client==1.6.0
     # via praw


@@ -87,7 +87,7 @@ numpy==1.23.5
     #   transformers
 omegaconf==2.3.0
     # via effdet
-onnxruntime==1.15.0
+onnxruntime==1.15.1
     # via unstructured-inference
 opencv-python==4.7.0.72
     # via
@@ -136,7 +136,7 @@ pycparser==2.21
     # via
     #   -c requirements/base.txt
     #   cffi
-pyparsing==3.0.9
+pyparsing==3.1.0
     # via matplotlib
 pytesseract==0.3.10
     # via layoutparser


@@ -56,7 +56,7 @@ mccabe==0.7.0
     # via flake8
 multidict==6.0.4
     # via yarl
-mypy==1.3.0
+mypy==1.4.0
     # via -r requirements/test.in
 mypy-extensions==1.0.0
     # via
@@ -69,7 +69,7 @@ packaging==23.1
     #   pytest
 pathspec==0.11.1
     # via black
-platformdirs==3.5.3
+platformdirs==3.6.0
     # via black
 pluggy==1.0.0
     # via pytest
@@ -87,7 +87,7 @@ pytest==7.3.2
     #   pytest-mock
 pytest-cov==4.1.0
     # via -r requirements/test.in
-pytest-mock==3.10.0
+pytest-mock==3.11.1
     # via -r requirements/test.in
 python-dateutil==2.8.2
     # via
@@ -99,7 +99,7 @@ requests==2.31.0
     # via
     #   -c requirements/base.txt
     #   label-studio-sdk
-ruff==0.0.272
+ruff==0.0.273
     # via -r requirements/test.in
 six==1.16.0
     # via


@@ -81,6 +81,7 @@ setup(
         "slack": load_requirements("requirements/ingest-slack.in"),
         "wikipedia": load_requirements("requirements/ingest-wikipedia.in"),
         "google-drive": load_requirements("requirements/ingest-google-drive.in"),
+        "gcs": load_requirements("requirements/ingest-gcs.in"),
     },
     package_dir={"unstructured": "unstructured"},
     package_data={"unstructured": ["nlp/*.txt"]},
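
With the extra registered here, the connector's dependencies can also be installed straight from pip via `pip install "unstructured[gcs]"`; that is the exact command the ImportError raised by `requires_dependencies` (see the utils.py hunk at the end of this diff) points users to, while `make install-ingest-gcs` is the equivalent for a source checkout.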


@@ -0,0 +1,12 @@
[
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]


@@ -0,0 +1,56 @@
[
{
"type": "NarrativeText",
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "This is a test document to use for unit tests."
},
{
"type": "Address",
"element_id": "a9d4657034aa3fdb5177f1325e912362",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Doylestown, PA 18901"
},
{
"type": "Title",
"element_id": "9c218520320f238595f1fde74bdd137d",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Important points:"
},
{
"type": "ListItem",
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Hamburgers are delicious"
},
{
"type": "ListItem",
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Dogs are the best"
},
{
"type": "ListItem",
"element_id": "0b61e826b1c4ab05750184da72b89f83",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "I love fuzzy blankets"
}
]


@@ -0,0 +1,12 @@
[
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]


@@ -0,0 +1,56 @@
[
{
"type": "NarrativeText",
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "This is a test document to use for unit tests."
},
{
"type": "Address",
"element_id": "a9d4657034aa3fdb5177f1325e912362",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Doylestown, PA 18901"
},
{
"type": "Title",
"element_id": "9c218520320f238595f1fde74bdd137d",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Important points:"
},
{
"type": "ListItem",
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Hamburgers are delicious"
},
{
"type": "ListItem",
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "Dogs are the best"
},
{
"type": "ListItem",
"element_id": "0b61e826b1c4ab05750184da72b89f83",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "I love fuzzy blankets"
}
]


@@ -0,0 +1,12 @@
[
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]


@@ -0,0 +1,26 @@
[
{
"type": "Table",
"element_id": "f8db6c6e535705336195aa2c1d23d414",
"metadata": {
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n \n \n Team\n Location\n Stanley Cups\n \n \n Blues\n STL\n 1\n \n \n Flyers\n PHI\n 2\n \n \n Maple Leafs\n TOR\n 13\n \n \n"
},
{
"type": "Table",
"element_id": "20f5163a43ac6eb04a40d269d3ad0663",
"metadata": {
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n \n \n Team\n Location\n Stanley Cups\n \n \n Blues\n STL\n 1\n \n \n Flyers\n PHI\n 2\n \n \n Maple Leafs\n TOR\n 0\n \n \n"
}
]


@@ -0,0 +1,58 @@
#!/usr/bin/env bash
set -e
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/.. || exit 1
if [[ "$(find test_unstructured_ingest/expected-structured-output/gcs/ -type f -size +1 | wc -l)" -ne 6 ]]; then
echo "The test fixtures in test_unstructured_ingest/expected-structured-output/ look suspicious. At least one of the files is too small."
echo "Did you overwrite test fixtures with bad outputs?"
exit 1
fi
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
exit 0
fi
# Create a temporary file
GCP_INGEST_SERVICE_KEY_FILE=$(mktemp)
echo "$GCP_INGEST_SERVICE_KEY" > "$GCP_INGEST_SERVICE_KEY_FILE"
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
--remote-url gs://utic-test-ingest-fixtures/ \
--structured-output-dir gcs-output \
--gcs-token "$GCP_INGEST_SERVICE_KEY_FILE" \
--recursive \
--preserve-downloads \
--reprocess
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
set +e
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
EXPECTED_DIR=test_unstructured_ingest/expected-structured-output/gcs
[ -d "$EXPECTED_DIR" ] && rm -rf "$EXPECTED_DIR"
cp -R gcs-output $EXPECTED_DIR
elif ! diff -ru test_unstructured_ingest/expected-structured-output/gcs gcs-output ; then
echo
echo "There are differences from the previously checked-in structured outputs."
echo
echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
echo
echo " export OVERWRITE_FIXTURES=true"
echo
echo "and then rerun this script."
echo
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
echo "to update fixtures for CI,"
echo
exit 1
fi


@@ -10,7 +10,7 @@ cd "$SCRIPT_DIR"/.. || exit 1
 PYTHONPATH=. ./unstructured/ingest/main.py \
     --metadata-exclude filename,file_directory,metadata.data_source.date_processed \
     --local-input-path files-ingest-download \
-    --local-recursive \
+    --recursive \
     --local-file-glob "*.pdf" \
     --structured-output-dir pdf-fast-reprocess-ingest-output \
     --partition-strategy fast \


@@ -20,5 +20,6 @@ export OMP_THREAD_LIMIT=1
 ./test_unstructured_ingest/test-ingest-local.sh
 ./test_unstructured_ingest/test-ingest-slack.sh
 ./test_unstructured_ingest/test-ingest-against-api.sh
+./test_unstructured_ingest/test-ingest-gcs.sh
 # NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
 ./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh


@@ -1 +1 @@
-__version__ = "0.7.7"  # pragma: no cover
+__version__ = "0.7.8-dev0"  # pragma: no cover


@@ -15,8 +15,11 @@ from unstructured.ingest.logger import logger
 SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
     "s3",
+    "s3a",
     "abfs",
     "az",
+    "gs",
+    "gcs",
 ]
@@ -24,6 +27,7 @@ SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
 class SimpleFsspecConfig(BaseConnectorConfig):
     # fsspec specific options
     path: str
+    recursive: bool
     access_kwargs: dict = field(default_factory=dict)
     protocol: str = field(init=False)
     path_without_protocol: str = field(init=False)
@@ -81,7 +85,9 @@ class FsspecIngestDoc(BaseIngestDoc):
     def has_output(self):
         """Determine if structured output for this doc already exists."""
-        return self._output_filename().is_file() and os.path.getsize(self._output_filename())
+        return self._output_filename().is_file() and os.path.getsize(
+            self._output_filename(),
+        )
 
     def _create_full_tmp_dir_path(self):
         """Includes "directories" in the object path"""
@@ -114,19 +120,24 @@ class FsspecIngestDoc(BaseIngestDoc):
         output_filename = self._output_filename()
         output_filename.parent.mkdir(parents=True, exist_ok=True)
         with open(output_filename, "w") as output_f:
-            output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2))
+            output_f.write(
+                json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2),
+            )
         logger.info(f"Wrote {output_filename}")
 
     @property
     def filename(self):
-        """The filename of the file after downloading from s3"""
+        """The filename of the file after downloading from cloud"""
         return self._tmp_download_file()
 
     def cleanup_file(self):
         """Removes the local copy of the file after successful processing."""
         if not self.standard_config.preserve_downloads and not self.standard_config.download_only:
             logger.debug(f"Cleaning up {self}")
-            os.unlink(self._tmp_download_file())
+            try:
+                os.unlink(self._tmp_download_file())
+            except OSError as e:  # Don't think we need to raise an exception
+                logger.debug(f"Failed to remove {self._tmp_download_file()} due to {e}")
 
 class FsspecConnector(BaseConnector):
@@ -151,7 +162,7 @@ class FsspecConnector(BaseConnector):
     )
 
     def cleanup(self, cur_dir=None):
-        """cleanup lingering empty sub-dirs from s3 paths, but leave remaining files
+        """cleanup lingering empty sub-dirs from cloud paths, but leave remaining files
         (and their paths) intact as that indicates they were not processed"""
         if not self.cleanup_files:
             return
@@ -177,7 +188,26 @@ class FsspecConnector(BaseConnector):
     )
 
     def _list_files(self):
-        return self.fs.ls(self.config.path_without_protocol)
+        if not self.config.recursive:
+            # fs.ls does not walk directories
+            # directories that are listed in cloud storage can cause problems
+            # because they are seen as 0 byte files
+            return [
+                x.get("name")
+                for x in self.fs.ls(self.config.path_without_protocol, detail=True)
+                if x.get("size") > 0
+            ]
+        else:
+            # fs.find will recursively walk directories
+            # "size" is a common key for all the cloud protocols with fs
+            return [
+                k
+                for k, v in self.fs.find(
+                    self.config.path_without_protocol,
+                    detail=True,
+                ).items()
+                if v.get("size") > 0
+            ]
 
     def get_ingest_docs(self):
         return [
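
The ls/find split above is easy to check against any fsspec backend, since both methods share their semantics across implementations. A small sketch using the local filesystem (the path is a placeholder):

    import fsspec

    fs = fsspec.filesystem("file")  # same API shape as gcsfs, s3fs, adlfs

    # Shallow listing: one info dict per entry. Cloud backends report
    # pseudo-directories as 0-byte entries, hence the size filter above.
    for entry in fs.ls("/tmp", detail=True):
        print(entry["name"], entry["type"], entry["size"])

    # Recursive listing: a mapping of path -> info dict for every file
    # under the prefix, which is what --recursive switches on.
    for path, info in fs.find("/tmp", detail=True).items():
        print(path, info["size"])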


@@ -0,0 +1,33 @@
from dataclasses import dataclass
from typing import Type

from unstructured.ingest.connector.fsspec import (
    FsspecConnector,
    FsspecIngestDoc,
    SimpleFsspecConfig,
)
from unstructured.ingest.interfaces import StandardConnectorConfig
from unstructured.utils import requires_dependencies


@dataclass
class SimpleGcsConfig(SimpleFsspecConfig):
    pass


class GcsIngestDoc(FsspecIngestDoc):
    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    def get_file(self):
        super().get_file()


@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
class GcsConnector(FsspecConnector):
    ingest_doc_cls: Type[GcsIngestDoc] = GcsIngestDoc

    def __init__(
        self,
        config: SimpleGcsConfig,
        standard_config: StandardConnectorConfig,
    ) -> None:
        super().__init__(standard_config, config)
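
For orientation, this is roughly how the CLI wires the class up (mirroring the `main.py` hunk further down; the construction of `standard_config` is elided in this diff, so treat it as given):

    from unstructured.ingest.connector.gcs import GcsConnector, SimpleGcsConfig

    connector = GcsConnector(
        standard_config=standard_config,  # a StandardConnectorConfig built by the CLI
        config=SimpleGcsConfig(
            path="gs://utic-test-ingest-fixtures-public/",
            recursive=True,
            access_kwargs={"token": None},  # None lets gcsfs fall back to default creds
        ),
    )
    docs = connector.get_ingest_docs()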


@@ -161,12 +161,6 @@ class MainProcess:
     default=None,
     help="Path to the location in the local file system that will be processed.",
 )
-@click.option(
-    "--local-recursive",
-    is_flag=True,
-    default=False,
-    help="Support recursive local file processing.",
-)
 @click.option(
     "--local-file-glob",
     default=None,
@@ -177,7 +171,14 @@ class MainProcess:
     "--remote-url",
     default=None,
     help="Remote fsspec URL formatted as `protocol://dir/path`, it can contain both "
-    "a directory or a single file. Supported protocols are: `s3`, `s3a`, `abfs`, and `az`.",
+    "a directory or a single file. Supported protocols are: `gcs`, `gs`, `s3`, `s3a`, `abfs` "
+    "and `az`.",
 )
+@click.option(
+    "--gcs-token",
+    default=None,
+    help="Token used to access Google Cloud. GCSFS will attempt to use your default gcloud creds, "
+    "or get creds from the google metadata service, or fall back to anonymous access.",
+)
 @click.option(
     "--s3-anonymous",
@@ -211,13 +212,6 @@ class MainProcess:
     default=None,
     help="Path to the Google Drive service account json file.",
 )
-@click.option(
-    "--drive-recursive",
-    is_flag=True,
-    default=False,
-    help="Recursively download files in folders from the Google Drive ID, "
-    "otherwise stop at the files in provided folder level.",
-)
 @click.option(
     "--drive-extension",
     default=None,
@@ -412,17 +406,26 @@ class MainProcess:
     show_default=True,
     help="Number of parallel processes to process docs in.",
 )
+@click.option(
+    "--recursive",
+    is_flag=True,
+    default=False,
+    help="Recursively download files in their respective folders, "
+    "otherwise stop at the files in the provided folder level. "
+    "Supported protocols are: `gcs`, `gs`, `s3`, `s3a`, `abfs`, "
+    "`az`, `google drive` and `local`.",
+)
 @click.option("-v", "--verbose", is_flag=True, default=False)
 def main(
     ctx,
     remote_url,
     s3_anonymous,
+    gcs_token,
     azure_account_name,
     azure_account_key,
     azure_connection_string,
     drive_id,
     drive_service_account_key,
-    drive_recursive,
     drive_extension,
     biomed_path,
     biomed_api_id,
@@ -457,6 +460,7 @@ def main(
     structured_output_dir,
     reprocess,
     num_processes,
+    recursive,
     verbose,
     metadata_include,
     metadata_exclude,
@@ -468,7 +472,6 @@ def main(
     partition_strategy,
     api_key,
     local_input_path,
-    local_recursive,
     local_file_glob,
     download_only,
 ):
@@ -580,9 +583,21 @@ def main(
             standard_config=standard_config,
             config=SimpleS3Config(
                 path=remote_url,
+                recursive=recursive,
                 access_kwargs={"anon": s3_anonymous},
             ),
         )
+    elif protocol in ("gs", "gcs"):
+        from unstructured.ingest.connector.gcs import GcsConnector, SimpleGcsConfig
+
+        doc_connector = GcsConnector(  # type: ignore
+            standard_config=standard_config,
+            config=SimpleGcsConfig(
+                path=remote_url,
+                recursive=recursive,
+                access_kwargs={"token": gcs_token},
+            ),
+        )
     elif protocol in ("abfs", "az"):
         from unstructured.ingest.connector.azure import (
             AzureBlobStorageConnector,
@@ -602,14 +617,15 @@ def main(
             standard_config=standard_config,
             config=SimpleAzureBlobStorageConfig(
                 path=remote_url,
+                recursive=recursive,
                 access_kwargs=access_kwargs,
             ),
         )
     else:
         warnings.warn(
             f"`fsspec` protocol {protocol} is not directly supported by `unstructured`,"
-            " so use it at your own risk. Supported protocols are `s3`, `s3a`, `abfs`,"
-            " and `az`.",
+            " so use it at your own risk. Supported protocols are `gcs`, `gs`, `s3`, `s3a`,"
+            " `abfs` and `az`.",
             UserWarning,
         )
@@ -622,6 +638,7 @@ def main(
             standard_config=standard_config,
             config=SimpleFsspecConfig(
                 path=remote_url,
+                recursive=recursive,
             ),
         )
     elif github_url:
@@ -721,7 +738,7 @@ def main(
         config=SimpleGoogleDriveConfig(
             drive_id=drive_id,
             service_account_key=drive_service_account_key,
-            recursive=drive_recursive,
+            recursive=recursive,
             extension=drive_extension,
         ),
     )
@@ -753,7 +770,7 @@ def main(
         standard_config=standard_config,
         config=SimpleLocalConfig(
             input_path=local_input_path,
-            recursive=local_recursive,
+            recursive=recursive,
             file_glob=local_file_glob,
         ),
     )


@@ -35,7 +35,7 @@ def requires_dependencies(
         raise ImportError(
             f"Following dependencies are missing: {', '.join(missing_deps)}. "
             + (
-                f"Please install them using `pip install unstructured[{extras}]`."
+                f"""Please install them using `pip install "unstructured[{extras}]"`."""
                 if extras
                 else f"Please install them using `pip install {' '.join(missing_deps)}`."
             ),
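
The decorator's behavior is easiest to see on a throwaway function (a sketch; `list_bucket` is made up, and the dependency check fires when the wrapped callable is invoked, matching how gcs.py uses it above):

    from unstructured.utils import requires_dependencies

    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    def list_bucket(path: str):
        import gcsfs  # safe here: the decorator verified the dependency first

        return gcsfs.GCSFileSystem().ls(path)

    # Without gcsfs installed, calling list_bucket(...) raises roughly:
    #   ImportError: Following dependencies are missing: gcsfs.
    #   Please install them using `pip install "unstructured[gcs]"`.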