diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index dce156520..1b7fc8f80 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -101,6 +101,8 @@ jobs:
         source .venv/bin/activate
         make install-ci
     - name: Test
+      env:
+        GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
       run: |
         source .venv/bin/activate
         make install-detectron2
diff --git a/test_unstructured_ingest/test-ingest-github.sh b/test_unstructured_ingest/test-ingest-github.sh
index c90fe1a7e..3488537dd 100755
--- a/test_unstructured_ingest/test-ingest-github.sh
+++ b/test_unstructured_ingest/test-ingest-github.sh
@@ -3,7 +3,18 @@
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 cd "$SCRIPT_DIR"/.. || exit 1
 
-if [[ "$CI" == "true" ]]; then
+GH_READ_ONLY_ACCESS_TOKEN=${GH_READ_ONLY_ACCESS_TOKEN:-none}
+
+# Forward the token to the ingest CLI only when one was provided (an unset or
+# empty variable is mapped to "none" above). The flags are kept in an array so
+# the expansion can stay quoted and the token value is never word-split.
+ACCESS_TOKEN_FLAGS=()
+if [[ "$GH_READ_ONLY_ACCESS_TOKEN" != "none" ]]; then
+    ACCESS_TOKEN_FLAGS=(--git-access-token "$GH_READ_ONLY_ACCESS_TOKEN")
+elif [[ "$CI" == "true" ]]; then
+    echo "Warning: GH_READ_ONLY_ACCESS_TOKEN is not defined in the CI environment."
+    echo "This can lead to intermittent failures in test-ingest-github.sh, as non-auth'ed"
+    echo "requests are severely rate limited by GitHub."
+    echo
     if [ "$(( RANDOM % 10))" -lt 1 ] ; then
         # NOTE(crag): proper fix is being tracked here: https://github.com/Unstructured-IO/unstructured/issues/306
         echo "Skipping ingest 90% of github ingest tests to avoid rate limiting issue."
@@ -19,6 +30,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
     --structured-output-dir github-downloadify-output \
     --reprocess \
     --preserve-downloads \
-    --verbose
+    --verbose "${ACCESS_TOKEN_FLAGS[@]}"
 
+# to update test fixtures, "export OVERWRITE_FIXTURES=true" and rerun this script
 OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}