From b951d73a9bf783060b0e009f78f17c74abeca9d0 Mon Sep 17 00:00:00 2001 From: Roman Isecke <136338424+rbiseck3@users.noreply.github.com> Date: Wed, 29 Nov 2023 08:41:19 -0500 Subject: [PATCH] feat: add logging to ingest CLI for tests being skipped at the end (#2174) ### Description Tests are often skipped, either due to missing env vars or because they are explicitly defined in the base script, but these skips get lost in the logs. This PR updates the test scripts to exit with a custom error code (8) when they are skipped due to missing env vars. The base script catches this error code and logs every skipped test to a file. At the end of the script, the contents of this file are printed in the CI output. --- .gitignore | 1 + .../dest/azure-cognitive-search.sh | 2 +- test_unstructured_ingest/dest/azure.sh | 2 +- test_unstructured_ingest/dest/dropbox.sh | 2 +- test_unstructured_ingest/dest/gcs.sh | 2 +- test_unstructured_ingest/dest/mongodb.sh | 2 +- .../dest/sharepoint-embed-cog-index.sh | 8 +- test_unstructured_ingest/src/against-api.sh | 2 +- test_unstructured_ingest/src/airtable-diff.sh | 2 +- .../src/airtable-large.sh | 2 +- test_unstructured_ingest/src/box.sh | 2 +- .../src/confluence-diff.sh | 2 +- .../src/confluence-large.sh | 2 +- test_unstructured_ingest/src/delta-table.sh | 2 +- test_unstructured_ingest/src/discord.sh | 2 +- test_unstructured_ingest/src/dropbox.sh | 2 +- test_unstructured_ingest/src/gcs.sh | 2 +- test_unstructured_ingest/src/google-drive.sh | 2 +- test_unstructured_ingest/src/hubspot.sh | 2 +- test_unstructured_ingest/src/jira.sh | 2 +- test_unstructured_ingest/src/notion.sh | 2 +- test_unstructured_ingest/src/onedrive.sh | 2 +- test_unstructured_ingest/src/outlook.sh | 2 +- test_unstructured_ingest/src/salesforce.sh | 4 +- .../src/sharepoint-with-permissions.sh | 4 +- test_unstructured_ingest/src/sharepoint.sh | 2 +- test_unstructured_ingest/src/slack.sh | 2 +- test_unstructured_ingest/test-ingest-dest.sh | 32 ++--- 
test_unstructured_ingest/test-ingest-src.sh | 110 ++++++++++-------- 29 files changed, 111 insertions(+), 94 deletions(-) diff --git a/.gitignore b/.gitignore index 008ba60ad..8cf3a69c1 100644 --- a/.gitignore +++ b/.gitignore @@ -139,6 +139,7 @@ dmypy.json /structured-output test_unstructured_ingest/workdir/ test_unstructured_ingest/delta-table-dest/ +test_unstructured_ingest/skipped-files.txt # suggested ingest mirror directory /mirror diff --git a/test_unstructured_ingest/dest/azure-cognitive-search.sh b/test_unstructured_ingest/dest/azure-cognitive-search.sh index bafbfa789..c3f12e41a 100755 --- a/test_unstructured_ingest/dest/azure-cognitive-search.sh +++ b/test_unstructured_ingest/dest/azure-cognitive-search.sh @@ -18,7 +18,7 @@ API_VERSION=2023-07-01-Preview if [ -z "$AZURE_SEARCH_ENDPOINT" ] && [ -z "$AZURE_SEARCH_API_KEY" ]; then echo "Skipping Azure Cognitive Search ingest test because neither AZURE_SEARCH_ENDPOINT nor AZURE_SEARCH_API_KEY env vars are set." - exit 0 + exit 8 fi # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/dest/azure.sh b/test_unstructured_ingest/dest/azure.sh index c2e3f7e59..74958eba2 100755 --- a/test_unstructured_ingest/dest/azure.sh +++ b/test_unstructured_ingest/dest/azure.sh @@ -13,7 +13,7 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} if [ -z "$AZURE_DEST_CONNECTION_STR" ]; then echo "Skipping Azure destination ingest test because the AZURE_DEST_CONNECTION_STR env var is not set." 
- exit 0 + exit 8 fi CONTAINER=utic-ingest-test-fixtures-output diff --git a/test_unstructured_ingest/dest/dropbox.sh b/test_unstructured_ingest/dest/dropbox.sh index 6f15285c7..58148e8cb 100755 --- a/test_unstructured_ingest/dest/dropbox.sh +++ b/test_unstructured_ingest/dest/dropbox.sh @@ -16,7 +16,7 @@ CI=${CI:-"false"} if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then echo "Skipping Dropbox ingest test because one or more of these env vars is not set:" echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN" - exit 0 + exit 8 fi # Get a new access token from Dropbox diff --git a/test_unstructured_ingest/dest/gcs.sh b/test_unstructured_ingest/dest/gcs.sh index f399257c8..169d556a8 100755 --- a/test_unstructured_ingest/dest/gcs.sh +++ b/test_unstructured_ingest/dest/gcs.sh @@ -17,7 +17,7 @@ CI=${CI:-"false"} if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." - exit 0 + exit 8 fi # Create temporary service key file diff --git a/test_unstructured_ingest/dest/mongodb.sh b/test_unstructured_ingest/dest/mongodb.sh index 51bc6d90e..37b343a4a 100755 --- a/test_unstructured_ingest/dest/mongodb.sh +++ b/test_unstructured_ingest/dest/mongodb.sh @@ -15,7 +15,7 @@ CI=${CI:-"false"} if [ -z "$MONGODB_URI" ] && [ -z "$MONGODB_DATABASE_NAME" ]; then echo "Skipping MongoDB destination ingest test because the MONGODB_URI and MONGODB_DATABASE_NAME env var are not set." 
- exit 0 + exit 8 fi diff --git a/test_unstructured_ingest/dest/sharepoint-embed-cog-index.sh b/test_unstructured_ingest/dest/sharepoint-embed-cog-index.sh index 56e426231..d8853adfb 100755 --- a/test_unstructured_ingest/dest/sharepoint-embed-cog-index.sh +++ b/test_unstructured_ingest/dest/sharepoint-embed-cog-index.sh @@ -18,22 +18,22 @@ CI=${CI:-"false"} if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ] ; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." - exit 0 + exit 8 fi if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set." - exit 0 + exit 8 fi if [ -z "$OPENAI_API_KEY" ]; then echo "Skipping Sharepoint embedding ingest test because the OPENAI_API_KEY env var is not set." - exit 0 + exit 8 fi if [ -z "$AZURE_SEARCH_ENDPOINT" ] && [ -z "$AZURE_SEARCH_API_KEY" ]; then echo "Skipping Sharepoint Azure Cognitive Search ingest test because neither AZURE_SEARCH_ENDPOINT nor AZURE_SEARCH_API_KEY env vars are set." - exit 0 + exit 8 fi # shellcheck disable=SC1091 diff --git a/test_unstructured_ingest/src/against-api.sh b/test_unstructured_ingest/src/against-api.sh index 35a47be66..9ce5c1eb9 100755 --- a/test_unstructured_ingest/src/against-api.sh +++ b/test_unstructured_ingest/src/against-api.sh @@ -4,7 +4,7 @@ set -e if [ -z "$UNS_API_KEY" ]; then echo "Skipping ingest test against api because the UNS_API_KEY env var is not set." 
- exit 0 + exit 8 fi SRC_PATH=$(dirname "$(realpath "$0")") SCRIPT_DIR=$(dirname "$SRC_PATH") diff --git a/test_unstructured_ingest/src/airtable-diff.sh b/test_unstructured_ingest/src/airtable-diff.sh index 1e0f9c267..f6c5ac38b 100755 --- a/test_unstructured_ingest/src/airtable-diff.sh +++ b/test_unstructured_ingest/src/airtable-diff.sh @@ -32,7 +32,7 @@ VARIED_DATA_BASE_ID_2="appJ43QmP8I17zu88" if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set." - exit 0 + exit 8 fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} diff --git a/test_unstructured_ingest/src/airtable-large.sh b/test_unstructured_ingest/src/airtable-large.sh index f4dd51034..d06ec37da 100755 --- a/test_unstructured_ingest/src/airtable-large.sh +++ b/test_unstructured_ingest/src/airtable-large.sh @@ -30,7 +30,7 @@ trap cleanup EXIT if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set." - exit 0 + exit 8 fi # Provides component IDs such as LARGE_TEST_LIST_OF_PATHS, diff --git a/test_unstructured_ingest/src/box.sh b/test_unstructured_ingest/src/box.sh index 7efb02d5f..1fad83add 100755 --- a/test_unstructured_ingest/src/box.sh +++ b/test_unstructured_ingest/src/box.sh @@ -29,7 +29,7 @@ trap cleanup EXIT if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set." 
- exit 0 + exit 8 fi if [ -z "$BOX_APP_CONFIG_PATH" ]; then diff --git a/test_unstructured_ingest/src/confluence-diff.sh b/test_unstructured_ingest/src/confluence-diff.sh index 066bde029..9034be52d 100755 --- a/test_unstructured_ingest/src/confluence-diff.sh +++ b/test_unstructured_ingest/src/confluence-diff.sh @@ -28,7 +28,7 @@ trap cleanup EXIT if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set." - exit 0 + exit 8 fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} diff --git a/test_unstructured_ingest/src/confluence-large.sh b/test_unstructured_ingest/src/confluence-large.sh index ec2c9c7c6..b66c3d3ee 100755 --- a/test_unstructured_ingest/src/confluence-large.sh +++ b/test_unstructured_ingest/src/confluence-large.sh @@ -30,7 +30,7 @@ trap cleanup EXIT if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set." - exit 0 + exit 8 fi # The test checks the scenario where --confluence-list-of-spaces and --confluence-num-of-spaces diff --git a/test_unstructured_ingest/src/delta-table.sh b/test_unstructured_ingest/src/delta-table.sh index e6286228e..5ddd0a71f 100755 --- a/test_unstructured_ingest/src/delta-table.sh +++ b/test_unstructured_ingest/src/delta-table.sh @@ -15,7 +15,7 @@ CI=${CI:-"false"} if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then echo "Skipping Delta Table ingest test because either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY env var was not set." 
- exit 0 + exit 8 fi # shellcheck disable=SC1091 diff --git a/test_unstructured_ingest/src/discord.sh b/test_unstructured_ingest/src/discord.sh index 34587a740..015deb993 100755 --- a/test_unstructured_ingest/src/discord.sh +++ b/test_unstructured_ingest/src/discord.sh @@ -26,7 +26,7 @@ trap cleanup EXIT if [ -z "$DISCORD_TOKEN" ]; then echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set." - exit 0 + exit 8 fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} diff --git a/test_unstructured_ingest/src/dropbox.sh b/test_unstructured_ingest/src/dropbox.sh index 45583a226..17127dc28 100755 --- a/test_unstructured_ingest/src/dropbox.sh +++ b/test_unstructured_ingest/src/dropbox.sh @@ -27,7 +27,7 @@ trap cleanup EXIT if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then echo "Skipping Dropbox ingest test because one or more of these env vars is not set:" echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN" - exit 0 + exit 8 fi # Get a new access token from Dropbox diff --git a/test_unstructured_ingest/src/gcs.sh b/test_unstructured_ingest/src/gcs.sh index 51f3fbf72..de02fd651 100755 --- a/test_unstructured_ingest/src/gcs.sh +++ b/test_unstructured_ingest/src/gcs.sh @@ -27,7 +27,7 @@ trap cleanup EXIT if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." - exit 0 + exit 8 fi # Create temporary service key file diff --git a/test_unstructured_ingest/src/google-drive.sh b/test_unstructured_ingest/src/google-drive.sh index 74068bd35..5f78182db 100755 --- a/test_unstructured_ingest/src/google-drive.sh +++ b/test_unstructured_ingest/src/google-drive.sh @@ -28,7 +28,7 @@ trap cleanup EXIT if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." 
echo "The Google Drive test content can be found at https://drive.google.com/drive/folders/1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr" - exit 0 + exit 8 fi # Create temporary service key file diff --git a/test_unstructured_ingest/src/hubspot.sh b/test_unstructured_ingest/src/hubspot.sh index 65bdee070..5b456a0e9 100755 --- a/test_unstructured_ingest/src/hubspot.sh +++ b/test_unstructured_ingest/src/hubspot.sh @@ -26,7 +26,7 @@ trap cleanup EXIT if [ -z "$HUBSPOT_API_TOKEN" ]; then echo "Skipping HubSpot ingest test because the HUBSPOT_API_TOKEN env var is not set." - exit 0 + exit 8 fi # Required arguments: diff --git a/test_unstructured_ingest/src/jira.sh b/test_unstructured_ingest/src/jira.sh index bc1dd2eb5..112fd5fb0 100755 --- a/test_unstructured_ingest/src/jira.sh +++ b/test_unstructured_ingest/src/jira.sh @@ -27,7 +27,7 @@ trap cleanup EXIT if [ -z "$JIRA_INGEST_USER_EMAIL" ] || [ -z "$JIRA_INGEST_API_TOKEN" ]; then echo "Skipping Jira ingest test because the JIRA_INGEST_USER_EMAIL or JIRA_INGEST_API_TOKEN env var is not set." - exit 0 + exit 8 fi # Required arguments: diff --git a/test_unstructured_ingest/src/notion.sh b/test_unstructured_ingest/src/notion.sh index 3c047b9a6..99bb2b207 100755 --- a/test_unstructured_ingest/src/notion.sh +++ b/test_unstructured_ingest/src/notion.sh @@ -26,7 +26,7 @@ trap cleanup EXIT if [ -z "$NOTION_API_KEY" ]; then echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set." - exit 0 + exit 8 fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} diff --git a/test_unstructured_ingest/src/onedrive.sh b/test_unstructured_ingest/src/onedrive.sh index 76ea58854..fec688dcb 100755 --- a/test_unstructured_ingest/src/onedrive.sh +++ b/test_unstructured_ingest/src/onedrive.sh @@ -26,7 +26,7 @@ trap cleanup EXIT if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_USER_PNAME" ]; then echo "Skipping OneDrive ingest test because the MS_CLIENT_ID, MS_CLIENT_CRED, MS_USER_PNAME env var is not set." 
- exit 0 + exit 8 fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} diff --git a/test_unstructured_ingest/src/outlook.sh b/test_unstructured_ingest/src/outlook.sh index 75443f5c7..b646c2d3f 100755 --- a/test_unstructured_ingest/src/outlook.sh +++ b/test_unstructured_ingest/src/outlook.sh @@ -26,7 +26,7 @@ trap cleanup EXIT if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set." - exit 0 + exit 8 fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} diff --git a/test_unstructured_ingest/src/salesforce.sh b/test_unstructured_ingest/src/salesforce.sh index 64d82355b..faf2dd761 100755 --- a/test_unstructured_ingest/src/salesforce.sh +++ b/test_unstructured_ingest/src/salesforce.sh @@ -29,12 +29,12 @@ trap cleanup EXIT if [ -z "$SALESFORCE_USERNAME" ] || [ -z "$SALESFORCE_CONSUMER_KEY" ]; then echo "Skipping Salesforce ingest test because SALESFORCE_USERNAME and SALESFORCE_CONSUMER_KEY env vars not set" - exit 0 + exit 8 fi if [ -z "$SALESFORCE_PRIVATE_KEY" ] && [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then echo "Skipping Salesforce ingest test because neither SALESFORCE_PRIVATE_KEY nor SALESFORCE_PRIVATE_KEY_PATH env vars are set." - exit 0 + exit 8 fi if [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then diff --git a/test_unstructured_ingest/src/sharepoint-with-permissions.sh b/test_unstructured_ingest/src/sharepoint-with-permissions.sh index 29f95e1cb..80f30cb10 100755 --- a/test_unstructured_ingest/src/sharepoint-with-permissions.sh +++ b/test_unstructured_ingest/src/sharepoint-with-permissions.sh @@ -26,12 +26,12 @@ trap cleanup EXIT if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." 
- exit 0 + exit 8 fi if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set." - exit 0 + exit 8 fi # excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly diff --git a/test_unstructured_ingest/src/sharepoint.sh b/test_unstructured_ingest/src/sharepoint.sh index 5703a0366..678f2b172 100755 --- a/test_unstructured_ingest/src/sharepoint.sh +++ b/test_unstructured_ingest/src/sharepoint.sh @@ -27,7 +27,7 @@ trap cleanup EXIT if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." - exit 0 + exit 8 fi # excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly diff --git a/test_unstructured_ingest/src/slack.sh b/test_unstructured_ingest/src/slack.sh index c9e113f2b..620184b1b 100755 --- a/test_unstructured_ingest/src/slack.sh +++ b/test_unstructured_ingest/src/slack.sh @@ -26,7 +26,7 @@ trap cleanup EXIT if [ -z "$SLACK_TOKEN" ]; then echo "Skipping Slack ingest test because the SLACK_TOKEN env var is not set." 
- exit 0 + exit 8 fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} diff --git a/test_unstructured_ingest/test-ingest-dest.sh b/test_unstructured_ingest/test-ingest-dest.sh index 478f5428c..54ae356dc 100755 --- a/test_unstructured_ingest/test-ingest-dest.sh +++ b/test_unstructured_ingest/test-ingest-dest.sh @@ -1,8 +1,14 @@ #!/usr/bin/env bash -set -eu -o pipefail +set -u -o pipefail SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SKIPPED_FILES_LOG=$SCRIPT_DIR/skipped-files.txt +# If the file already exists, reset it +if [ -f "$SKIPPED_FILES_LOG" ]; then + rm "$SKIPPED_FILES_LOG" + touch "$SKIPPED_FILES_LOG" +fi cd "$SCRIPT_DIR"/.. || exit 1 # NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs @@ -52,17 +58,17 @@ for test in "${all_tests[@]}"; do echo "--------- SKIPPING SCRIPT $test ---------" continue fi - if [[ "${tests_to_ignore[*]}" =~ $test ]]; then - echo "--------- RUNNING SCRIPT $test --- IGNORING FAILURES" - set +e - echo "Running ./test_unstructured_ingest/$test" - ./test_unstructured_ingest/dest/"$test" - set -e - echo "--------- FINISHED SCRIPT $test ---------" - else - echo "--------- RUNNING SCRIPT $test ---------" - echo "Running ./test_unstructured_ingest/$test" - ./test_unstructured_ingest/dest/"$test" - echo "--------- FINISHED SCRIPT $test ---------" + echo "--------- RUNNING SCRIPT $test ---------" + echo "Running ./test_unstructured_ingest/$test" + ./test_unstructured_ingest/dest/"$test" + rc=$? 
+ if [[ $rc -eq 8 ]]; then + echo "$test (skipped due to missing env var)" | tee -a "$SKIPPED_FILES_LOG" + elif [[ "${tests_to_ignore[*]}" =~ $test ]]; then + echo "$test (skipped checking error code: $rc)" | tee -a "$SKIPPED_FILES_LOG" + continue + elif [[ $rc -ne 0 ]]; then + exit $rc fi + echo "--------- FINISHED SCRIPT $test ---------" done diff --git a/test_unstructured_ingest/test-ingest-src.sh b/test_unstructured_ingest/test-ingest-src.sh index e8ad24eab..b53e6e824 100755 --- a/test_unstructured_ingest/test-ingest-src.sh +++ b/test_unstructured_ingest/test-ingest-src.sh @@ -1,51 +1,57 @@ #!/usr/bin/env bash -set -eu -o pipefail +set -u -o pipefail SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SKIPPED_FILES_LOG=$SCRIPT_DIR/skipped-files.txt +# If the file already exists, reset it +if [ -f "$SKIPPED_FILES_LOG" ]; then + rm "$SKIPPED_FILES_LOG" + touch "$SKIPPED_FILES_LOG" +fi cd "$SCRIPT_DIR"/.. || exit 1 # NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs export OMP_THREAD_LIMIT=1 all_tests=( -'s3.sh' -'s3-minio.sh' -'azure.sh' -'biomed-api.sh' -'biomed-path.sh' -# NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files -'pdf-fast-reprocess.sh' -'salesforce.sh' -'box.sh' -'discord.sh' -'dropbox.sh' -'github.sh' -'gitlab.sh' -'google-drive.sh' -'wikipedia.sh' -'local.sh' -'slack.sh' -'against-api.sh' -'gcs.sh' -'onedrive.sh' -'outlook.sh' -'elasticsearch.sh' -'confluence-diff.sh' -'confluence-large.sh' -'airtable-diff.sh' -# NOTE(ryan): This test is disabled because it is triggering too many requests to the API -# 'airtable-large.sh' -'local-single-file.sh' -'local-single-file-with-encoding.sh' -'local-single-file-with-pdf-infer-table-structure.sh' -'notion.sh' -'delta-table.sh' -'jira.sh' -'sharepoint.sh' -'sharepoint-with-permissions.sh' -'hubspot.sh' -'local-embed.sh' + 's3.sh' + 's3-minio.sh' + 'azure.sh' + 'biomed-api.sh' + 
'biomed-path.sh' + # NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files + 'pdf-fast-reprocess.sh' + 'salesforce.sh' + 'box.sh' + 'discord.sh' + 'dropbox.sh' + 'github.sh' + 'gitlab.sh' + 'google-drive.sh' + 'wikipedia.sh' + 'local.sh' + 'slack.sh' + 'against-api.sh' + 'gcs.sh' + 'onedrive.sh' + 'outlook.sh' + 'elasticsearch.sh' + 'confluence-diff.sh' + 'confluence-large.sh' + 'airtable-diff.sh' + # NOTE(ryan): This test is disabled because it is triggering too many requests to the API + # 'airtable-large.sh' + 'local-single-file.sh' + 'local-single-file-with-encoding.sh' + 'local-single-file-with-pdf-infer-table-structure.sh' + 'notion.sh' + 'delta-table.sh' + 'jira.sh' + 'sharepoint.sh' + 'sharepoint-with-permissions.sh' + 'hubspot.sh' + 'local-embed.sh' ) full_python_matrix_tests=( @@ -66,6 +72,8 @@ function print_last_run() { if [ "$CURRENT_TEST" != "none" ]; then echo "Last ran script: $CURRENT_TEST" fi + echo "######## SKIPPED TESTS: ########" + cat "$SKIPPED_FILES_LOG" } trap print_last_run EXIT @@ -85,21 +93,23 @@ for test in "${all_tests[@]}"; do echo "--------- SKIPPING SCRIPT $test ---------" continue fi - if [[ "${tests_to_ignore[*]}" =~ $test ]]; then - echo "--------- RUNNING SCRIPT $test --- IGNORING FAILURES" - set +e - echo "Running ./test_unstructured_ingest/$test" - ./test_unstructured_ingest/src/"$test" - set -e - echo "--------- FINISHED SCRIPT $test ---------" - else - echo "--------- RUNNING SCRIPT $test ---------" - echo "Running ./test_unstructured_ingest/$test" - ./test_unstructured_ingest/src/"$test" - echo "--------- FINISHED SCRIPT $test ---------" + echo "--------- RUNNING SCRIPT $test ---------" + echo "Running ./test_unstructured_ingest/$test" + ./test_unstructured_ingest/src/"$test" + rc=$? 
+ if [[ $rc -eq 8 ]]; then + echo "$test (skipped due to missing env var)" | tee -a "$SKIPPED_FILES_LOG" + elif [[ "${tests_to_ignore[*]}" =~ $test ]]; then + echo "$test (skipped checking error code: $rc)" | tee -a "$SKIPPED_FILES_LOG" + continue + elif [[ $rc -ne 0 ]]; then + exit $rc fi + echo "--------- FINISHED SCRIPT $test ---------" done +set +e + all_eval=( 'text-extraction' 'element-type'