diff --git a/.gitignore b/.gitignore index 008ba60ad..8cf3a69c1 100644 --- a/.gitignore +++ b/.gitignore @@ -139,6 +139,7 @@ dmypy.json /structured-output test_unstructured_ingest/workdir/ test_unstructured_ingest/delta-table-dest/ +test_unstructured_ingest/skipped-files.txt # suggested ingest mirror directory /mirror diff --git a/test_unstructured_ingest/dest/azure-cognitive-search.sh b/test_unstructured_ingest/dest/azure-cognitive-search.sh index bafbfa789..c3f12e41a 100755 --- a/test_unstructured_ingest/dest/azure-cognitive-search.sh +++ b/test_unstructured_ingest/dest/azure-cognitive-search.sh @@ -18,7 +18,7 @@ API_VERSION=2023-07-01-Preview if [ -z "$AZURE_SEARCH_ENDPOINT" ] && [ -z "$AZURE_SEARCH_API_KEY" ]; then echo "Skipping Azure Cognitive Search ingest test because neither AZURE_SEARCH_ENDPOINT nor AZURE_SEARCH_API_KEY env vars are set." - exit 0 + exit 8 fi # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/dest/azure.sh b/test_unstructured_ingest/dest/azure.sh index c2e3f7e59..74958eba2 100755 --- a/test_unstructured_ingest/dest/azure.sh +++ b/test_unstructured_ingest/dest/azure.sh @@ -13,7 +13,7 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} if [ -z "$AZURE_DEST_CONNECTION_STR" ]; then echo "Skipping Azure destination ingest test because the AZURE_DEST_CONNECTION_STR env var is not set." 
- exit 0 + exit 8 fi CONTAINER=utic-ingest-test-fixtures-output diff --git a/test_unstructured_ingest/dest/dropbox.sh b/test_unstructured_ingest/dest/dropbox.sh index 6f15285c7..58148e8cb 100755 --- a/test_unstructured_ingest/dest/dropbox.sh +++ b/test_unstructured_ingest/dest/dropbox.sh @@ -16,7 +16,7 @@ CI=${CI:-"false"} if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then echo "Skipping Dropbox ingest test because one or more of these env vars is not set:" echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN" - exit 0 + exit 8 fi # Get a new access token from Dropbox diff --git a/test_unstructured_ingest/dest/gcs.sh b/test_unstructured_ingest/dest/gcs.sh index f399257c8..169d556a8 100755 --- a/test_unstructured_ingest/dest/gcs.sh +++ b/test_unstructured_ingest/dest/gcs.sh @@ -17,7 +17,7 @@ CI=${CI:-"false"} if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." - exit 0 + exit 8 fi # Create temporary service key file diff --git a/test_unstructured_ingest/dest/mongodb.sh b/test_unstructured_ingest/dest/mongodb.sh index 51bc6d90e..37b343a4a 100755 --- a/test_unstructured_ingest/dest/mongodb.sh +++ b/test_unstructured_ingest/dest/mongodb.sh @@ -15,7 +15,7 @@ CI=${CI:-"false"} if [ -z "$MONGODB_URI" ] && [ -z "$MONGODB_DATABASE_NAME" ]; then echo "Skipping MongoDB destination ingest test because the MONGODB_URI and MONGODB_DATABASE_NAME env var are not set." 
- exit 0 + exit 8 fi diff --git a/test_unstructured_ingest/dest/sharepoint-embed-cog-index.sh b/test_unstructured_ingest/dest/sharepoint-embed-cog-index.sh index 56e426231..d8853adfb 100755 --- a/test_unstructured_ingest/dest/sharepoint-embed-cog-index.sh +++ b/test_unstructured_ingest/dest/sharepoint-embed-cog-index.sh @@ -18,22 +18,22 @@ CI=${CI:-"false"} if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ] ; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." - exit 0 + exit 8 fi if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set." - exit 0 + exit 8 fi if [ -z "$OPENAI_API_KEY" ]; then echo "Skipping Sharepoint embedding ingest test because the OPENAI_API_KEY env var is not set." - exit 0 + exit 8 fi if [ -z "$AZURE_SEARCH_ENDPOINT" ] && [ -z "$AZURE_SEARCH_API_KEY" ]; then echo "Skipping Sharepoint Azure Cognitive Search ingest test because neither AZURE_SEARCH_ENDPOINT nor AZURE_SEARCH_API_KEY env vars are set." - exit 0 + exit 8 fi # shellcheck disable=SC1091 diff --git a/test_unstructured_ingest/src/against-api.sh b/test_unstructured_ingest/src/against-api.sh index 35a47be66..9ce5c1eb9 100755 --- a/test_unstructured_ingest/src/against-api.sh +++ b/test_unstructured_ingest/src/against-api.sh @@ -4,7 +4,7 @@ set -e if [ -z "$UNS_API_KEY" ]; then echo "Skipping ingest test against api because the UNS_API_KEY env var is not set." 
- exit 0 + exit 8 fi SRC_PATH=$(dirname "$(realpath "$0")") SCRIPT_DIR=$(dirname "$SRC_PATH") diff --git a/test_unstructured_ingest/src/airtable-diff.sh b/test_unstructured_ingest/src/airtable-diff.sh index 1e0f9c267..f6c5ac38b 100755 --- a/test_unstructured_ingest/src/airtable-diff.sh +++ b/test_unstructured_ingest/src/airtable-diff.sh @@ -32,7 +32,7 @@ VARIED_DATA_BASE_ID_2="appJ43QmP8I17zu88" if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set." - exit 0 + exit 8 fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} diff --git a/test_unstructured_ingest/src/airtable-large.sh b/test_unstructured_ingest/src/airtable-large.sh index f4dd51034..d06ec37da 100755 --- a/test_unstructured_ingest/src/airtable-large.sh +++ b/test_unstructured_ingest/src/airtable-large.sh @@ -30,7 +30,7 @@ trap cleanup EXIT if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set." - exit 0 + exit 8 fi # Provides component IDs such as LARGE_TEST_LIST_OF_PATHS, diff --git a/test_unstructured_ingest/src/box.sh b/test_unstructured_ingest/src/box.sh index 7efb02d5f..1fad83add 100755 --- a/test_unstructured_ingest/src/box.sh +++ b/test_unstructured_ingest/src/box.sh @@ -29,7 +29,7 @@ trap cleanup EXIT if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set." 
- exit 0 + exit 8 fi if [ -z "$BOX_APP_CONFIG_PATH" ]; then diff --git a/test_unstructured_ingest/src/confluence-diff.sh b/test_unstructured_ingest/src/confluence-diff.sh index 066bde029..9034be52d 100755 --- a/test_unstructured_ingest/src/confluence-diff.sh +++ b/test_unstructured_ingest/src/confluence-diff.sh @@ -28,7 +28,7 @@ trap cleanup EXIT if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set." - exit 0 + exit 8 fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} diff --git a/test_unstructured_ingest/src/confluence-large.sh b/test_unstructured_ingest/src/confluence-large.sh index ec2c9c7c6..b66c3d3ee 100755 --- a/test_unstructured_ingest/src/confluence-large.sh +++ b/test_unstructured_ingest/src/confluence-large.sh @@ -30,7 +30,7 @@ trap cleanup EXIT if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set." - exit 0 + exit 8 fi # The test checks the scenario where --confluence-list-of-spaces and --confluence-num-of-spaces diff --git a/test_unstructured_ingest/src/delta-table.sh b/test_unstructured_ingest/src/delta-table.sh index e6286228e..5ddd0a71f 100755 --- a/test_unstructured_ingest/src/delta-table.sh +++ b/test_unstructured_ingest/src/delta-table.sh @@ -15,7 +15,7 @@ CI=${CI:-"false"} if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then echo "Skipping Delta Table ingest test because either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY env var was not set." 
- exit 0 + exit 8 fi # shellcheck disable=SC1091 diff --git a/test_unstructured_ingest/src/discord.sh b/test_unstructured_ingest/src/discord.sh index 34587a740..015deb993 100755 --- a/test_unstructured_ingest/src/discord.sh +++ b/test_unstructured_ingest/src/discord.sh @@ -26,7 +26,7 @@ trap cleanup EXIT if [ -z "$DISCORD_TOKEN" ]; then echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set." - exit 0 + exit 8 fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} diff --git a/test_unstructured_ingest/src/dropbox.sh b/test_unstructured_ingest/src/dropbox.sh index 45583a226..17127dc28 100755 --- a/test_unstructured_ingest/src/dropbox.sh +++ b/test_unstructured_ingest/src/dropbox.sh @@ -27,7 +27,7 @@ trap cleanup EXIT if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then echo "Skipping Dropbox ingest test because one or more of these env vars is not set:" echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN" - exit 0 + exit 8 fi # Get a new access token from Dropbox diff --git a/test_unstructured_ingest/src/gcs.sh b/test_unstructured_ingest/src/gcs.sh index 51f3fbf72..de02fd651 100755 --- a/test_unstructured_ingest/src/gcs.sh +++ b/test_unstructured_ingest/src/gcs.sh @@ -27,7 +27,7 @@ trap cleanup EXIT if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." - exit 0 + exit 8 fi # Create temporary service key file diff --git a/test_unstructured_ingest/src/google-drive.sh b/test_unstructured_ingest/src/google-drive.sh index 74068bd35..5f78182db 100755 --- a/test_unstructured_ingest/src/google-drive.sh +++ b/test_unstructured_ingest/src/google-drive.sh @@ -28,7 +28,7 @@ trap cleanup EXIT if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." 
echo "The Google Drive test content can be found at https://drive.google.com/drive/folders/1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr" - exit 0 + exit 8 fi # Create temporary service key file diff --git a/test_unstructured_ingest/src/hubspot.sh b/test_unstructured_ingest/src/hubspot.sh index 65bdee070..5b456a0e9 100755 --- a/test_unstructured_ingest/src/hubspot.sh +++ b/test_unstructured_ingest/src/hubspot.sh @@ -26,7 +26,7 @@ trap cleanup EXIT if [ -z "$HUBSPOT_API_TOKEN" ]; then echo "Skipping HubSpot ingest test because the HUBSPOT_API_TOKEN env var is not set." - exit 0 + exit 8 fi # Required arguments: diff --git a/test_unstructured_ingest/src/jira.sh b/test_unstructured_ingest/src/jira.sh index bc1dd2eb5..112fd5fb0 100755 --- a/test_unstructured_ingest/src/jira.sh +++ b/test_unstructured_ingest/src/jira.sh @@ -27,7 +27,7 @@ trap cleanup EXIT if [ -z "$JIRA_INGEST_USER_EMAIL" ] || [ -z "$JIRA_INGEST_API_TOKEN" ]; then echo "Skipping Jira ingest test because the JIRA_INGEST_USER_EMAIL or JIRA_INGEST_API_TOKEN env var is not set." - exit 0 + exit 8 fi # Required arguments: diff --git a/test_unstructured_ingest/src/notion.sh b/test_unstructured_ingest/src/notion.sh index 3c047b9a6..99bb2b207 100755 --- a/test_unstructured_ingest/src/notion.sh +++ b/test_unstructured_ingest/src/notion.sh @@ -26,7 +26,7 @@ trap cleanup EXIT if [ -z "$NOTION_API_KEY" ]; then echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set." - exit 0 + exit 8 fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} diff --git a/test_unstructured_ingest/src/onedrive.sh b/test_unstructured_ingest/src/onedrive.sh index 76ea58854..fec688dcb 100755 --- a/test_unstructured_ingest/src/onedrive.sh +++ b/test_unstructured_ingest/src/onedrive.sh @@ -26,7 +26,7 @@ trap cleanup EXIT if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_USER_PNAME" ]; then echo "Skipping OneDrive ingest test because the MS_CLIENT_ID, MS_CLIENT_CRED, MS_USER_PNAME env var is not set." 
- exit 0 + exit 8 fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} diff --git a/test_unstructured_ingest/src/outlook.sh b/test_unstructured_ingest/src/outlook.sh index 75443f5c7..b646c2d3f 100755 --- a/test_unstructured_ingest/src/outlook.sh +++ b/test_unstructured_ingest/src/outlook.sh @@ -26,7 +26,7 @@ trap cleanup EXIT if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set." - exit 0 + exit 8 fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} diff --git a/test_unstructured_ingest/src/salesforce.sh b/test_unstructured_ingest/src/salesforce.sh index 64d82355b..faf2dd761 100755 --- a/test_unstructured_ingest/src/salesforce.sh +++ b/test_unstructured_ingest/src/salesforce.sh @@ -29,12 +29,12 @@ trap cleanup EXIT if [ -z "$SALESFORCE_USERNAME" ] || [ -z "$SALESFORCE_CONSUMER_KEY" ]; then echo "Skipping Salesforce ingest test because SALESFORCE_USERNAME and SALESFORCE_CONSUMER_KEY env vars not set" - exit 0 + exit 8 fi if [ -z "$SALESFORCE_PRIVATE_KEY" ] && [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then echo "Skipping Salesforce ingest test because neither SALESFORCE_PRIVATE_KEY nor SALESFORCE_PRIVATE_KEY_PATH env vars are set." - exit 0 + exit 8 fi if [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then diff --git a/test_unstructured_ingest/src/sharepoint-with-permissions.sh b/test_unstructured_ingest/src/sharepoint-with-permissions.sh index 29f95e1cb..80f30cb10 100755 --- a/test_unstructured_ingest/src/sharepoint-with-permissions.sh +++ b/test_unstructured_ingest/src/sharepoint-with-permissions.sh @@ -26,12 +26,12 @@ trap cleanup EXIT if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." 
- exit 0 + exit 8 fi if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set." - exit 0 + exit 8 fi # excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly diff --git a/test_unstructured_ingest/src/sharepoint.sh b/test_unstructured_ingest/src/sharepoint.sh index 5703a0366..678f2b172 100755 --- a/test_unstructured_ingest/src/sharepoint.sh +++ b/test_unstructured_ingest/src/sharepoint.sh @@ -27,7 +27,7 @@ trap cleanup EXIT if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." - exit 0 + exit 8 fi # excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly diff --git a/test_unstructured_ingest/src/slack.sh b/test_unstructured_ingest/src/slack.sh index c9e113f2b..620184b1b 100755 --- a/test_unstructured_ingest/src/slack.sh +++ b/test_unstructured_ingest/src/slack.sh @@ -26,7 +26,7 @@ trap cleanup EXIT if [ -z "$SLACK_TOKEN" ]; then echo "Skipping Slack ingest test because the SLACK_TOKEN env var is not set." 
-  exit 0
+  exit 8
 fi
 
 RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
diff --git a/test_unstructured_ingest/test-ingest-dest.sh b/test_unstructured_ingest/test-ingest-dest.sh
index 478f5428c..54ae356dc 100755
--- a/test_unstructured_ingest/test-ingest-dest.sh
+++ b/test_unstructured_ingest/test-ingest-dest.sh
@@ -1,8 +1,14 @@
 #!/usr/bin/env bash
 
-set -eu -o pipefail
+# No `-e`: the exit status of every test script is inspected explicitly in the loop below.
+set -u -o pipefail
 
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SKIPPED_FILES_LOG=$SCRIPT_DIR/skipped-files.txt
+# Start every run with an empty skipped-tests log. Truncating (rather than
+# rm + touch only when the file exists) also creates the file on a clean
+# checkout, so anything that reads the log later never hits a missing file.
+: >"$SKIPPED_FILES_LOG"
 cd "$SCRIPT_DIR"/.. || exit 1
 
 # NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
@@ -52,17 +58,18 @@ for test in "${all_tests[@]}"; do
     echo "--------- SKIPPING SCRIPT $test ---------"
     continue
   fi
-  if [[ "${tests_to_ignore[*]}" =~ $test ]]; then
-    echo "--------- RUNNING SCRIPT $test --- IGNORING FAILURES"
-    set +e
-    echo "Running ./test_unstructured_ingest/$test"
-    ./test_unstructured_ingest/dest/"$test"
-    set -e
-    echo "--------- FINISHED SCRIPT $test ---------"
-  else
-    echo "--------- RUNNING SCRIPT $test ---------"
-    echo "Running ./test_unstructured_ingest/$test"
-    ./test_unstructured_ingest/dest/"$test"
-    echo "--------- FINISHED SCRIPT $test ---------"
+  echo "--------- RUNNING SCRIPT $test ---------"
+  echo "Running ./test_unstructured_ingest/$test"
+  ./test_unstructured_ingest/dest/"$test"
+  rc=$?
+  # Exit code 8 is what the test scripts return when their required env vars are missing.
+  if [[ $rc -eq 8 ]]; then
+    echo "$test (skipped due to missing env var)" | tee -a "$SKIPPED_FILES_LOG"
+  elif [[ $rc -ne 0 && "${tests_to_ignore[*]}" =~ $test ]]; then
+    # Test is on the ignore list: record the ignored failure and keep going.
+    echo "$test (skipped checking error code: $rc)" | tee -a "$SKIPPED_FILES_LOG"
+  elif [[ $rc -ne 0 ]]; then
+    exit "$rc"
   fi
+  echo "--------- FINISHED SCRIPT $test ---------"
 done
diff --git a/test_unstructured_ingest/test-ingest-src.sh b/test_unstructured_ingest/test-ingest-src.sh
index e8ad24eab..b53e6e824 100755
--- a/test_unstructured_ingest/test-ingest-src.sh
+++ b/test_unstructured_ingest/test-ingest-src.sh
@@ -1,51 +1,57 @@
 #!/usr/bin/env bash
 
-set -eu -o pipefail
+# No `-e`: the exit status of every test script is inspected explicitly in the loop below.
+set -u -o pipefail
 
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SKIPPED_FILES_LOG=$SCRIPT_DIR/skipped-files.txt
+# Start every run with an empty skipped-tests log. Truncating (rather than
+# rm + touch only when the file exists) also creates the file on a clean
+# checkout, so the `cat` in the EXIT trap works even when no test was skipped.
+: >"$SKIPPED_FILES_LOG"
 cd "$SCRIPT_DIR"/.. || exit 1
 
 # NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
 export OMP_THREAD_LIMIT=1
 
 all_tests=(
-'s3.sh'
-'s3-minio.sh'
-'azure.sh'
-'biomed-api.sh'
-'biomed-path.sh'
-# NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files
-'pdf-fast-reprocess.sh'
-'salesforce.sh'
-'box.sh'
-'discord.sh'
-'dropbox.sh'
-'github.sh'
-'gitlab.sh'
-'google-drive.sh'
-'wikipedia.sh'
-'local.sh'
-'slack.sh'
-'against-api.sh'
-'gcs.sh'
-'onedrive.sh'
-'outlook.sh'
-'elasticsearch.sh'
-'confluence-diff.sh'
-'confluence-large.sh'
-'airtable-diff.sh'
-# NOTE(ryan): This test is disabled because it is triggering too many requests to the API
-# 'airtable-large.sh'
-'local-single-file.sh'
-'local-single-file-with-encoding.sh'
-'local-single-file-with-pdf-infer-table-structure.sh'
-'notion.sh'
-'delta-table.sh'
-'jira.sh'
-'sharepoint.sh'
-'sharepoint-with-permissions.sh'
-'hubspot.sh'
-'local-embed.sh'
+  's3.sh'
+  's3-minio.sh'
+  'azure.sh'
+  'biomed-api.sh'
+  'biomed-path.sh'
+  # NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files
+  'pdf-fast-reprocess.sh'
+  'salesforce.sh'
+  'box.sh'
+  'discord.sh'
+  'dropbox.sh'
+  'github.sh'
+  'gitlab.sh'
+  'google-drive.sh'
+  'wikipedia.sh'
+  'local.sh'
+  'slack.sh'
+  'against-api.sh'
+  'gcs.sh'
+  'onedrive.sh'
+  'outlook.sh'
+  'elasticsearch.sh'
+  'confluence-diff.sh'
+  'confluence-large.sh'
+  'airtable-diff.sh'
+  # NOTE(ryan): This test is disabled because it is triggering too many requests to the API
+  # 'airtable-large.sh'
+  'local-single-file.sh'
+  'local-single-file-with-encoding.sh'
+  'local-single-file-with-pdf-infer-table-structure.sh'
+  'notion.sh'
+  'delta-table.sh'
+  'jira.sh'
+  'sharepoint.sh'
+  'sharepoint-with-permissions.sh'
+  'hubspot.sh'
+  'local-embed.sh'
 )
 
 full_python_matrix_tests=(
@@ -66,6 +72,8 @@ function print_last_run() {
   if [ "$CURRENT_TEST" != "none" ]; then
     echo "Last ran script: $CURRENT_TEST"
   fi
+  echo "######## SKIPPED TESTS: ########"
+  cat "$SKIPPED_FILES_LOG"
 }
 
 trap print_last_run EXIT
@@ -85,21 +93,24 @@ for test in "${all_tests[@]}"; do
     echo "--------- SKIPPING SCRIPT $test ---------"
     continue
   fi
-  if [[ "${tests_to_ignore[*]}" =~ $test ]]; then
-    echo "--------- RUNNING SCRIPT $test --- IGNORING FAILURES"
-    set +e
-    echo "Running ./test_unstructured_ingest/$test"
-    ./test_unstructured_ingest/src/"$test"
-    set -e
-    echo "--------- FINISHED SCRIPT $test ---------"
-  else
-    echo "--------- RUNNING SCRIPT $test ---------"
-    echo "Running ./test_unstructured_ingest/$test"
-    ./test_unstructured_ingest/src/"$test"
-    echo "--------- FINISHED SCRIPT $test ---------"
+  echo "--------- RUNNING SCRIPT $test ---------"
+  echo "Running ./test_unstructured_ingest/$test"
+  ./test_unstructured_ingest/src/"$test"
+  rc=$?
+  # Exit code 8 is what the test scripts return when their required env vars are missing.
+  if [[ $rc -eq 8 ]]; then
+    echo "$test (skipped due to missing env var)" | tee -a "$SKIPPED_FILES_LOG"
+  elif [[ $rc -ne 0 && "${tests_to_ignore[*]}" =~ $test ]]; then
+    # Test is on the ignore list: record the ignored failure and keep going.
+    echo "$test (skipped checking error code: $rc)" | tee -a "$SKIPPED_FILES_LOG"
+  elif [[ $rc -ne 0 ]]; then
+    exit "$rc"
   fi
+  echo "--------- FINISHED SCRIPT $test ---------"
 done
 
+set +e
+
 all_eval=(
   'text-extraction'
   'element-type'