feat: add logging to ingest CLI for tests being skipped at the end (#2174)

### Description
Tests are often skipped, either because required env vars are missing or
because they are explicitly marked in the base script, but these skips get
lost in the logs.
This PR updates the test scripts to exit with a custom exit code (8) when
they are skipped due to missing env vars. The base script catches this exit
code and records every skipped test in a file. At the end of the script,
the contents of that file are printed in the CI output.
This commit is contained in:
Roman Isecke 2023-11-29 08:41:19 -05:00 committed by GitHub
parent 0aae1faa54
commit b951d73a9b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
29 changed files with 111 additions and 94 deletions

1
.gitignore vendored
View File

@ -139,6 +139,7 @@ dmypy.json
/structured-output /structured-output
test_unstructured_ingest/workdir/ test_unstructured_ingest/workdir/
test_unstructured_ingest/delta-table-dest/ test_unstructured_ingest/delta-table-dest/
test_unstructured_ingest/skipped-files.txt
# suggested ingest mirror directory # suggested ingest mirror directory
/mirror /mirror

View File

@ -18,7 +18,7 @@ API_VERSION=2023-07-01-Preview
if [ -z "$AZURE_SEARCH_ENDPOINT" ] && [ -z "$AZURE_SEARCH_API_KEY" ]; then if [ -z "$AZURE_SEARCH_ENDPOINT" ] && [ -z "$AZURE_SEARCH_API_KEY" ]; then
echo "Skipping Azure Cognitive Search ingest test because neither AZURE_SEARCH_ENDPOINT nor AZURE_SEARCH_API_KEY env vars are set." echo "Skipping Azure Cognitive Search ingest test because neither AZURE_SEARCH_ENDPOINT nor AZURE_SEARCH_API_KEY env vars are set."
exit 0 exit 8
fi fi
# shellcheck disable=SC1091 # shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh source "$SCRIPT_DIR"/cleanup.sh

View File

@ -13,7 +13,7 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
if [ -z "$AZURE_DEST_CONNECTION_STR" ]; then if [ -z "$AZURE_DEST_CONNECTION_STR" ]; then
echo "Skipping Azure destination ingest test because the AZURE_DEST_CONNECTION_STR env var is not set." echo "Skipping Azure destination ingest test because the AZURE_DEST_CONNECTION_STR env var is not set."
exit 0 exit 8
fi fi
CONTAINER=utic-ingest-test-fixtures-output CONTAINER=utic-ingest-test-fixtures-output

View File

@ -16,7 +16,7 @@ CI=${CI:-"false"}
if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then
echo "Skipping Dropbox ingest test because one or more of these env vars is not set:" echo "Skipping Dropbox ingest test because one or more of these env vars is not set:"
echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN" echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN"
exit 0 exit 8
fi fi
# Get a new access token from Dropbox # Get a new access token from Dropbox

View File

@ -17,7 +17,7 @@ CI=${CI:-"false"}
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
exit 0 exit 8
fi fi
# Create temporary service key file # Create temporary service key file

View File

@ -15,7 +15,7 @@ CI=${CI:-"false"}
if [ -z "$MONGODB_URI" ] && [ -z "$MONGODB_DATABASE_NAME" ]; then if [ -z "$MONGODB_URI" ] && [ -z "$MONGODB_DATABASE_NAME" ]; then
echo "Skipping MongoDB destination ingest test because the MONGODB_URI and MONGODB_DATABASE_NAME env var are not set." echo "Skipping MongoDB destination ingest test because the MONGODB_URI and MONGODB_DATABASE_NAME env var are not set."
exit 0 exit 8
fi fi

View File

@ -18,22 +18,22 @@ CI=${CI:-"false"}
if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ] ; then if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ] ; then
echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
exit 0 exit 8
fi fi
if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then
echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set." echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set."
exit 0 exit 8
fi fi
if [ -z "$OPENAI_API_KEY" ]; then if [ -z "$OPENAI_API_KEY" ]; then
echo "Skipping Sharepoint embedding ingest test because the OPENAI_API_KEY env var is not set." echo "Skipping Sharepoint embedding ingest test because the OPENAI_API_KEY env var is not set."
exit 0 exit 8
fi fi
if [ -z "$AZURE_SEARCH_ENDPOINT" ] && [ -z "$AZURE_SEARCH_API_KEY" ]; then if [ -z "$AZURE_SEARCH_ENDPOINT" ] && [ -z "$AZURE_SEARCH_API_KEY" ]; then
echo "Skipping Sharepoint Azure Cognitive Search ingest test because neither AZURE_SEARCH_ENDPOINT nor AZURE_SEARCH_API_KEY env vars are set." echo "Skipping Sharepoint Azure Cognitive Search ingest test because neither AZURE_SEARCH_ENDPOINT nor AZURE_SEARCH_API_KEY env vars are set."
exit 0 exit 8
fi fi
# shellcheck disable=SC1091 # shellcheck disable=SC1091

View File

@ -4,7 +4,7 @@ set -e
if [ -z "$UNS_API_KEY" ]; then if [ -z "$UNS_API_KEY" ]; then
echo "Skipping ingest test against api because the UNS_API_KEY env var is not set." echo "Skipping ingest test against api because the UNS_API_KEY env var is not set."
exit 0 exit 8
fi fi
SRC_PATH=$(dirname "$(realpath "$0")") SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH") SCRIPT_DIR=$(dirname "$SRC_PATH")

View File

@ -32,7 +32,7 @@ VARIED_DATA_BASE_ID_2="appJ43QmP8I17zu88"
if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then
echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set." echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set."
exit 0 exit 8
fi fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}

View File

@ -30,7 +30,7 @@ trap cleanup EXIT
if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then
echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set." echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set."
exit 0 exit 8
fi fi
# Provides component IDs such as LARGE_TEST_LIST_OF_PATHS, # Provides component IDs such as LARGE_TEST_LIST_OF_PATHS,

View File

@ -29,7 +29,7 @@ trap cleanup EXIT
if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then
echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set." echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set."
exit 0 exit 8
fi fi
if [ -z "$BOX_APP_CONFIG_PATH" ]; then if [ -z "$BOX_APP_CONFIG_PATH" ]; then

View File

@ -28,7 +28,7 @@ trap cleanup EXIT
if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then
echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set." echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set."
exit 0 exit 8
fi fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}

View File

@ -30,7 +30,7 @@ trap cleanup EXIT
if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then
echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set." echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set."
exit 0 exit 8
fi fi
# The test checks the scenario where --confluence-list-of-spaces and --confluence-num-of-spaces # The test checks the scenario where --confluence-list-of-spaces and --confluence-num-of-spaces

View File

@ -15,7 +15,7 @@ CI=${CI:-"false"}
if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then
echo "Skipping Delta Table ingest test because either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY env var was not set." echo "Skipping Delta Table ingest test because either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY env var was not set."
exit 0 exit 8
fi fi
# shellcheck disable=SC1091 # shellcheck disable=SC1091

View File

@ -26,7 +26,7 @@ trap cleanup EXIT
if [ -z "$DISCORD_TOKEN" ]; then if [ -z "$DISCORD_TOKEN" ]; then
echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set." echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set."
exit 0 exit 8
fi fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}

View File

@ -27,7 +27,7 @@ trap cleanup EXIT
if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then
echo "Skipping Dropbox ingest test because one or more of these env vars is not set:" echo "Skipping Dropbox ingest test because one or more of these env vars is not set:"
echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN" echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN"
exit 0 exit 8
fi fi
# Get a new access token from Dropbox # Get a new access token from Dropbox

View File

@ -27,7 +27,7 @@ trap cleanup EXIT
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
exit 0 exit 8
fi fi
# Create temporary service key file # Create temporary service key file

View File

@ -28,7 +28,7 @@ trap cleanup EXIT
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
echo "The Google Drive test content can be found at https://drive.google.com/drive/folders/1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr" echo "The Google Drive test content can be found at https://drive.google.com/drive/folders/1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr"
exit 0 exit 8
fi fi
# Create temporary service key file # Create temporary service key file

View File

@ -26,7 +26,7 @@ trap cleanup EXIT
if [ -z "$HUBSPOT_API_TOKEN" ]; then if [ -z "$HUBSPOT_API_TOKEN" ]; then
echo "Skipping HubSpot ingest test because the HUBSPOT_API_TOKEN env var is not set." echo "Skipping HubSpot ingest test because the HUBSPOT_API_TOKEN env var is not set."
exit 0 exit 8
fi fi
# Required arguments: # Required arguments:

View File

@ -27,7 +27,7 @@ trap cleanup EXIT
if [ -z "$JIRA_INGEST_USER_EMAIL" ] || [ -z "$JIRA_INGEST_API_TOKEN" ]; then if [ -z "$JIRA_INGEST_USER_EMAIL" ] || [ -z "$JIRA_INGEST_API_TOKEN" ]; then
echo "Skipping Jira ingest test because the JIRA_INGEST_USER_EMAIL or JIRA_INGEST_API_TOKEN env var is not set." echo "Skipping Jira ingest test because the JIRA_INGEST_USER_EMAIL or JIRA_INGEST_API_TOKEN env var is not set."
exit 0 exit 8
fi fi
# Required arguments: # Required arguments:

View File

@ -26,7 +26,7 @@ trap cleanup EXIT
if [ -z "$NOTION_API_KEY" ]; then if [ -z "$NOTION_API_KEY" ]; then
echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set." echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set."
exit 0 exit 8
fi fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}

View File

@ -26,7 +26,7 @@ trap cleanup EXIT
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_USER_PNAME" ]; then if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_USER_PNAME" ]; then
echo "Skipping OneDrive ingest test because the MS_CLIENT_ID, MS_CLIENT_CRED, MS_USER_PNAME env var is not set." echo "Skipping OneDrive ingest test because the MS_CLIENT_ID, MS_CLIENT_CRED, MS_USER_PNAME env var is not set."
exit 0 exit 8
fi fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}

View File

@ -26,7 +26,7 @@ trap cleanup EXIT
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then
echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set." echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set."
exit 0 exit 8
fi fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}

View File

@ -29,12 +29,12 @@ trap cleanup EXIT
if [ -z "$SALESFORCE_USERNAME" ] || [ -z "$SALESFORCE_CONSUMER_KEY" ]; then if [ -z "$SALESFORCE_USERNAME" ] || [ -z "$SALESFORCE_CONSUMER_KEY" ]; then
echo "Skipping Salesforce ingest test because SALESFORCE_USERNAME and SALESFORCE_CONSUMER_KEY env vars not set" echo "Skipping Salesforce ingest test because SALESFORCE_USERNAME and SALESFORCE_CONSUMER_KEY env vars not set"
exit 0 exit 8
fi fi
if [ -z "$SALESFORCE_PRIVATE_KEY" ] && [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then if [ -z "$SALESFORCE_PRIVATE_KEY" ] && [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then
echo "Skipping Salesforce ingest test because neither SALESFORCE_PRIVATE_KEY nor SALESFORCE_PRIVATE_KEY_PATH env vars are set." echo "Skipping Salesforce ingest test because neither SALESFORCE_PRIVATE_KEY nor SALESFORCE_PRIVATE_KEY_PATH env vars are set."
exit 0 exit 8
fi fi
if [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then if [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then

View File

@ -26,12 +26,12 @@ trap cleanup EXIT
if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then
echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
exit 0 exit 8
fi fi
if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ] ; then
echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set." echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set."
exit 0 exit 8
fi fi
# excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly # excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly

View File

@ -27,7 +27,7 @@ trap cleanup EXIT
if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then
echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
exit 0 exit 8
fi fi
# excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly # excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly

View File

@ -26,7 +26,7 @@ trap cleanup EXIT
if [ -z "$SLACK_TOKEN" ]; then if [ -z "$SLACK_TOKEN" ]; then
echo "Skipping Slack ingest test because the SLACK_TOKEN env var is not set." echo "Skipping Slack ingest test because the SLACK_TOKEN env var is not set."
exit 0 exit 8
fi fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}

View File

@ -1,8 +1,14 @@
#!/usr/bin/env bash #!/usr/bin/env bash
set -eu -o pipefail set -u -o pipefail
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SKIPPED_FILES_LOG=$SCRIPT_DIR/skipped-files.txt
# If the file already exists, reset it
if [ -f "$SKIPPED_FILES_LOG" ]; then
rm "$SKIPPED_FILES_LOG"
touch "$SKIPPED_FILES_LOG"
fi
cd "$SCRIPT_DIR"/.. || exit 1 cd "$SCRIPT_DIR"/.. || exit 1
# NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs # NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
@ -52,17 +58,17 @@ for test in "${all_tests[@]}"; do
echo "--------- SKIPPING SCRIPT $test ---------" echo "--------- SKIPPING SCRIPT $test ---------"
continue continue
fi fi
if [[ "${tests_to_ignore[*]}" =~ $test ]]; then
echo "--------- RUNNING SCRIPT $test --- IGNORING FAILURES"
set +e
echo "Running ./test_unstructured_ingest/$test"
./test_unstructured_ingest/dest/"$test"
set -e
echo "--------- FINISHED SCRIPT $test ---------"
else
echo "--------- RUNNING SCRIPT $test ---------" echo "--------- RUNNING SCRIPT $test ---------"
echo "Running ./test_unstructured_ingest/$test" echo "Running ./test_unstructured_ingest/$test"
./test_unstructured_ingest/dest/"$test" ./test_unstructured_ingest/dest/"$test"
echo "--------- FINISHED SCRIPT $test ---------" rc=$?
if [[ $rc -eq 8 ]]; then
echo "$test (skipped due to missing env var)" | tee -a "$SKIPPED_FILES_LOG"
elif [[ "${tests_to_ignore[*]}" =~ $test ]]; then
echo "$test (skipped checking error code: $rc)" | tee -a "$SKIPPED_FILES_LOG"
continue
elif [[ $rc -ne 0 ]]; then
exit $rc
fi fi
echo "--------- FINISHED SCRIPT $test ---------"
done done

View File

@ -1,51 +1,57 @@
#!/usr/bin/env bash #!/usr/bin/env bash
set -eu -o pipefail set -u -o pipefail
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SKIPPED_FILES_LOG=$SCRIPT_DIR/skipped-files.txt
# If the file already exists, reset it
if [ -f "$SKIPPED_FILES_LOG" ]; then
rm "$SKIPPED_FILES_LOG"
touch "$SKIPPED_FILES_LOG"
fi
cd "$SCRIPT_DIR"/.. || exit 1 cd "$SCRIPT_DIR"/.. || exit 1
# NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs # NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
export OMP_THREAD_LIMIT=1 export OMP_THREAD_LIMIT=1
all_tests=( all_tests=(
's3.sh' 's3.sh'
's3-minio.sh' 's3-minio.sh'
'azure.sh' 'azure.sh'
'biomed-api.sh' 'biomed-api.sh'
'biomed-path.sh' 'biomed-path.sh'
# NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files # NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files
'pdf-fast-reprocess.sh' 'pdf-fast-reprocess.sh'
'salesforce.sh' 'salesforce.sh'
'box.sh' 'box.sh'
'discord.sh' 'discord.sh'
'dropbox.sh' 'dropbox.sh'
'github.sh' 'github.sh'
'gitlab.sh' 'gitlab.sh'
'google-drive.sh' 'google-drive.sh'
'wikipedia.sh' 'wikipedia.sh'
'local.sh' 'local.sh'
'slack.sh' 'slack.sh'
'against-api.sh' 'against-api.sh'
'gcs.sh' 'gcs.sh'
'onedrive.sh' 'onedrive.sh'
'outlook.sh' 'outlook.sh'
'elasticsearch.sh' 'elasticsearch.sh'
'confluence-diff.sh' 'confluence-diff.sh'
'confluence-large.sh' 'confluence-large.sh'
'airtable-diff.sh' 'airtable-diff.sh'
# NOTE(ryan): This test is disabled because it is triggering too many requests to the API # NOTE(ryan): This test is disabled because it is triggering too many requests to the API
# 'airtable-large.sh' # 'airtable-large.sh'
'local-single-file.sh' 'local-single-file.sh'
'local-single-file-with-encoding.sh' 'local-single-file-with-encoding.sh'
'local-single-file-with-pdf-infer-table-structure.sh' 'local-single-file-with-pdf-infer-table-structure.sh'
'notion.sh' 'notion.sh'
'delta-table.sh' 'delta-table.sh'
'jira.sh' 'jira.sh'
'sharepoint.sh' 'sharepoint.sh'
'sharepoint-with-permissions.sh' 'sharepoint-with-permissions.sh'
'hubspot.sh' 'hubspot.sh'
'local-embed.sh' 'local-embed.sh'
) )
full_python_matrix_tests=( full_python_matrix_tests=(
@ -66,6 +72,8 @@ function print_last_run() {
if [ "$CURRENT_TEST" != "none" ]; then if [ "$CURRENT_TEST" != "none" ]; then
echo "Last ran script: $CURRENT_TEST" echo "Last ran script: $CURRENT_TEST"
fi fi
echo "######## SKIPPED TESTS: ########"
cat "$SKIPPED_FILES_LOG"
} }
trap print_last_run EXIT trap print_last_run EXIT
@ -85,21 +93,23 @@ for test in "${all_tests[@]}"; do
echo "--------- SKIPPING SCRIPT $test ---------" echo "--------- SKIPPING SCRIPT $test ---------"
continue continue
fi fi
if [[ "${tests_to_ignore[*]}" =~ $test ]]; then
echo "--------- RUNNING SCRIPT $test --- IGNORING FAILURES"
set +e
echo "Running ./test_unstructured_ingest/$test"
./test_unstructured_ingest/src/"$test"
set -e
echo "--------- FINISHED SCRIPT $test ---------"
else
echo "--------- RUNNING SCRIPT $test ---------" echo "--------- RUNNING SCRIPT $test ---------"
echo "Running ./test_unstructured_ingest/$test" echo "Running ./test_unstructured_ingest/$test"
./test_unstructured_ingest/src/"$test" ./test_unstructured_ingest/src/"$test"
echo "--------- FINISHED SCRIPT $test ---------" rc=$?
if [[ $rc -eq 8 ]]; then
echo "$test (skipped due to missing env var)" | tee -a "$SKIPPED_FILES_LOG"
elif [[ "${tests_to_ignore[*]}" =~ $test ]]; then
echo "$test (skipped checking error code: $rc)" | tee -a "$SKIPPED_FILES_LOG"
continue
elif [[ $rc -ne 0 ]]; then
exit $rc
fi fi
echo "--------- FINISHED SCRIPT $test ---------"
done done
set +e
all_eval=( all_eval=(
'text-extraction' 'text-extraction'
'element-type' 'element-type'