From e88f7d9eab81edba3b268874823e6eb9825ed32e Mon Sep 17 00:00:00 2001 From: Roman Isecke <136338424+rbiseck3@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:51:08 -0400 Subject: [PATCH] chore: ingest test file cleanup (#1366) --- CHANGELOG.md | 2 +- .../check-diff-expected-output.sh | 17 +++- .../check-num-dirs-output.sh | 0 .../check-num-files-expected-output.sh | 0 .../check-num-files-output.sh | 0 .../check-num-rows-and-columns-output.sh | 0 test_unstructured_ingest/cleanup.sh | 16 ++++ .../python/test-ingest-delta-table-output.py | 9 +- .../fake-html-cp1252.html.json | 92 ------------------- .../test-ingest-against-api.sh | 6 +- .../test-ingest-airtable-diff.sh | 6 +- .../test-ingest-airtable-large.sh | 12 ++- test_unstructured_ingest/test-ingest-azure.sh | 6 +- .../test-ingest-biomed-api.sh | 8 +- .../test-ingest-biomed-path.sh | 8 +- test_unstructured_ingest/test-ingest-box.sh | 6 +- .../test-ingest-confluence-diff.sh | 6 +- .../test-ingest-confluence-large.sh | 8 +- .../test-ingest-delta-table.sh | 19 ++-- .../test-ingest-discord.sh | 6 +- .../test-ingest-dropbox.sh | 6 +- .../test-ingest-elasticsearch.sh | 25 +++-- test_unstructured_ingest/test-ingest-gcs.sh | 6 +- .../test-ingest-github.sh | 6 +- .../test-ingest-gitlab.sh | 6 +- .../test-ingest-google-drive.sh | 6 +- test_unstructured_ingest/test-ingest-jira.sh | 6 +- ...-ingest-local-single-file-with-encoding.sh | 6 +- ...gle-file-with-pdf-infer-table-structure.sh | 6 +- .../test-ingest-local-single-file.sh | 6 +- test_unstructured_ingest/test-ingest-local.sh | 6 +- .../test-ingest-notion.sh | 6 +- .../test-ingest-onedrive.sh | 6 +- .../test-ingest-outlook.sh | 6 +- .../test-ingest-pdf-fast-reprocess.sh | 6 +- test_unstructured_ingest/test-ingest-s3.sh | 9 +- .../test-ingest-salesforce.sh | 6 +- .../test-ingest-sharepoint.sh | 6 +- test_unstructured_ingest/test-ingest-slack.sh | 6 +- .../test-ingest-wikipedia.sh | 6 +- test_unstructured_ingest/test-ingest.sh | 86 ++++++++++------- unstructured/__version__.py | 2 +- 42 files changed, 275 insertions(+), 182 deletions(-) mode change 100644 => 100755 test_unstructured_ingest/check-diff-expected-output.sh mode change 100644 => 100755 test_unstructured_ingest/check-num-dirs-output.sh mode change 100644 => 100755 test_unstructured_ingest/check-num-files-expected-output.sh mode change 100644 => 100755 test_unstructured_ingest/check-num-files-output.sh mode change 100644 => 100755 test_unstructured_ingest/check-num-rows-and-columns-output.sh create mode 100644 test_unstructured_ingest/cleanup.sh delete mode 100644 test_unstructured_ingest/structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 502978f14..828200ca4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.17-dev2 +## 0.10.17-dev3 ### Enhancements diff --git a/test_unstructured_ingest/check-diff-expected-output.sh b/test_unstructured_ingest/check-diff-expected-output.sh old mode 100644 new mode 100755 index 8dbbf12cb..3a886d671 --- a/test_unstructured_ingest/check-diff-expected-output.sh +++ b/test_unstructured_ingest/check-diff-expected-output.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # Description: Compare the structured output files to the expected output files and exit with an error -# if they are different. If the environment variable OVERWRITE_FIXTURES is not "false", +# if they are different. If the environment variable OVERWRITE_FIXTURES is not "false", # then this script will instead copy the output files to the expected output directory. # # Arguments: @@ -14,12 +14,27 @@ set +e SCRIPT_DIR=$(dirname "$(realpath "$0")") OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false} +TMP_DIRECTORY_CLEANUP=${TMP_DIRECTORY_CLEANUP:-true} OUTPUT_FOLDER_NAME=$1 OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR_TEXT=$SCRIPT_DIR/text-output/$OUTPUT_FOLDER_NAME EXPECTED_OUTPUT_DIR=$SCRIPT_DIR/expected-structured-output/$OUTPUT_FOLDER_NAME EXPECTED_OUTPUT_DIR_TEXT=$SCRIPT_DIR/expected-text-output/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh + +function cleanup() { + if [ "$TMP_DIRECTORY_CLEANUP" == "true" ]; then + cleanup_dir "$EXPECTED_OUTPUT_DIR_TEXT" + cleanup_dir "$OUTPUT_DIR_TEXT" + else + echo "skipping tmp directory cleanup" + fi +} + +trap cleanup EXIT + # to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64 if [ "$OVERWRITE_FIXTURES" != "false" ]; then # remove folder if it exists diff --git a/test_unstructured_ingest/check-num-dirs-output.sh b/test_unstructured_ingest/check-num-dirs-output.sh old mode 100644 new mode 100755 diff --git a/test_unstructured_ingest/check-num-files-expected-output.sh b/test_unstructured_ingest/check-num-files-expected-output.sh old mode 100644 new mode 100755 diff --git a/test_unstructured_ingest/check-num-files-output.sh b/test_unstructured_ingest/check-num-files-output.sh old mode 100644 new mode 100755 diff --git a/test_unstructured_ingest/check-num-rows-and-columns-output.sh b/test_unstructured_ingest/check-num-rows-and-columns-output.sh old mode 100644 new mode 100755 diff --git a/test_unstructured_ingest/cleanup.sh b/test_unstructured_ingest/cleanup.sh new file mode 100644 index 000000000..6271b674e --- /dev/null +++ b/test_unstructured_ingest/cleanup.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + + +function cleanup_dir() { + local dir_to_cleanup="${1}" + echo "--- Running cleanup of $dir_to_cleanup ---" + + if [ -d "$dir_to_cleanup" ]; then + echo "cleaning up directory: $dir_to_cleanup" + rm -rf "$dir_to_cleanup" + else + echo "$dir_to_cleanup does not exist or is not a directory, skipping deletion" + fi + + echo "--- Cleanup done ---" +} diff --git a/test_unstructured_ingest/python/test-ingest-delta-table-output.py b/test_unstructured_ingest/python/test-ingest-delta-table-output.py index 755a2533e..26c873ecd 100755 --- a/test_unstructured_ingest/python/test-ingest-delta-table-output.py +++ b/test_unstructured_ingest/python/test-ingest-delta-table-output.py @@ -1,12 +1,17 @@ +import click from deltalake import DeltaTable -def run_check(): +@click.command() +@click.option("--table-uri", type=str) +def run_check(table_uri): + print(f"Checking contents of table at {table_uri}") delta_table = DeltaTable( - table_uri="/tmp/delta-table-dest", + table_uri=table_uri, ) assert len(delta_table.to_pandas()) == 10 + print("table check complete") if __name__ == "__main__": diff --git a/test_unstructured_ingest/structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json b/test_unstructured_ingest/structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json deleted file mode 100644 index d6c38ede2..000000000 --- a/test_unstructured_ingest/structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json +++ /dev/null @@ -1,92 +0,0 @@ -[ - { - "type": "Title", - "element_id": "0540311f6c077fe8f797080918b8d74b", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "My First Heading" - }, - { - "type": "Title", - "element_id": "399af454cb1368b8257ed406b430de84", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "My first paragraph." - }, - { - "type": "Title", - "element_id": "b4cf0d13edfa976816649971bd640a66", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Some CP1252-specific characters:" - }, - { - "type": "UncategorizedText", - "element_id": "ada7c3084f437d31d297f85da3941a55", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 2 - }, - "text": "¡\t¢\t£\t¤\t¥\t¦\t§\t¨\t©\tª\t«\t¬\tSHY\t®\t¯" - }, - { - "type": "UncategorizedText", - "element_id": "dda5e8c4d245c1954ecb64e5dfea598d", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 3 - }, - "text": "°\t±\t²\t³\t´\tµ\t¶\t·\t¸\t¹\tº\t»\t¼\t½\t¾\t¿" - }, - { - "type": "Title", - "element_id": "85df09b375e5813aefa3b5f30c8ddff8", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 4 - }, - "text": "À\tÁ\tÂ\tÃ\tÄ\tÅ\tÆ\tÇ\tÈ\tÉ\tÊ\tË\tÌ\tÍ\tÎ\tÏ" - }, - { - "type": "Title", - "element_id": "2726d2569cd7a6cecb79a6e46bb0b2b3", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 5 - }, - "text": "Ð\tÑ\tÒ\tÓ\tÔ\tÕ\tÖ\t×\tØ\tÙ\tÚ\tÛ\tÜ\tÝ\tÞ\tß" - }, - { - "type": "Title", - "element_id": "2b01f3e428520f6e47d8513292688cf6", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 6 - }, - "text": "à\tá\tâ\tã\tä\tå\tæ\tç\tè\té\tê\të\tì\tí\tî\tï" - }, - { - "type": "Title", - "element_id": "5ed256e41bfb169af5f50524b9593a16", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 7 - }, - "text": "ð\tñ\tò\tó\tô\tõ\tö\t÷\tø\tù\tú\tû\tü\tý\tþ\tÿ" - } -] \ No newline at end of file diff --git a/test_unstructured_ingest/test-ingest-against-api.sh b/test_unstructured_ingest/test-ingest-against-api.sh index a32686778..f8ee644de 100755 --- a/test_unstructured_ingest/test-ingest-against-api.sh +++ b/test_unstructured_ingest/test-ingest-against-api.sh @@ -11,6 +11,10 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=api-ingest-output OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + PYTHONPATH=. ./unstructured/ingest/main.py \ local \ --api-key "$UNS_API_KEY" \ @@ -24,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --file-glob "*1p.txt" \ --input-path example-docs -sh "$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-airtable-diff.sh b/test_unstructured_ingest/test-ingest-airtable-diff.sh index a0b75c722..1b6cdaaec 100755 --- a/test_unstructured_ingest/test-ingest-airtable-diff.sh +++ b/test_unstructured_ingest/test-ingest-airtable-diff.sh @@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=airtable-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + VARIED_DATA_BASE_ID="app5YQxSfp220fWtm" VARIED_DATA_BASE_ID_2="appJ43QmP8I17zu88" @@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --reprocess \ --output-dir "$OUTPUT_DIR" -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-airtable-large.sh b/test_unstructured_ingest/test-ingest-airtable-large.sh index 35f1f435a..81470dda7 100755 --- a/test_unstructured_ingest/test-ingest-airtable-large.sh +++ b/test_unstructured_ingest/test-ingest-airtable-large.sh @@ -12,6 +12,10 @@ OUTPUT_FOLDER_NAME=airtable-large OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set." exit 0 @@ -35,16 +39,16 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ # We are expecting fifteen directories: fourteen bases and the parent directory -sh "$SCRIPT_DIR"/check-num-dirs-output.sh 15 "$OUTPUT_FOLDER_NAME" +"$SCRIPT_DIR"/check-num-dirs-output.sh 15 "$OUTPUT_FOLDER_NAME" # We are expecting 101 files: 100 tables and the parent directory -sh "$SCRIPT_DIR"/check-num-files-output.sh 101 "$OUTPUT_FOLDER_NAME"/"$LARGE_BASE_BASE_ID"/ +"$SCRIPT_DIR"/check-num-files-output.sh 101 "$OUTPUT_FOLDER_NAME"/"$LARGE_BASE_BASE_ID"/ # Test on ingesting a large number of bases for i in {1..12}; do var="LARGE_WORKSPACE_BASE_ID_$i" - sh "$SCRIPT_DIR"/check-num-files-output.sh 12 "$OUTPUT_FOLDER_NAME"/"${!var}" + "$SCRIPT_DIR"/check-num-files-output.sh 12 "$OUTPUT_FOLDER_NAME"/"${!var}" done # Test on ingesting a table with lots of rows -sh "$SCRIPT_DIR"/check-num-rows-and-columns-output.sh 39999 "$OUTPUT_DIR"/"$LARGE_TABLE_BASE_ID"/"$LARGE_TABLE_TABLE_ID".json +"$SCRIPT_DIR"/check-num-rows-and-columns-output.sh 39999 "$OUTPUT_DIR"/"$LARGE_TABLE_BASE_ID"/"$LARGE_TABLE_TABLE_ID".json diff --git a/test_unstructured_ingest/test-ingest-azure.sh b/test_unstructured_ingest/test-ingest-azure.sh index b6fe8132d..d30e9d61b 100755 --- a/test_unstructured_ingest/test-ingest-azure.sh +++ b/test_unstructured_ingest/test-ingest-azure.sh @@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=azure OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + PYTHONPATH=. ./unstructured/ingest/main.py \ azure \ --download-dir "$DOWNLOAD_DIR" \ @@ -21,4 +25,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --account-name azureunstructured1 \ --remote-url abfs://container1/ -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-biomed-api.sh b/test_unstructured_ingest/test-ingest-biomed-api.sh index 56152e305..0d2e31898 100755 --- a/test_unstructured_ingest/test-ingest-biomed-api.sh +++ b/test_unstructured_ingest/test-ingest-biomed-api.sh @@ -9,7 +9,11 @@ OUTPUT_FOLDER_NAME=biomed-api OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 2 $OUTPUT_FOLDER_NAME 10k +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + +"$SCRIPT_DIR"/check-num-files-expected-output.sh 2 $OUTPUT_FOLDER_NAME 10k PYTHONPATH=. ./unstructured/ingest/main.py \ biomed \ @@ -28,4 +32,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --max-request-time 30 \ --max-retries 5 \ -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-biomed-path.sh b/test_unstructured_ingest/test-ingest-biomed-path.sh index 4f125edb2..8945e7f02 100755 --- a/test_unstructured_ingest/test-ingest-biomed-path.sh +++ b/test_unstructured_ingest/test-ingest-biomed-path.sh @@ -9,7 +9,11 @@ OUTPUT_FOLDER_NAME=biomed-path OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 1 $OUTPUT_FOLDER_NAME 10k +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + +"$SCRIPT_DIR"/check-num-files-expected-output.sh 1 $OUTPUT_FOLDER_NAME 10k PYTHONPATH=. ./unstructured/ingest/main.py \ biomed \ @@ -26,4 +30,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --max-retries 5 \ --path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \ -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-box.sh b/test_unstructured_ingest/test-ingest-box.sh index a39c18783..bae700aef 100755 --- a/test_unstructured_ingest/test-ingest-box.sh +++ b/test_unstructured_ingest/test-ingest-box.sh @@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=box OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set." exit 0 @@ -35,4 +39,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --reprocess \ --verbose -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-confluence-diff.sh b/test_unstructured_ingest/test-ingest-confluence-diff.sh index 5e060a26d..3643bee03 100755 --- a/test_unstructured_ingest/test-ingest-confluence-diff.sh +++ b/test_unstructured_ingest/test-ingest-confluence-diff.sh @@ -10,6 +10,10 @@ OUTPUT_FOLDER_NAME=confluence-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set." exit 0 @@ -29,4 +33,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --api-token "$CONFLUENCE_API_TOKEN" \ --spaces testteamsp,MFS -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-confluence-large.sh b/test_unstructured_ingest/test-ingest-confluence-large.sh index 3c4ab864b..b1cece747 100755 --- a/test_unstructured_ingest/test-ingest-confluence-large.sh +++ b/test_unstructured_ingest/test-ingest-confluence-large.sh @@ -12,6 +12,10 @@ OUTPUT_FOLDER_NAME=confluence-large OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set." exit 0 @@ -43,7 +47,7 @@ OUTPUT_SUBFOLDER_NAME=testteamsp1 # Example: # Output dir: unstructured/test_unstructured_ingest/structured-output/confluence-large # Space dir: unstructured/test_unstructured_ingest/structured-output/confluence-large/testteamsp1 -sh "$SCRIPT_DIR"/check-num-dirs-output.sh 2 "$OUTPUT_FOLDER_NAME" +"$SCRIPT_DIR"/check-num-dirs-output.sh 2 "$OUTPUT_FOLDER_NAME" # We are expecting 250 files due to the --confluence-num-of-docs-from-each-space 250 that we provided. -sh "$SCRIPT_DIR"/check-num-files-output.sh 250 "$OUTPUT_FOLDER_NAME"/"$OUTPUT_SUBFOLDER_NAME"/ +"$SCRIPT_DIR"/check-num-files-output.sh 250 "$OUTPUT_FOLDER_NAME"/"$OUTPUT_SUBFOLDER_NAME"/ diff --git a/test_unstructured_ingest/test-ingest-delta-table.sh b/test_unstructured_ingest/test-ingest-delta-table.sh index 1f0cfb7f6..cfad28e10 100755 --- a/test_unstructured_ingest/test-ingest-delta-table.sh +++ b/test_unstructured_ingest/test-ingest-delta-table.sh @@ -7,25 +7,26 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=delta-table OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -DESTINATION_TABLE=/tmp/delta-table-dest +DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then echo "Skipping Delta Table ingest test because either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY env var was not set." exit 0 fi +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh + function cleanup() { - if [ -d "$DESTINATION_TABLE" ]; then - echo "cleaning up tmp directory: $DESTINATION_TABLE" - rm -rf "$DESTINATION_TABLE" - fi + cleanup_dir "$DESTINATION_TABLE" + cleanup_dir "$OUTPUT_DIR" } trap cleanup EXIT PYTHONPATH=. ./unstructured/ingest/main.py \ delta-table \ - --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --download-dir "$DOWNLOAD_DIR" \ --table-uri s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/ \ --output-dir "$OUTPUT_DIR" \ @@ -34,8 +35,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --verbose \ delta-table \ --write-column json_data \ - --table-uri $DESTINATION_TABLE + --table-uri "$DESTINATION_TABLE" -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME -python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py +python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE" diff --git a/test_unstructured_ingest/test-ingest-discord.sh b/test_unstructured_ingest/test-ingest-discord.sh index 1a3b46158..5c1ace568 100755 --- a/test_unstructured_ingest/test-ingest-discord.sh +++ b/test_unstructured_ingest/test-ingest-discord.sh @@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=discord OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$DISCORD_TOKEN" ]; then echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set." exit 0 @@ -24,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --channels 1099442333440802930,1099601456321003600 \ --token "$DISCORD_TOKEN" \ -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-dropbox.sh b/test_unstructured_ingest/test-ingest-dropbox.sh index 875cd9ff8..c921fbd06 100755 --- a/test_unstructured_ingest/test-ingest-dropbox.sh +++ b/test_unstructured_ingest/test-ingest-dropbox.sh @@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=dropbox OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then echo "Skipping Dropbox ingest test because one or more of these env vars is not set:" echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN" @@ -31,4 +35,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --remote-url "dropbox:// /" -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-elasticsearch.sh b/test_unstructured_ingest/test-ingest-elasticsearch.sh index a7202b618..7bafefd9f 100755 --- a/test_unstructured_ingest/test-ingest-elasticsearch.sh +++ b/test_unstructured_ingest/test-ingest-elasticsearch.sh @@ -9,12 +9,24 @@ OUTPUT_FOLDER_NAME=elasticsearch OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -# shellcheck source=/dev/null -sh scripts/elasticsearch-test-helpers/create-and-check-es.sh -wait +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh -# Kill the container so the script can be repeatedly run using the same ports -trap 'echo "Stopping Elasticsearch Docker container"; docker stop es-test' EXIT +function cleanup() { + # Kill the container so the script can be repeatedly run using the same ports + if docker ps --filter "name=es-test"; then + echo "Stopping Elasticsearch Docker container" + docker stop es-test + fi + + cleanup_dir "$OUTPUT_DIR" +} + +trap cleanup EXIT + +# shellcheck source=/dev/null +scripts/elasticsearch-test-helpers/create-and-check-es.sh +wait PYTHONPATH=. ./unstructured/ingest/main.py \ elasticsearch \ @@ -29,5 +41,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --url http://localhost:9200 \ --jq-query '{ethnicity, director, plot}' -echo "SCRIPT_DIR: $SCRIPT_DIR" -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-gcs.sh b/test_unstructured_ingest/test-ingest-gcs.sh index 02c42153c..7177b9fe3 100755 --- a/test_unstructured_ingest/test-ingest-gcs.sh +++ b/test_unstructured_ingest/test-ingest-gcs.sh @@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=gcs OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." exit 0 @@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --remote-url gs://utic-test-ingest-fixtures/ -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-github.sh b/test_unstructured_ingest/test-ingest-github.sh index c6d339467..d981d86ea 100755 --- a/test_unstructured_ingest/test-ingest-github.sh +++ b/test_unstructured_ingest/test-ingest-github.sh @@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=github OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + GH_READ_ONLY_ACCESS_TOKEN=${GH_READ_ONLY_ACCESS_TOKEN:-none} ACCESS_TOKEN_FLAGS="" @@ -35,4 +39,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --git-file-glob '*.html,*.txt' \ $ACCESS_TOKEN_FLAGS -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-gitlab.sh b/test_unstructured_ingest/test-ingest-gitlab.sh index 4408e597c..6db148b0a 100755 --- a/test_unstructured_ingest/test-ingest-gitlab.sh +++ b/test_unstructured_ingest/test-ingest-gitlab.sh @@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=gitlab OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + PYTHONPATH=. ./unstructured/ingest/main.py \ gitlab \ --download-dir "$DOWNLOAD_DIR" \ @@ -21,4 +25,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --git-file-glob '*.md,*.txt' \ --url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab -sh "$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-google-drive.sh b/test_unstructured_ingest/test-ingest-google-drive.sh index 3df059f95..96495e4a2 100755 --- a/test_unstructured_ingest/test-ingest-google-drive.sh +++ b/test_unstructured_ingest/test-ingest-google-drive.sh @@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=google-drive OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." echo "The Google Drive test content can be found at https://drive.google.com/drive/folders/1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr" @@ -32,4 +36,4 @@ PYTHONPATH=. unstructured/ingest/main.py \ --service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-jira.sh b/test_unstructured_ingest/test-ingest-jira.sh index c91ca8d8c..a36b4db4a 100755 --- a/test_unstructured_ingest/test-ingest-jira.sh +++ b/test_unstructured_ingest/test-ingest-jira.sh @@ -9,6 +9,10 @@ OUTPUT_FOLDER_NAME=jira-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$JIRA_INGEST_USER_EMAIL" ] || [ -z "$JIRA_INGEST_API_TOKEN" ]; then echo "Skipping Jira ingest test because the JIRA_INGEST_USER_EMAIL or JIRA_INGEST_API_TOKEN env var is not set." exit 0 @@ -52,4 +56,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh b/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh index cf6b134a6..2bea34519 100755 --- a/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh +++ b/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh @@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=local-single-file-with-encoding OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + PYTHONPATH=. ./unstructured/ingest/main.py \ local \ --metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ @@ -18,4 +22,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ set +e -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh b/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh index 0a6ae5c57..2a47f2a50 100755 --- a/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh +++ b/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh @@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=local-single-file-with-pdf-infer-table-structure OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + PYTHONPATH=. ./unstructured/ingest/main.py \ local \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ @@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ set +e -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-local-single-file.sh b/test_unstructured_ingest/test-ingest-local-single-file.sh index b40e4e5ba..dfd1c7bd9 100755 --- a/test_unstructured_ingest/test-ingest-local-single-file.sh +++ b/test_unstructured_ingest/test-ingest-local-single-file.sh @@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=local-single-file OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + PYTHONPATH=. ./unstructured/ingest/main.py \ local \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ @@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ set +e -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-local.sh b/test_unstructured_ingest/test-ingest-local.sh index 559b97c20..a00a5802e 100755 --- a/test_unstructured_ingest/test-ingest-local.sh +++ b/test_unstructured_ingest/test-ingest-local.sh @@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=local OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + PYTHONPATH=. ./unstructured/ingest/main.py \ local \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ @@ -17,4 +21,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --file-glob "*.html" \ --input-path example-docs -sh "$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-notion.sh b/test_unstructured_ingest/test-ingest-notion.sh index 95fdb2691..076ebe6f9 100755 --- a/test_unstructured_ingest/test-ingest-notion.sh +++ b/test_unstructured_ingest/test-ingest-notion.sh @@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=notion OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$NOTION_API_KEY" ]; then echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set." exit 0 @@ -25,4 +29,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --verbose -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-onedrive.sh b/test_unstructured_ingest/test-ingest-onedrive.sh index 90f0b8e8b..2e5a036e8 100755 --- a/test_unstructured_ingest/test-ingest-onedrive.sh +++ b/test_unstructured_ingest/test-ingest-onedrive.sh @@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=onedrive OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_USER_PNAME" ]; then echo "Skipping OneDrive ingest test because the MS_CLIENT_ID, MS_CLIENT_CRED, MS_USER_PNAME env var is not set." exit 0 @@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --path '/utic-test-ingest-fixtures' \ --recursive \ -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-outlook.sh b/test_unstructured_ingest/test-ingest-outlook.sh index 28b7a692b..25b807e11 100755 --- a/test_unstructured_ingest/test-ingest-outlook.sh +++ b/test_unstructured_ingest/test-ingest-outlook.sh @@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=outlook OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set." exit 0 @@ -31,4 +35,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh b/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh index 953f4e5c1..85ff54529 100755 --- a/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh @@ -9,6 +9,10 @@ OUTPUT_FOLDER_NAME=pdf-fast-reprocess OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME INPUT_PATH=$SCRIPT_DIR/download +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + echo "REPROCESS INPUT PATH" ls "$INPUT_PATH" @@ -26,4 +30,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-s3.sh b/test_unstructured_ingest/test-ingest-s3.sh index 2e577cbcf..5e5dc86e5 100755 --- a/test_unstructured_ingest/test-ingest-s3.sh +++ b/test_unstructured_ingest/test-ingest-s3.sh @@ -2,13 +2,18 @@ set -e + SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=s3 OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 3 $OUTPUT_FOLDER_NAME 20k +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + +"$SCRIPT_DIR"/check-num-files-expected-output.sh 3 $OUTPUT_FOLDER_NAME 20k PYTHONPATH=. ./unstructured/ingest/main.py \ s3 \ @@ -23,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --anonymous -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-salesforce.sh b/test_unstructured_ingest/test-ingest-salesforce.sh index c62f6c4b9..104642c73 100755 --- a/test_unstructured_ingest/test-ingest-salesforce.sh +++ b/test_unstructured_ingest/test-ingest-salesforce.sh @@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=salesforce OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$SALESFORCE_PRIVATE_KEY" ] && [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then echo "Skipping Salesforce ingest test because neither SALESFORCE_PRIVATE_KEY nor SALESFORCE_PRIVATE_KEY_PATH env vars are set." exit 0 @@ -37,4 +41,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --output-dir "$OUTPUT_DIR" \ --verbose -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-sharepoint.sh b/test_unstructured_ingest/test-ingest-sharepoint.sh index 5dbefecfd..09358d296 100755 --- a/test_unstructured_ingest/test-ingest-sharepoint.sh +++ b/test_unstructured_ingest/test-ingest-sharepoint.sh @@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=Sharepoint OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." exit 0 @@ -29,4 +33,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --path "Shared Documents" \ --recursive \ -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-slack.sh b/test_unstructured_ingest/test-ingest-slack.sh index 69e25dbae..9f30fa079 100755 --- a/test_unstructured_ingest/test-ingest-slack.sh +++ b/test_unstructured_ingest/test-ingest-slack.sh @@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=slack OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + if [ -z "$SLACK_TOKEN" ]; then echo "Skipping Slack ingest test because the SLACK_TOKEN env var is not set." exit 0 @@ -27,4 +31,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --start-date 2023-04-01 \ --end-date 2023-04-08T12:00:00-08:00 -sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-wikipedia.sh b/test_unstructured_ingest/test-ingest-wikipedia.sh index a8f9cdef2..a9506b91d 100755 --- a/test_unstructured_ingest/test-ingest-wikipedia.sh +++ b/test_unstructured_ingest/test-ingest-wikipedia.sh @@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=wikipedia OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +trap 'cleanup_dir "$OUTPUT_DIR"' EXIT + PYTHONPATH=. ./unstructured/ingest/main.py \ wikipedia \ --download-dir "$DOWNLOAD_DIR" \ @@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --verbose \ --page-title "Open Source Software" -sh "$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh index 04d945208..6c02af472 100755 --- a/test_unstructured_ingest/test-ingest.sh +++ b/test_unstructured_ingest/test-ingest.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -set -eux -o pipefail +set -eu -o pipefail SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) cd "$SCRIPT_DIR"/.. || exit 1 @@ -8,35 +8,55 @@ cd "$SCRIPT_DIR"/.. || exit 1 # NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs export OMP_THREAD_LIMIT=1 -./test_unstructured_ingest/test-ingest-s3.sh -./test_unstructured_ingest/test-ingest-azure.sh -./test_unstructured_ingest/test-ingest-box.sh -./test_unstructured_ingest/test-ingest-discord.sh -./test_unstructured_ingest/test-ingest-dropbox.sh -./test_unstructured_ingest/test-ingest-github.sh -./test_unstructured_ingest/test-ingest-gitlab.sh -./test_unstructured_ingest/test-ingest-google-drive.sh -./test_unstructured_ingest/test-ingest-wikipedia.sh -./test_unstructured_ingest/test-ingest-biomed-api.sh -./test_unstructured_ingest/test-ingest-biomed-path.sh -./test_unstructured_ingest/test-ingest-local.sh -./test_unstructured_ingest/test-ingest-slack.sh -./test_unstructured_ingest/test-ingest-against-api.sh -./test_unstructured_ingest/test-ingest-gcs.sh -./test_unstructured_ingest/test-ingest-onedrive.sh -./test_unstructured_ingest/test-ingest-outlook.sh -./test_unstructured_ingest/test-ingest-elasticsearch.sh -./test_unstructured_ingest/test-ingest-confluence-diff.sh -./test_unstructured_ingest/test-ingest-confluence-large.sh -./test_unstructured_ingest/test-ingest-airtable-diff.sh -./test_unstructured_ingest/test-ingest-airtable-large.sh -./test_unstructured_ingest/test-ingest-local-single-file.sh -./test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh -./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh -./test_unstructured_ingest/test-ingest-notion.sh -./test_unstructured_ingest/test-ingest-delta-table.sh -./test_unstructured_ingest/test-ingest-salesforce.sh -./test_unstructured_ingest/test-ingest-jira.sh -# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option -./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh -./test_unstructured_ingest/test-ingest-sharepoint.sh +scripts=( +'test-ingest-s3.sh' +'test-ingest-azure.sh' +'test-ingest-box.sh' +'test-ingest-discord.sh' +'test-ingest-dropbox.sh' +'test-ingest-github.sh' +'test-ingest-gitlab.sh' +'test-ingest-google-drive.sh' +'test-ingest-wikipedia.sh' +'test-ingest-biomed-api.sh' +'test-ingest-biomed-path.sh' +'test-ingest-local.sh' +'test-ingest-slack.sh' +'test-ingest-against-api.sh' +'test-ingest-gcs.sh' +'test-ingest-onedrive.sh' +'test-ingest-outlook.sh' +'test-ingest-elasticsearch.sh' +'test-ingest-confluence-diff.sh' +'test-ingest-confluence-large.sh' +'test-ingest-airtable-diff.sh' +'test-ingest-airtable-large.sh' +'test-ingest-local-single-file.sh' +'test-ingest-local-single-file-with-encoding.sh' +'test-ingest-local-single-file-with-pdf-infer-table-structure.sh' +'test-ingest-notion.sh' +'test-ingest-delta-table.sh' +'test-ingest-salesforce.sh' +'test-ingest-jira.sh' +## NOTE(yuming): The following test should be put after any tests with --preserve-downloads option +'test-ingest-pdf-fast-reprocess.sh' +'test-ingest-sharepoint.sh' +) + +CURRENT_SCRIPT="none" + +function print_last_run() { + if [ "$CURRENT_SCRIPT" != "none" ]; then + echo "Last ran script: $CURRENT_SCRIPT" + fi +} + +trap print_last_run EXIT + +for script in "${scripts[@]}"; do + CURRENT_SCRIPT=$script + echo "--------- RUNNING SCRIPT $script ---------" + echo "Running ./test_unstructured_ingest/$script" + ./test_unstructured_ingest/"$script" + echo "--------- FINISHED SCRIPT $script ---------" +done diff --git a/unstructured/__version__.py b/unstructured/__version__.py index bbc523ae9..ae5d8af26 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.17-dev2" # pragma: no cover +__version__ = "0.10.17-dev3" # pragma: no cover