chore: ingest test file cleanup (#1366)

This commit is contained in:
Roman Isecke 2023-09-21 14:51:08 -04:00 committed by GitHub
parent 9e88929a8c
commit e88f7d9eab
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
42 changed files with 275 additions and 182 deletions

View File

@ -1,4 +1,4 @@
## 0.10.17-dev2 ## 0.10.17-dev3
### Enhancements ### Enhancements

17
test_unstructured_ingest/check-diff-expected-output.sh Normal file → Executable file
View File

@ -1,7 +1,7 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# Description: Compare the structured output files to the expected output files and exit with an error # Description: Compare the structured output files to the expected output files and exit with an error
# if they are different. If the environment variable OVERWRITE_FIXTURES is not "false", # if they are different. If the environment variable OVERWRITE_FIXTURES is not "false",
# then this script will instead copy the output files to the expected output directory. # then this script will instead copy the output files to the expected output directory.
# #
# Arguments: # Arguments:
@ -14,12 +14,27 @@ set +e
SCRIPT_DIR=$(dirname "$(realpath "$0")") SCRIPT_DIR=$(dirname "$(realpath "$0")")
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false} OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
TMP_DIRECTORY_CLEANUP=${TMP_DIRECTORY_CLEANUP:-true}
OUTPUT_FOLDER_NAME=$1 OUTPUT_FOLDER_NAME=$1
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
OUTPUT_DIR_TEXT=$SCRIPT_DIR/text-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR_TEXT=$SCRIPT_DIR/text-output/$OUTPUT_FOLDER_NAME
EXPECTED_OUTPUT_DIR=$SCRIPT_DIR/expected-structured-output/$OUTPUT_FOLDER_NAME EXPECTED_OUTPUT_DIR=$SCRIPT_DIR/expected-structured-output/$OUTPUT_FOLDER_NAME
EXPECTED_OUTPUT_DIR_TEXT=$SCRIPT_DIR/expected-text-output/$OUTPUT_FOLDER_NAME EXPECTED_OUTPUT_DIR_TEXT=$SCRIPT_DIR/expected-text-output/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
# Exit handler: remove the temporary text-output directories created for the
# comparison, unless TMP_DIRECTORY_CLEANUP is anything other than "true".
# Reads: TMP_DIRECTORY_CLEANUP, EXPECTED_OUTPUT_DIR_TEXT, OUTPUT_DIR_TEXT.
# Relies on cleanup_dir, which is sourced from cleanup.sh.
function cleanup() {
  if [ "$TMP_DIRECTORY_CLEANUP" != "true" ]; then
    echo "skipping tmp directory cleanup"
  else
    local tmp_dir
    # Same order as before: expected-text dir first, then generated-text dir.
    for tmp_dir in "$EXPECTED_OUTPUT_DIR_TEXT" "$OUTPUT_DIR_TEXT"; do
      cleanup_dir "$tmp_dir"
    done
  fi
}
trap cleanup EXIT
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64 # to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [ "$OVERWRITE_FIXTURES" != "false" ]; then if [ "$OVERWRITE_FIXTURES" != "false" ]; then
# remove folder if it exists # remove folder if it exists

0
test_unstructured_ingest/check-num-dirs-output.sh Normal file → Executable file
View File

View File

0
test_unstructured_ingest/check-num-files-output.sh Normal file → Executable file
View File

View File

View File

@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Recursively delete a directory if it exists, logging what was done.
#
# Arguments:
#   $1 - path of the directory to remove
# Outputs:
#   Progress messages on stdout.
# Notes:
#   Anything that is not an existing directory (missing path, regular file,
#   empty argument) is left untouched and reported as skipped.
function cleanup_dir() {
  local dir_to_cleanup="${1}"
  echo "--- Running cleanup of $dir_to_cleanup ---"
  if [ -d "$dir_to_cleanup" ]; then
    echo "cleaning up directory: $dir_to_cleanup"
    # "--" ends option parsing so a relative path that starts with "-" is
    # treated as a directory name rather than as rm options.
    rm -rf -- "$dir_to_cleanup"
  else
    echo "$dir_to_cleanup does not exist or is not a directory, skipping deletion"
  fi
  echo "--- Cleanup done ---"
}

View File

@ -1,12 +1,17 @@
import click
from deltalake import DeltaTable from deltalake import DeltaTable
def run_check(): @click.command()
@click.option("--table-uri", type=str)
def run_check(table_uri):
print(f"Checking contents of table at {table_uri}")
delta_table = DeltaTable( delta_table = DeltaTable(
table_uri="/tmp/delta-table-dest", table_uri=table_uri,
) )
assert len(delta_table.to_pandas()) == 10 assert len(delta_table.to_pandas()) == 10
print("table check complete")
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -1,92 +0,0 @@
[
{
"type": "Title",
"element_id": "0540311f6c077fe8f797080918b8d74b",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "My First Heading"
},
{
"type": "Title",
"element_id": "399af454cb1368b8257ed406b430de84",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "My first paragraph."
},
{
"type": "Title",
"element_id": "b4cf0d13edfa976816649971bd640a66",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Some CP1252-specific characters:"
},
{
"type": "UncategorizedText",
"element_id": "ada7c3084f437d31d297f85da3941a55",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 2
},
"text": "¡\t¢\t£\t¤\t¥\t¦\t§\t¨\t©\tª\t«\t¬\tSHY\t®\t¯"
},
{
"type": "UncategorizedText",
"element_id": "dda5e8c4d245c1954ecb64e5dfea598d",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 3
},
"text": "°\t±\t²\t³\t´\tµ\t¶\t·\t¸\t¹\tº\t»\t¼\t½\t¾\t¿"
},
{
"type": "Title",
"element_id": "85df09b375e5813aefa3b5f30c8ddff8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 4
},
"text": "À\tÁ\tÂ\tÃ\tÄ\tÅ\tÆ\tÇ\tÈ\tÉ\tÊ\tË\tÌ\tÍ\tÎ\tÏ"
},
{
"type": "Title",
"element_id": "2726d2569cd7a6cecb79a6e46bb0b2b3",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 5
},
"text": "Ð\tÑ\tÒ\tÓ\tÔ\tÕ\tÖ\t×\tØ\tÙ\tÚ\tÛ\tÜ\tÝ\tÞ\tß"
},
{
"type": "Title",
"element_id": "2b01f3e428520f6e47d8513292688cf6",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 6
},
"text": "à\tá\tâ\tã\tä\tå\tæ\tç\tè\té\tê\të\tì\tí\tî\tï"
},
{
"type": "Title",
"element_id": "5ed256e41bfb169af5f50524b9593a16",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 7
},
"text": "ð\tñ\tò\tó\tô\tõ\tö\t÷\tø\tù\tú\tû\tü\tý\tþ\tÿ"
}
]

View File

@ -11,6 +11,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=api-ingest-output OUTPUT_FOLDER_NAME=api-ingest-output
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \ PYTHONPATH=. ./unstructured/ingest/main.py \
local \ local \
--api-key "$UNS_API_KEY" \ --api-key "$UNS_API_KEY" \
@ -24,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--file-glob "*1p.txt" \ --file-glob "*1p.txt" \
--input-path example-docs --input-path example-docs
sh "$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME

View File

@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=airtable-diff
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
VARIED_DATA_BASE_ID="app5YQxSfp220fWtm" VARIED_DATA_BASE_ID="app5YQxSfp220fWtm"
VARIED_DATA_BASE_ID_2="appJ43QmP8I17zu88" VARIED_DATA_BASE_ID_2="appJ43QmP8I17zu88"
@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--reprocess \ --reprocess \
--output-dir "$OUTPUT_DIR" --output-dir "$OUTPUT_DIR"
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -12,6 +12,10 @@ OUTPUT_FOLDER_NAME=airtable-large
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then
echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set." echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set."
exit 0 exit 0
@ -35,16 +39,16 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
# We are expecting fifteen directories: fourteen bases and the parent directory # We are expecting fifteen directories: fourteen bases and the parent directory
sh "$SCRIPT_DIR"/check-num-dirs-output.sh 15 "$OUTPUT_FOLDER_NAME" "$SCRIPT_DIR"/check-num-dirs-output.sh 15 "$OUTPUT_FOLDER_NAME"
# We are expecting 101 files: 100 tables and the parent directory # We are expecting 101 files: 100 tables and the parent directory
sh "$SCRIPT_DIR"/check-num-files-output.sh 101 "$OUTPUT_FOLDER_NAME"/"$LARGE_BASE_BASE_ID"/ "$SCRIPT_DIR"/check-num-files-output.sh 101 "$OUTPUT_FOLDER_NAME"/"$LARGE_BASE_BASE_ID"/
# Test on ingesting a large number of bases # Test on ingesting a large number of bases
for i in {1..12}; do for i in {1..12}; do
var="LARGE_WORKSPACE_BASE_ID_$i" var="LARGE_WORKSPACE_BASE_ID_$i"
sh "$SCRIPT_DIR"/check-num-files-output.sh 12 "$OUTPUT_FOLDER_NAME"/"${!var}" "$SCRIPT_DIR"/check-num-files-output.sh 12 "$OUTPUT_FOLDER_NAME"/"${!var}"
done done
# Test on ingesting a table with lots of rows # Test on ingesting a table with lots of rows
sh "$SCRIPT_DIR"/check-num-rows-and-columns-output.sh 39999 "$OUTPUT_DIR"/"$LARGE_TABLE_BASE_ID"/"$LARGE_TABLE_TABLE_ID".json "$SCRIPT_DIR"/check-num-rows-and-columns-output.sh 39999 "$OUTPUT_DIR"/"$LARGE_TABLE_BASE_ID"/"$LARGE_TABLE_TABLE_ID".json

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=azure
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \ PYTHONPATH=. ./unstructured/ingest/main.py \
azure \ azure \
--download-dir "$DOWNLOAD_DIR" \ --download-dir "$DOWNLOAD_DIR" \
@ -21,4 +25,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--account-name azureunstructured1 \ --account-name azureunstructured1 \
--remote-url abfs://container1/ --remote-url abfs://container1/
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -9,7 +9,11 @@ OUTPUT_FOLDER_NAME=biomed-api
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 2 $OUTPUT_FOLDER_NAME 10k # shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
"$SCRIPT_DIR"/check-num-files-expected-output.sh 2 $OUTPUT_FOLDER_NAME 10k
PYTHONPATH=. ./unstructured/ingest/main.py \ PYTHONPATH=. ./unstructured/ingest/main.py \
biomed \ biomed \
@ -28,4 +32,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--max-request-time 30 \ --max-request-time 30 \
--max-retries 5 \ --max-retries 5 \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -9,7 +9,11 @@ OUTPUT_FOLDER_NAME=biomed-path
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 1 $OUTPUT_FOLDER_NAME 10k # shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
"$SCRIPT_DIR"/check-num-files-expected-output.sh 1 $OUTPUT_FOLDER_NAME 10k
PYTHONPATH=. ./unstructured/ingest/main.py \ PYTHONPATH=. ./unstructured/ingest/main.py \
biomed \ biomed \
@ -26,4 +30,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--max-retries 5 \ --max-retries 5 \
--path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \ --path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=box
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then
echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set." echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set."
exit 0 exit 0
@ -35,4 +39,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--reprocess \ --reprocess \
--verbose --verbose
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -10,6 +10,10 @@ OUTPUT_FOLDER_NAME=confluence-diff
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then
echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set." echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set."
exit 0 exit 0
@ -29,4 +33,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--api-token "$CONFLUENCE_API_TOKEN" \ --api-token "$CONFLUENCE_API_TOKEN" \
--spaces testteamsp,MFS --spaces testteamsp,MFS
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -12,6 +12,10 @@ OUTPUT_FOLDER_NAME=confluence-large
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then
echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set." echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set."
exit 0 exit 0
@ -43,7 +47,7 @@ OUTPUT_SUBFOLDER_NAME=testteamsp1
# Example: # Example:
# Output dir: unstructured/test_unstructured_ingest/structured-output/confluence-large # Output dir: unstructured/test_unstructured_ingest/structured-output/confluence-large
# Space dir: unstructured/test_unstructured_ingest/structured-output/confluence-large/testteamsp1 # Space dir: unstructured/test_unstructured_ingest/structured-output/confluence-large/testteamsp1
sh "$SCRIPT_DIR"/check-num-dirs-output.sh 2 "$OUTPUT_FOLDER_NAME" "$SCRIPT_DIR"/check-num-dirs-output.sh 2 "$OUTPUT_FOLDER_NAME"
# We are expecting 250 files due to the --confluence-num-of-docs-from-each-space 250 that we provided. # We are expecting 250 files due to the --confluence-num-of-docs-from-each-space 250 that we provided.
sh "$SCRIPT_DIR"/check-num-files-output.sh 250 "$OUTPUT_FOLDER_NAME"/"$OUTPUT_SUBFOLDER_NAME"/ "$SCRIPT_DIR"/check-num-files-output.sh 250 "$OUTPUT_FOLDER_NAME"/"$OUTPUT_SUBFOLDER_NAME"/

View File

@ -7,25 +7,26 @@ cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=delta-table OUTPUT_FOLDER_NAME=delta-table
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
DESTINATION_TABLE=/tmp/delta-table-dest DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then
echo "Skipping Delta Table ingest test because either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY env var was not set." echo "Skipping Delta Table ingest test because either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY env var was not set."
exit 0 exit 0
fi fi
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
function cleanup() { function cleanup() {
if [ -d "$DESTINATION_TABLE" ]; then cleanup_dir "$DESTINATION_TABLE"
echo "cleaning up tmp directory: $DESTINATION_TABLE" cleanup_dir "$OUTPUT_DIR"
rm -rf "$DESTINATION_TABLE"
fi
} }
trap cleanup EXIT trap cleanup EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \ PYTHONPATH=. ./unstructured/ingest/main.py \
delta-table \ delta-table \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--download-dir "$DOWNLOAD_DIR" \ --download-dir "$DOWNLOAD_DIR" \
--table-uri s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/ \ --table-uri s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/ \
--output-dir "$OUTPUT_DIR" \ --output-dir "$OUTPUT_DIR" \
@ -34,8 +35,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--verbose \ --verbose \
delta-table \ delta-table \
--write-column json_data \ --write-column json_data \
--table-uri $DESTINATION_TABLE --table-uri "$DESTINATION_TABLE"
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=discord
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$DISCORD_TOKEN" ]; then if [ -z "$DISCORD_TOKEN" ]; then
echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set." echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set."
exit 0 exit 0
@ -24,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--channels 1099442333440802930,1099601456321003600 \ --channels 1099442333440802930,1099601456321003600 \
--token "$DISCORD_TOKEN" \ --token "$DISCORD_TOKEN" \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=dropbox
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then
echo "Skipping Dropbox ingest test because one or more of these env vars is not set:" echo "Skipping Dropbox ingest test because one or more of these env vars is not set:"
echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN" echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN"
@ -31,4 +35,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--remote-url "dropbox:// /" --remote-url "dropbox:// /"
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -9,12 +9,24 @@ OUTPUT_FOLDER_NAME=elasticsearch
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck source=/dev/null # shellcheck disable=SC1091
sh scripts/elasticsearch-test-helpers/create-and-check-es.sh source "$SCRIPT_DIR"/cleanup.sh
wait
# Kill the container so the script can be repeatedly run using the same ports function cleanup() {
trap 'echo "Stopping Elasticsearch Docker container"; docker stop es-test' EXIT # Kill the container so the script can be repeatedly run using the same ports
if docker ps --filter "name=es-test"; then
echo "Stopping Elasticsearch Docker container"
docker stop es-test
fi
cleanup_dir "$OUTPUT_DIR"
}
trap cleanup EXIT
# shellcheck source=/dev/null
scripts/elasticsearch-test-helpers/create-and-check-es.sh
wait
PYTHONPATH=. ./unstructured/ingest/main.py \ PYTHONPATH=. ./unstructured/ingest/main.py \
elasticsearch \ elasticsearch \
@ -29,5 +41,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--url http://localhost:9200 \ --url http://localhost:9200 \
--jq-query '{ethnicity, director, plot}' --jq-query '{ethnicity, director, plot}'
echo "SCRIPT_DIR: $SCRIPT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=gcs
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
exit 0 exit 0
@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--remote-url gs://utic-test-ingest-fixtures/ --remote-url gs://utic-test-ingest-fixtures/
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=github
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
GH_READ_ONLY_ACCESS_TOKEN=${GH_READ_ONLY_ACCESS_TOKEN:-none} GH_READ_ONLY_ACCESS_TOKEN=${GH_READ_ONLY_ACCESS_TOKEN:-none}
ACCESS_TOKEN_FLAGS="" ACCESS_TOKEN_FLAGS=""
@ -35,4 +39,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--git-file-glob '*.html,*.txt' \ --git-file-glob '*.html,*.txt' \
$ACCESS_TOKEN_FLAGS $ACCESS_TOKEN_FLAGS
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=gitlab
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \ PYTHONPATH=. ./unstructured/ingest/main.py \
gitlab \ gitlab \
--download-dir "$DOWNLOAD_DIR" \ --download-dir "$DOWNLOAD_DIR" \
@ -21,4 +25,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--git-file-glob '*.md,*.txt' \ --git-file-glob '*.md,*.txt' \
--url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab --url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab
sh "$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=google-drive
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
echo "The Google Drive test content can be found at https://drive.google.com/drive/folders/1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr" echo "The Google Drive test content can be found at https://drive.google.com/drive/folders/1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr"
@ -32,4 +36,4 @@ PYTHONPATH=. unstructured/ingest/main.py \
--service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" --service-account-key "$GCP_INGEST_SERVICE_KEY_FILE"
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -9,6 +9,10 @@ OUTPUT_FOLDER_NAME=jira-diff
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$JIRA_INGEST_USER_EMAIL" ] || [ -z "$JIRA_INGEST_API_TOKEN" ]; then if [ -z "$JIRA_INGEST_USER_EMAIL" ] || [ -z "$JIRA_INGEST_API_TOKEN" ]; then
echo "Skipping Jira ingest test because the JIRA_INGEST_USER_EMAIL or JIRA_INGEST_API_TOKEN env var is not set." echo "Skipping Jira ingest test because the JIRA_INGEST_USER_EMAIL or JIRA_INGEST_API_TOKEN env var is not set."
exit 0 exit 0
@ -52,4 +56,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=local-single-file-with-encoding OUTPUT_FOLDER_NAME=local-single-file-with-encoding
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \ PYTHONPATH=. ./unstructured/ingest/main.py \
local \ local \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
@ -18,4 +22,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
set +e set +e
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=local-single-file-with-pdf-infer-table-structure OUTPUT_FOLDER_NAME=local-single-file-with-pdf-infer-table-structure
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \ PYTHONPATH=. ./unstructured/ingest/main.py \
local \ local \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
set +e set +e
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=local-single-file OUTPUT_FOLDER_NAME=local-single-file
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \ PYTHONPATH=. ./unstructured/ingest/main.py \
local \ local \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
set +e set +e
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=local OUTPUT_FOLDER_NAME=local
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \ PYTHONPATH=. ./unstructured/ingest/main.py \
local \ local \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
@ -17,4 +21,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--file-glob "*.html" \ --file-glob "*.html" \
--input-path example-docs --input-path example-docs
sh "$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=notion
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$NOTION_API_KEY" ]; then if [ -z "$NOTION_API_KEY" ]; then
echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set." echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set."
exit 0 exit 0
@ -25,4 +29,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--verbose --verbose
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=onedrive
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_USER_PNAME" ]; then if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_USER_PNAME" ]; then
echo "Skipping OneDrive ingest test because the MS_CLIENT_ID, MS_CLIENT_CRED, MS_USER_PNAME env var is not set." echo "Skipping OneDrive ingest test because the MS_CLIENT_ID, MS_CLIENT_CRED, MS_USER_PNAME env var is not set."
exit 0 exit 0
@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--path '/utic-test-ingest-fixtures' \ --path '/utic-test-ingest-fixtures' \
--recursive \ --recursive \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=outlook
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then
echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set." echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set."
exit 0 exit 0
@ -31,4 +35,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -9,6 +9,10 @@ OUTPUT_FOLDER_NAME=pdf-fast-reprocess
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
INPUT_PATH=$SCRIPT_DIR/download INPUT_PATH=$SCRIPT_DIR/download
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
echo "REPROCESS INPUT PATH" echo "REPROCESS INPUT PATH"
ls "$INPUT_PATH" ls "$INPUT_PATH"
@ -26,4 +30,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -2,13 +2,18 @@
set -e set -e
SCRIPT_DIR=$(dirname "$(realpath "$0")") SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1 cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=s3 OUTPUT_FOLDER_NAME=s3
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 3 $OUTPUT_FOLDER_NAME 20k # shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
"$SCRIPT_DIR"/check-num-files-expected-output.sh 3 $OUTPUT_FOLDER_NAME 20k
PYTHONPATH=. ./unstructured/ingest/main.py \ PYTHONPATH=. ./unstructured/ingest/main.py \
s3 \ s3 \
@ -23,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--anonymous --anonymous
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=salesforce
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$SALESFORCE_PRIVATE_KEY" ] && [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then if [ -z "$SALESFORCE_PRIVATE_KEY" ] && [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then
echo "Skipping Salesforce ingest test because neither SALESFORCE_PRIVATE_KEY nor SALESFORCE_PRIVATE_KEY_PATH env vars are set." echo "Skipping Salesforce ingest test because neither SALESFORCE_PRIVATE_KEY nor SALESFORCE_PRIVATE_KEY_PATH env vars are set."
exit 0 exit 0
@ -37,4 +41,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--output-dir "$OUTPUT_DIR" \ --output-dir "$OUTPUT_DIR" \
--verbose --verbose
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=Sharepoint
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then
echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
exit 0 exit 0
@ -29,4 +33,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--path "Shared Documents" \ --path "Shared Documents" \
--recursive \ --recursive \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=slack
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$SLACK_TOKEN" ]; then if [ -z "$SLACK_TOKEN" ]; then
echo "Skipping Slack ingest test because the SLACK_TOKEN env var is not set." echo "Skipping Slack ingest test because the SLACK_TOKEN env var is not set."
exit 0 exit 0
@ -27,4 +31,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--start-date 2023-04-01 \ --start-date 2023-04-01 \
--end-date 2023-04-08T12:00:00-08:00 --end-date 2023-04-08T12:00:00-08:00
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=wikipedia
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \ PYTHONPATH=. ./unstructured/ingest/main.py \
wikipedia \ wikipedia \
--download-dir "$DOWNLOAD_DIR" \ --download-dir "$DOWNLOAD_DIR" \
@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--verbose \ --verbose \
--page-title "Open Source Software" --page-title "Open Source Software"
sh "$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME

View File

@ -1,6 +1,6 @@
#!/usr/bin/env bash #!/usr/bin/env bash
set -eux -o pipefail set -eu -o pipefail
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1 cd "$SCRIPT_DIR"/.. || exit 1
@ -8,35 +8,55 @@ cd "$SCRIPT_DIR"/.. || exit 1
# NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs # NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
export OMP_THREAD_LIMIT=1 export OMP_THREAD_LIMIT=1
./test_unstructured_ingest/test-ingest-s3.sh scripts=(
./test_unstructured_ingest/test-ingest-azure.sh 'test-ingest-s3.sh'
./test_unstructured_ingest/test-ingest-box.sh 'test-ingest-azure.sh'
./test_unstructured_ingest/test-ingest-discord.sh 'test-ingest-box.sh'
./test_unstructured_ingest/test-ingest-dropbox.sh 'test-ingest-discord.sh'
./test_unstructured_ingest/test-ingest-github.sh 'test-ingest-dropbox.sh'
./test_unstructured_ingest/test-ingest-gitlab.sh 'test-ingest-github.sh'
./test_unstructured_ingest/test-ingest-google-drive.sh 'test-ingest-gitlab.sh'
./test_unstructured_ingest/test-ingest-wikipedia.sh 'test-ingest-google-drive.sh'
./test_unstructured_ingest/test-ingest-biomed-api.sh 'test-ingest-wikipedia.sh'
./test_unstructured_ingest/test-ingest-biomed-path.sh 'test-ingest-biomed-api.sh'
./test_unstructured_ingest/test-ingest-local.sh 'test-ingest-biomed-path.sh'
./test_unstructured_ingest/test-ingest-slack.sh 'test-ingest-local.sh'
./test_unstructured_ingest/test-ingest-against-api.sh 'test-ingest-slack.sh'
./test_unstructured_ingest/test-ingest-gcs.sh 'test-ingest-against-api.sh'
./test_unstructured_ingest/test-ingest-onedrive.sh 'test-ingest-gcs.sh'
./test_unstructured_ingest/test-ingest-outlook.sh 'test-ingest-onedrive.sh'
./test_unstructured_ingest/test-ingest-elasticsearch.sh 'test-ingest-outlook.sh'
./test_unstructured_ingest/test-ingest-confluence-diff.sh 'test-ingest-elasticsearch.sh'
./test_unstructured_ingest/test-ingest-confluence-large.sh 'test-ingest-confluence-diff.sh'
./test_unstructured_ingest/test-ingest-airtable-diff.sh 'test-ingest-confluence-large.sh'
./test_unstructured_ingest/test-ingest-airtable-large.sh 'test-ingest-airtable-diff.sh'
./test_unstructured_ingest/test-ingest-local-single-file.sh 'test-ingest-airtable-large.sh'
./test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh 'test-ingest-local-single-file.sh'
./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh 'test-ingest-local-single-file-with-encoding.sh'
./test_unstructured_ingest/test-ingest-notion.sh 'test-ingest-local-single-file-with-pdf-infer-table-structure.sh'
./test_unstructured_ingest/test-ingest-delta-table.sh 'test-ingest-notion.sh'
./test_unstructured_ingest/test-ingest-salesforce.sh 'test-ingest-delta-table.sh'
./test_unstructured_ingest/test-ingest-jira.sh 'test-ingest-salesforce.sh'
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option 'test-ingest-jira.sh'
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh ## NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
./test_unstructured_ingest/test-ingest-sharepoint.sh 'test-ingest-pdf-fast-reprocess.sh'
'test-ingest-sharepoint.sh'
)
CURRENT_SCRIPT="none"
# Report the ingest test script that was most recently started.
# Globals:   CURRENT_SCRIPT (read)
# Outputs:   one line on stdout naming the script; nothing while
#            CURRENT_SCRIPT still holds the "none" placeholder.
print_last_run() {
  # Placeholder still in place: there is nothing to report.
  if [[ "$CURRENT_SCRIPT" == "none" ]]; then
    return 0
  fi
  echo "Last ran script: $CURRENT_SCRIPT"
}
trap print_last_run EXIT
# Run every entry of the scripts array in order. CURRENT_SCRIPT is updated
# before each launch so the EXIT trap (print_last_run) can name the script
# that was most recently started.
for test_script in "${scripts[@]}"; do
  CURRENT_SCRIPT=$test_script
  echo "--------- RUNNING SCRIPT $test_script ---------"
  echo "Running ./test_unstructured_ingest/$test_script"
  "./test_unstructured_ingest/$test_script"
  echo "--------- FINISHED SCRIPT $test_script ---------"
done

View File

@ -1 +1 @@
__version__ = "0.10.17-dev2" # pragma: no cover __version__ = "0.10.17-dev3" # pragma: no cover