mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-15 13:05:54 +00:00

### Description
In an effort to reduce resource consumption when running CI tests, clean up the download directory for ingest tests after each one.
46 lines
1.5 KiB
Bash
Executable File
46 lines
1.5 KiB
Bash
Executable File
#!/usr/bin/env bash
set -e

# Description: This test checks if all the processed content is the same as the expected outputs.
# Also checks if a large table can be ingested properly.
#
# Environment read in this section:
#   CI             - defaults to "false" when unset or empty.
#   MAX_PROCESSES  - defaults to the CPU count reported by python3 when unset or empty.

# Resolve the directory that holds this script, then work from its parent so
# the relative paths used further down resolve from there.
SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1

OUTPUT_FOLDER_NAME=airtable-diff
OUTPUT_DIR="$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME"
DOWNLOAD_DIR="$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME"
CI=${CI:-"false"}
# Note: ':=' also assigns the computed default back to MAX_PROCESSES itself.
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}

# cleanup.sh is sourced from the script's own directory; it is expected to
# provide the cleanup_dir helper used later in this script.
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
#######################################
# Remove the artifacts produced by this test.
# Always cleans the structured-output directory; the download directory is
# cleaned only when CI is exactly "true".
# Globals:   OUTPUT_DIR (read), DOWNLOAD_DIR (read), CI (read)
# Arguments: none
# Requires:  cleanup_dir (not defined in this block; expected from cleanup.sh)
#######################################
cleanup() {
  cleanup_dir "$OUTPUT_DIR"
  if [[ "$CI" == "true" ]]; then
    cleanup_dir "$DOWNLOAD_DIR"
  fi
}
# Run cleanup on every exit path, including the early skip below.
trap cleanup EXIT

# Identifiers of the two Airtable bases passed to --list-of-paths.
VARIED_DATA_BASE_ID="app5YQxSfp220fWtm"
VARIED_DATA_BASE_ID_2="appJ43QmP8I17zu88"

# Without a token the ingest cannot run; skip with a success status rather
# than failing. The ':-' default keeps this check safe if nounset is enabled.
if [[ -z "${AIRTABLE_PERSONAL_ACCESS_TOKEN:-}" ]]; then
  echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set."
  exit 0
fi

# Both base IDs are passed as one space-separated argument to --list-of-paths.
PYTHONPATH=. ./unstructured/ingest/main.py \
  airtable \
  --download-dir "$DOWNLOAD_DIR" \
  --personal-access-token "$AIRTABLE_PERSONAL_ACCESS_TOKEN" \
  --list-of-paths "$VARIED_DATA_BASE_ID $VARIED_DATA_BASE_ID_2" \
  --metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.date,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --num-processes "$max_processes" \
  --preserve-downloads \
  --reprocess \
  --output-dir "$OUTPUT_DIR"

# Hand the output folder name to the diff checker (presumably compares the
# generated output with the expected fixtures; see that script for details).
"$SCRIPT_DIR"/check-diff-expected-output.sh "$OUTPUT_FOLDER_NAME"