mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-16 21:45:54 +00:00

### Description In an effort to mitigate resource consumption when running CI tests, clean up the download directory for ingest tests after each one.
42 lines
1.2 KiB
Bash
Executable File
42 lines
1.2 KiB
Bash
Executable File
#!/usr/bin/env bash

# A local connector to process pre-downloaded PDFs under `files-ingest-download` dir with --fast strategy

# Abort on the first unhandled command failure.
set -e

# Resolve the directory holding this script, then work from its parent so the
# relative ./unstructured/ingest/main.py path and PYTHONPATH=. used further
# down resolve.
SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1

# Where the ingest run writes its results, and where the PDFs are read from.
OUTPUT_FOLDER_NAME=pdf-fast-reprocess
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
INPUT_PATH=$SCRIPT_DIR/download

# Use MAX_PROCESSES when it is set and non-empty; otherwise assign it (and
# max_processes) the value printed by Python's os.cpu_count().
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}

# Falls back to "false" when CI is unset or empty.
CI=${CI:-"false"}

# NOTE(review): cleanup.sh presumably defines cleanup_dir, which the cleanup
# function below calls — confirm.
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh

#######################################
# Remove the artifacts of this test run.
# Always passes the output directory to cleanup_dir; additionally passes the
# input (download) directory when CI is exactly "true".
# Globals:   OUTPUT_DIR (read), INPUT_PATH (read), CI (read)
# Arguments: none
# Requires:  cleanup_dir to be defined by the time this function is called.
#######################################
function cleanup() {
  cleanup_dir "$OUTPUT_DIR"
  if [[ "$CI" == "true" ]]; then
    cleanup_dir "$INPUT_PATH"
  fi
}

# Run cleanup whenever the script exits, whether it succeeds or fails.
trap cleanup EXIT

# Diagnostic: list the input directory before processing it.
echo "REPROCESS INPUT PATH"
ls "$INPUT_PATH"

# Run the ingest CLI's `local` connector over every PDF under INPUT_PATH,
# writing results to OUTPUT_DIR. The excluded metadata fields are left out of
# the output.
PYTHONPATH=. ./unstructured/ingest/main.py \
  local \
  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --num-processes "$max_processes" \
  --strategy fast \
  --reprocess \
  --output-dir "$OUTPUT_DIR" \
  --verbose \
  --file-glob "*.pdf" \
  --input-path "$INPUT_PATH" \
  --recursive

# NOTE(review): presumably compares the generated output folder against the
# expected fixtures — confirm in check-diff-expected-output.sh.
"$SCRIPT_DIR"/check-diff-expected-output.sh "$OUTPUT_FOLDER_NAME"