mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-30 19:56:41 +00:00

### Description In an effort to speed up the ingest tests, bumping the number of processes to the max on the system for each
35 lines
1.1 KiB
Bash
Executable File
35 lines
1.1 KiB
Bash
Executable File
#!/usr/bin/env bash

# Runs the `local` ingest connector over the pre-downloaded PDFs found under
# "$SCRIPT_DIR/download", using `--strategy fast` together with `--reprocess`,
# and then hands the output folder name to check-diff-expected-output.sh.
#
# Environment:
#   MAX_PROCESSES  Optional. Value passed to --num-processes. When unset or
#                  empty it defaults to what Python's os.cpu_count() reports.
#
# Output is written to "$SCRIPT_DIR/structured-output/pdf-fast-reprocess".

set -e

# Resolve the directory holding this script, then run from its parent so the
# relative ./unstructured/ingest/main.py path and PYTHONPATH=. resolve.
SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1

OUTPUT_FOLDER_NAME=pdf-fast-reprocess
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
INPUT_PATH=$SCRIPT_DIR/download

# `:=` also assigns the computed default back to MAX_PROCESSES when it was
# unset or empty; a caller-supplied value is used as-is.
max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")}

# cleanup_dir is not defined in this file; presumably cleanup.sh provides it.
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh

# The EXIT trap fires on every exit path, including an early abort from set -e.
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT

# Show what is about to be processed; under set -e a missing input directory
# stops the script here.
echo "REPROCESS INPUT PATH"
ls "$INPUT_PATH"

PYTHONPATH=. ./unstructured/ingest/main.py \
  local \
  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --num-processes "$max_processes" \
  --strategy fast \
  --reprocess \
  --output-dir "$OUTPUT_DIR" \
  --verbose \
  --file-glob "*.pdf" \
  --input-path "$INPUT_PATH" \
  --recursive

# Quoted so the folder name always reaches the checker as a single argument.
"$SCRIPT_DIR"/check-diff-expected-output.sh "$OUTPUT_FOLDER_NAME"