53 lines
1.6 KiB
Bash
Raw Normal View History

#!/usr/bin/env bash
# Abort on the first unhandled command failure.
set -e

# Resolve locations relative to this script: DEST_PATH is the directory that
# holds the script, SCRIPT_DIR is its parent, and the working directory
# becomes the parent of SCRIPT_DIR.
DEST_PATH="$(dirname "$(realpath "$0")")"
SCRIPT_DIR="$(dirname "${DEST_PATH}")"
cd "${SCRIPT_DIR}/.." || exit 1

# Local output and work directories. OUTPUT_ROOT may be supplied by the
# environment and falls back to SCRIPT_DIR.
OUTPUT_FOLDER_NAME="s3-dest"
OUTPUT_ROOT="${OUTPUT_ROOT:-${SCRIPT_DIR}}"
OUTPUT_DIR="${OUTPUT_ROOT}/structured-output/${OUTPUT_FOLDER_NAME}"
WORK_DIR="${OUTPUT_ROOT}/workdir/${OUTPUT_FOLDER_NAME}"

# Use MAX_PROCESSES when it is set and non-empty; otherwise assign it (and
# max_processes) the CPU count reported by python3.
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}

# Each run uploads under its own freshly generated UUID prefix.
DESTINATION_S3="s3://utic-dev-tech-fixtures/utic-ingest-test-fixtures-output/$(uuidgen)/"

# Default CI to "false" when the environment does not provide it.
CI="${CI:-false}"

# NOTE(review): cleanup.sh presumably defines cleanup_dir, which the cleanup
# function in this script calls -- confirm against that file.
# shellcheck disable=SC1091
. "${SCRIPT_DIR}/cleanup.sh"
# Remove this test's local output and work directories, then delete the
# remote S3 destination when listing it succeeds.
# Globals:  OUTPUT_DIR, WORK_DIR, DESTINATION_S3 (read)
# Outputs:  whatever `aws s3 ls` / `aws s3 rm` print, plus a deletion notice
# Returns:  0 when the listing fails (nothing is deleted); otherwise the
#           status of `aws s3 rm`
cleanup() {
  cleanup_dir "$OUTPUT_DIR"
  cleanup_dir "$WORK_DIR"

  # A failed listing means there is nothing to remove remotely.
  if ! aws s3 ls "$DESTINATION_S3" --region us-east-2; then
    return 0
  fi

  echo "deleting destination s3 location: $DESTINATION_S3"
  aws s3 rm "$DESTINATION_S3" --recursive --region us-east-2
}
# Run cleanup whenever the script exits.
trap cleanup EXIT

# Ingest entry point; RUN_SCRIPT may be overridden from the environment.
RUN_SCRIPT="${RUN_SCRIPT:-./unstructured/ingest/main.py}"

# Arguments for the "local" source side of the pipeline.
source_args=(
  local
  --num-processes "$max_processes"
  --output-dir "$OUTPUT_DIR"
  --strategy fast
  --verbose
  --reprocess
  --input-path example-docs/fake-memo.pdf
  --work-dir "$WORK_DIR"
)

# Arguments for the "s3" destination side of the pipeline.
destination_args=(
  s3
  --anonymous
  --remote-url "$DESTINATION_S3"
)

# PYTHONPATH defaults to the current directory for this one invocation only.
PYTHONPATH="${PYTHONPATH:-.}" "$RUN_SCRIPT" "${source_args[@]}" "${destination_args[@]}"

# Simply check the number of files uploaded
expected_num_files=1
# fix: local connector with input path to single file (#2116)
#
# When passed an absolute file path for the input document path, the local
# connector incorrectly writes the output file to the wrong directory. Also,
# in the single-file input path case we currently include the parent path as
# part of the destination path; instead, when a single file is specified as
# input, the output file should be located directly in the specified output
# directory. Note: this change meant that we needed to bump the file path of
# some expected results.
#
# With this fix, the output in this case is written to
# `output-dir/input-filename.json`.
#
# ## Changes
# - Fix for incorrect output path of files partitioned via the local connector
#   when the input path is a file path (rather than a directory)
# - Updated single-local-file test to validate the flow where we specify an
#   absolute file path (since this was particularly broken)
#
# ## Testing
# Note: running the updated `local-single-file` test without the changes to
# the local connector will result in a final output copy of:
# ```
# Copying /Users/ryannikolaidis/Development/unstructured/unstructured/test_unstructured_ingest/workdir/local-single-file/partitioned/a48c2abec07a9a31860429f94e5a6ade.json -> /Users/ryannikolaidis/Development/unstructured/unstructured/test_unstructured_ingest/../example-docs/language-docs/UDHR_first_article_all.txt.json
# ```
# where the output path is the input path and not the expected
# `output-dir/input-filename.json`. Running with this change, we can now
# expect the file in that directory.
#
# ---------
# Co-authored-by: ryannikolaidis <ryannikolaidis@users.noreply.github.com>
# 2023-11-19 10:21:31 -08:00
# Count the .json entries listed under the destination prefix and compare
# the count with expected_num_files.
#
# grep -c still prints "0" when nothing matches, but it exits with status 1.
# Because the script runs under `set -e`, that status would abort the script
# on the assignment itself, before the mismatch message below could be
# printed. `|| true` keeps the printed count and lets the comparison report
# the failure.
num_files_in_s3=$(aws s3 ls "${DESTINATION_S3}" --region us-east-2 | grep -c '\.json$' || true)

# Treat a missing count as zero so the comparison can never be skipped.
# (With `[ ... ]`, an empty operand is an error that falls through to the
# success branch.)
num_files_in_s3=${num_files_in_s3:-0}

if [[ "$num_files_in_s3" -ne "$expected_num_files" ]]; then
  echo "Expected $expected_num_files files to be uploaded to s3, but found $num_files_in_s3 files."
  exit 1
fi

echo "Expected number of files found: $num_files_in_s3/$expected_num_files"