diff --git a/CHANGELOG.md b/CHANGELOG.md index c8c9e5fcb..36bfc07ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,6 @@ * **Include `languages` in metadata when partitioning strategy='hi_res' or 'fast'** User defined `languages` was previously used for text detection, but not included in the resulting element metadata for some strategies. `languages` will now be included in the metadata regardless of partition strategy for pdfs and images. - ## 0.10.30 ### Enhancements @@ -23,8 +22,8 @@ ### Fixes -* **Fix logic that determines pdf auto strategy.** Previously, `_determine_pdf_auto_strategy` returned `hi_res` strategy only if `infer_table_structure` was true. It now returns the `hi_res` strategy if either `infer_table_structure` or `extract_images_in_pdf` is true. -* **Fix invalid coordinates when parsing tesseract ocr data.** Previously, when parsing tesseract ocr data, the ocr data had invalid bboxes if zoom was set to `0`. A logical check is now added to avoid such error. +* **Fix logic that determines pdf auto strategy.** Previously, `_determine_pdf_auto_strategy` returned `hi_res` strategy only if `infer_table_structure` was true. It now returns the `hi_res` strategy if either `infer_table_structure` or `extract_images_in_pdf` is true. +* **Fix invalid coordinates when parsing tesseract ocr data.** Previously, when parsing tesseract ocr data, the ocr data had invalid bboxes if zoom was set to `0`. A logical check is now added to avoid such error. * **Fix ingest partition parameters not being passed to the api.** When using the --partition-by-api flag via unstructured-ingest, none of the partition arguments are forwarded, meaning that these options are disregarded. With this change, we now pass through all of the relevant partition arguments to the api. This allows a user to specify all of the same partition arguments they would locally and have them respected when specifying --partition-by-api. * **Support tables in section-less DOCX.** Generalize solution for MS Chat Transcripts exported as DOCX by including tables in the partitioned output when present. * **Support tables that contain only numbers when partitioning via `ocr_only`** Tables that contain only numbers are returned as floats in a pandas.DataFrame when the image is converted from `.image_to_data()`. An AttributeError was raised downstream when trying to `.strip()` the floats. diff --git a/test_unstructured_ingest/check-diff-expected-output.sh b/test_unstructured_ingest/check-diff-expected-output.sh index e9e9a9092..009e440e9 100755 --- a/test_unstructured_ingest/check-diff-expected-output.sh +++ b/test_unstructured_ingest/check-diff-expected-output.sh @@ -16,10 +16,11 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false} TMP_DIRECTORY_CLEANUP=${TMP_DIRECTORY_CLEANUP:-true} OUTPUT_FOLDER_NAME=$1 -OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -OUTPUT_DIR_TEXT=$SCRIPT_DIR/text-output/$OUTPUT_FOLDER_NAME -EXPECTED_OUTPUT_DIR=$SCRIPT_DIR/expected-structured-output/$OUTPUT_FOLDER_NAME -EXPECTED_OUTPUT_DIR_TEXT=$SCRIPT_DIR/expected-text-output/$OUTPUT_FOLDER_NAME +OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} +OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME +OUTPUT_DIR_TEXT=$OUTPUT_ROOT/text-output/$OUTPUT_FOLDER_NAME +EXPECTED_OUTPUT_DIR=$OUTPUT_ROOT/expected-structured-output/$OUTPUT_FOLDER_NAME +EXPECTED_OUTPUT_DIR_TEXT=$OUTPUT_ROOT/expected-text-output/$OUTPUT_FOLDER_NAME # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/check-num-dirs-output.sh b/test_unstructured_ingest/check-num-dirs-output.sh index d4a98aea3..b20242cd8 100755 --- a/test_unstructured_ingest/check-num-dirs-output.sh +++ b/test_unstructured_ingest/check-num-dirs-output.sh @@ -11,7 +11,8 @@ set +e EXPECTED_NUM_DIRS=$1 OUTPUT_FOLDER_NAME=$2 SCRIPT_DIR=$(dirname "$(realpath "$0")") -OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} +OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME NUMBER_OF_FOUND_DIRS="$(find "$OUTPUT_DIR" -type d -exec printf '.' \; | wc -c | xargs)" diff --git a/test_unstructured_ingest/check-num-files-expected-output.sh b/test_unstructured_ingest/check-num-files-expected-output.sh index ed5fc1bab..e84aad72d 100755 --- a/test_unstructured_ingest/check-num-files-expected-output.sh +++ b/test_unstructured_ingest/check-num-files-expected-output.sh @@ -13,7 +13,8 @@ EXPECTED_NUM_FILES=$1 OUTPUT_FOLDER_NAME=$2 EXPECTED_SIZE=$3 SCRIPT_DIR=$(dirname "$(realpath "$0")") -EXPECTED_OUTPUT_DIR=$SCRIPT_DIR/expected-structured-output/$OUTPUT_FOLDER_NAME +OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} +EXPECTED_OUTPUT_DIR=$OUTPUT_ROOT/expected-structured-output/$OUTPUT_FOLDER_NAME NUM_FILES=$(find "$EXPECTED_OUTPUT_DIR" -type f -size +"$EXPECTED_SIZE" | wc -l) # Note: single brackets and "-ne" operator were necessary for evaluation in CI diff --git a/test_unstructured_ingest/check-num-files-output.sh b/test_unstructured_ingest/check-num-files-output.sh index 81b9b25ca..b3cc97fed 100755 --- a/test_unstructured_ingest/check-num-files-output.sh +++ b/test_unstructured_ingest/check-num-files-output.sh @@ -11,7 +11,8 @@ set +e EXPECTED_NUM_FILES=$1 OUTPUT_FOLDER_NAME=$2 SCRIPT_DIR=$(dirname "$(realpath "$0")") -OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} +OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME num_files_created="$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | xargs)" # Note: single brackets and "-ne" operator were necessary for evaluation in CI diff --git a/test_unstructured_ingest/evaluation-ingest-cp.sh b/test_unstructured_ingest/evaluation-ingest-cp.sh index 46c78d101..c1c6fafc9 100755 --- a/test_unstructured_ingest/evaluation-ingest-cp.sh +++ b/test_unstructured_ingest/evaluation-ingest-cp.sh @@ -9,16 +9,17 @@ OUTPUT_DIR=$1 OUTPUT_FOLDER_NAME=$2 structured_outputs=("$OUTPUT_DIR"/*) -CP_DIR=$SCRIPT_DIR/structured-output-eval/$OUTPUT_FOLDER_NAME +OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} +CP_DIR=$OUTPUT_ROOT/structured-output-eval/$OUTPUT_FOLDER_NAME mkdir -p "$CP_DIR" selected_outputs=$(cat "$SCRIPT_DIR/metrics/metrics-json-manifest.txt") -# If structured output file in this connector's outputs match the +# If structured output file in this connector's outputs match the # selected outputs in the txt file, copy to the destination for file in "${structured_outputs[@]}"; do if [[ -f "$file" && "${selected_outputs[*]}" =~ $(basename "$file") ]] ; then echo "--- Copying $file to $CP_DIR ---" cp "$file" "$CP_DIR" fi -done +done diff --git a/test_unstructured_ingest/evaluation-metrics.sh b/test_unstructured_ingest/evaluation-metrics.sh index 7b2eadfd1..a325e06ec 100755 --- a/test_unstructured_ingest/evaluation-metrics.sh +++ b/test_unstructured_ingest/evaluation-metrics.sh @@ -6,7 +6,8 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 # List all structured outputs to use in this evaluation -OUTPUT_DIR=$SCRIPT_DIR/structured-output-eval +OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} +OUTPUT_DIR=$OUTPUT_ROOT/structured-output-eval mkdir -p "$OUTPUT_DIR" EVAL_NAME="$1" @@ -23,7 +24,7 @@ fi # Download cct test from s3 BUCKET_NAME=utic-dev-tech-fixtures FOLDER_NAME=small-eval-"$EVAL_NAME" -SOURCE_DIR=$SCRIPT_DIR/gold-standard/$FOLDER_NAME +SOURCE_DIR=$OUTPUT_ROOT/gold-standard/$FOLDER_NAME mkdir -p "$SOURCE_DIR" aws s3 cp "s3://$BUCKET_NAME/$FOLDER_NAME" "$SOURCE_DIR" --recursive --no-sign-request --region us-east-2 @@ -64,4 +65,4 @@ read -ra source_args <<< "$(generate_args "source" "$SOURCE_DIR" "${SOURCE_LIST[ PYTHONPATH=. ./unstructured/ingest/evaluate.py \ $METRIC_STRATEGY "${output_args[@]}" "${source_args[@]}" \ - --export_dir "$EXPORT_DIR" \ No newline at end of file + --export_dir "$EXPORT_DIR" diff --git a/test_unstructured_ingest/test-ingest-delta-table-dest.sh b/test_unstructured_ingest/test-ingest-delta-table-dest.sh index 18b3001ef..f4703db0c 100755 --- a/test_unstructured_ingest/test-ingest-delta-table-dest.sh +++ b/test_unstructured_ingest/test-ingest-delta-table-dest.sh @@ -5,10 +5,11 @@ set -e SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=delta-table-dest -OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME +OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} +OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest +DESTINATION_TABLE=$OUTPUT_ROOT/delta-table-dest CI=${CI:-"false"} if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then