Klaijan 6707cab250
build: text extraction evaluation metrics workflow added (#1757)
**Executive Summary**
This PR adds evaluation metrics to our current workflow. It verifies
the flow in which, when code is pushed, the code gets evaluated
against our gold standard and the results are output to a `.tsv` file.

**Technical Details**
- Adds evaluation metrics to the test-ingest workflow
- Makes use of `structured-output` from `test-ingest` and compares it to
the gold standard uploaded to S3, which is downloaded locally when the
comparison is made. The folder currently in use is
`s3://utic-dev-tech-fixtures/small-cct`. This directory can be changed in
the shell script.
- With this PR, only one file from one connector is used for the comparison.

**Misc**
- There are not many overlapping files between test-ingest and the
gold standard. More files will be added.

**Outputs**
2 `.tsv` files are saved under `test_unstructured_ingest/metrics/`.


![image](https://github.com/Unstructured-IO/unstructured/assets/2177850/222e437c-1a94-4d7c-9320-81696633b1ae)


![image](https://github.com/Unstructured-IO/unstructured/assets/2177850/5c840322-6739-4634-8868-eba04b4ebc96)

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Klaijan <Klaijan@users.noreply.github.com>
2023-10-23 21:39:22 +00:00

103 lines
3.1 KiB
Bash
Executable File

#!/usr/bin/env bash
# Strict mode: abort on command failure, on use of unset variables, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# Resolve the absolute directory that holds this script, then make its parent
# directory the working directory for everything that follows.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)
cd "${SCRIPT_DIR}/.." || exit 1

# NOTE(crag): caps tesseract at a single thread, which may help make outputs more reproducible
export OMP_THREAD_LIMIT=1
# Every ingest test script, listed in the order in which it is executed.
all_tests=(
  "test-ingest-s3.sh"
  "test-ingest-s3-minio.sh"
  "test-ingest-azure.sh"
  "test-ingest-biomed-api.sh"
  "test-ingest-biomed-path.sh"
  # NOTE(yuming): keep the pdf-fast-reprocess test after any tests that save downloaded files
  "test-ingest-pdf-fast-reprocess.sh"
  "test-ingest-salesforce.sh"
  "test-ingest-box.sh"
  "test-ingest-discord.sh"
  "test-ingest-dropbox.sh"
  "test-ingest-github.sh"
  "test-ingest-gitlab.sh"
  "test-ingest-google-drive.sh"
  "test-ingest-wikipedia.sh"
  "test-ingest-local.sh"
  "test-ingest-slack.sh"
  "test-ingest-against-api.sh"
  "test-ingest-gcs.sh"
  "test-ingest-onedrive.sh"
  "test-ingest-outlook.sh"
  "test-ingest-elasticsearch.sh"
  "test-ingest-confluence-diff.sh"
  "test-ingest-confluence-large.sh"
  "test-ingest-airtable-diff.sh"
  # NOTE(ryan): disabled because this test triggers too many requests to the API
  # "test-ingest-airtable-large.sh"
  "test-ingest-local-single-file.sh"
  "test-ingest-local-single-file-with-encoding.sh"
  "test-ingest-local-single-file-with-pdf-infer-table-structure.sh"
  "test-ingest-notion.sh"
  "test-ingest-delta-table.sh"
  "test-ingest-jira.sh"
  "test-ingest-sharepoint.sh"
  "test-ingest-embed.sh"
)

# Subset of tests that is still run when the interpreter is not Python 3.10.
full_python_matrix_tests=(
  "test-ingest-sharepoint.sh"
  "test-ingest-local.sh"
  "test-ingest-local-single-file.sh"
  "test-ingest-local-single-file-with-encoding.sh"
  "test-ingest-local-single-file-with-pdf-infer-table-structure.sh"
  "test-ingest-s3.sh"
  "test-ingest-google-drive.sh"
  "test-ingest-gcs.sh"
)
# Name of the test script currently being processed; "none" until the first one is selected.
CURRENT_TEST="none"

# Print the name of the most recently selected test script.
# Prints nothing while CURRENT_TEST still holds its initial "none" value.
# Globals: CURRENT_TEST (read)
# Outputs: one line on stdout, or nothing
function print_last_run() {
  case "$CURRENT_TEST" in
    none) ;;
    *) echo "Last ran script: $CURRENT_TEST" ;;
  esac
}

# Run the report on every exit path of the script.
trap print_last_run EXIT
# Version banner of the `python` on PATH; stdout and stderr are both captured.
python_version=$(python --version 2>&1)

# Tests whose failures do not abort the run.
tests_to_ignore=(
  'test-ingest-notion.sh'
  'test-ingest-dropbox.sh'
)

#######################################
# Report whether a name is an exact member of a list.
# A regex match against the space-joined array would also accept substrings
# of entries and treat '.' as a wildcard; this compares whole entries literally.
# Arguments: $1 - name to look for; remaining arguments - list entries
# Returns:   0 if an entry equals the name exactly, 1 otherwise
#######################################
function list_contains() {
  local needle=$1
  shift
  local candidate
  for candidate in "$@"; do
    if [[ "$candidate" == "$needle" ]]; then
      return 0
    fi
  done
  return 1
}

for test in "${all_tests[@]}"; do
  CURRENT_TEST="$test"
  # Outside Python 3.10 (any patch version), only the tests listed in
  # full_python_matrix_tests are run; everything else is skipped.
  if [[ "$python_version" != "Python 3.10"* ]] \
    && ! list_contains "$test" "${full_python_matrix_tests[@]}"; then
    echo "--------- SKIPPING SCRIPT $test ---------"
    continue
  fi
  if list_contains "$test" "${tests_to_ignore[@]}"; then
    echo "--------- RUNNING SCRIPT $test --- IGNORING FAILURES"
    # Temporarily disable errexit so a failure of this test does not stop the run.
    set +e
    echo "Running ./test_unstructured_ingest/$test"
    ./test_unstructured_ingest/"$test"
    set -e
  else
    echo "--------- RUNNING SCRIPT $test ---------"
    echo "Running ./test_unstructured_ingest/$test"
    ./test_unstructured_ingest/"$test"
  fi
  echo "--------- FINISHED SCRIPT $test ---------"
done

# Record the evaluation step as the current script so that anything reporting
# CURRENT_TEST on exit names it, rather than the last ingest test, if it fails.
CURRENT_TEST="evaluation-metrics.sh"
echo "--------- RUNNING SCRIPT evaluation-metrics.sh ---------"
./test_unstructured_ingest/evaluation-metrics.sh
echo "--------- FINISHED SCRIPT evaluation-metrics.sh ---------"