mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-10 23:54:23 +00:00

When the v2 fsspec connectors generate the relative path, they may produce a path with a leading slash (this happens with the Box connector, which is a subclass of fsspec). Such paths are unintentionally treated as absolute paths. As a result, the ingest pipeline attempts to write files to directories at root level, which in turn raises permission errors.

Note: the Box expected results needed to be updated now that the connector is no longer failing.

Aside: our tests were unintentionally skipping the `box.sh` tests because we intended to skip `dropbox.sh` and used a regex to check whether a given test is in the skip list (`box.sh` matches as a substring of `dropbox.sh`). This adds changes to force an exact match.

## Changes

* Strip leading slashes when creating relative paths in fsspec connectors
* Add expected results for the Box connector
* (bonus): `make tidy` altered an unrelated file by removing an unnecessary `pass` statement
* (bonus): check for an exact match for skipped ingest tests, which fixes the Box tests getting skipped

## Testing

[Tests](https://github.com/Unstructured-IO/unstructured/actions/runs/9461928289/job/26093475612#step:7:2085) for the Box connector were failing. They were accidentally getting skipped (see changes above). They are no longer skipped and are now passing.
144 lines
3.7 KiB
Bash
Executable File
144 lines
3.7 KiB
Bash
Executable File
#!/usr/bin/env bash

# Setup for the ingest test run: resolves the directory containing this script,
# resets the skipped-tests log that lives next to it, and changes to the parent
# of the script directory.
#
# Environment:
#   EVAL_OUTPUT_ROOT  optional; defaults to the directory containing this script.

# NOTE: errexit (-e) is not enabled; the exit code of each test script is
# inspected explicitly after it runs.
set -u -o pipefail

# Absolute path of the directory containing this script. Abort if it cannot be
# resolved: an empty value would make the paths below point at "/".
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) || exit 1
SKIPPED_FILES_LOG=$SCRIPT_DIR/skipped-files.txt

# If the file already exists, reset it
if [[ -f "$SKIPPED_FILES_LOG" ]]; then
  rm -- "$SKIPPED_FILES_LOG"
fi
touch -- "$SKIPPED_FILES_LOG"

# The test scripts are invoked with paths relative to the parent directory.
cd "$SCRIPT_DIR"/.. || exit 1

EVAL_OUTPUT_ROOT=${EVAL_OUTPUT_ROOT:-$SCRIPT_DIR}

# NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
export OMP_THREAD_LIMIT=1

# Source-connector test scripts, run in the order listed. Each entry is the
# name of a script under ./test_unstructured_ingest/src/.
all_tests=(
  's3.sh'
  's3-minio.sh'
  'azure.sh'
  'biomed-api.sh'
  'biomed-path.sh'
  # NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files
  'pdf-fast-reprocess.sh'
  'salesforce.sh'
  'box.sh'
  'discord.sh'
  'dropbox.sh'
  'github.sh'
  'gitlab.sh'
  'google-drive.sh'
  'wikipedia.sh'
  'local.sh'
  'slack.sh'
  'against-api.sh'
  'gcs.sh'
  'onedrive.sh'
  'outlook.sh'
  'elasticsearch.sh'
  'confluence-diff.sh'
  'confluence-large.sh'
  'airtable-diff.sh'
  # # NOTE(ryan): This test is disabled because it is triggering too many requests to the API
  # 'airtable-large.sh'
  'local-single-file.sh'
  'local-single-file-basic-chunking.sh'
  'local-single-file-chunk-no-orig-elements.sh'
  'local-single-file-with-encoding.sh'
  'local-single-file-with-pdf-infer-table-structure.sh'
  'notion.sh'
  'delta-table.sh'
  'jira.sh'
  'sharepoint.sh'
  'sharepoint-with-permissions.sh'
  'hubspot.sh'
  'local-embed.sh'
  'local-embed-bedrock.sh'
  'local-embed-octoai.sh'
  'local-embed-vertexai.sh'
  'local-embed-voyageai.sh'
  'sftp.sh'
  'opensearch.sh'
  'mongodb.sh'
)

# Tests that run regardless of the Python version. Every other test only runs
# when `python --version` reports a 3.10.x interpreter.
full_python_matrix_tests=(
  'sharepoint.sh'
  'local.sh'
  'local-single-file.sh'
  'local-single-file-with-encoding.sh'
  'local-single-file-with-pdf-infer-table-structure.sh'
  's3.sh'
  'google-drive.sh'
  'gcs.sh'
  'azure.sh'
)

# Name of the script currently being run; stays "none" until one is started.
CURRENT_TEST="none"

#######################################
# Prints the name of the last script that was started (if any), followed by
# the contents of the skipped-tests log.
# Globals:
#   CURRENT_TEST (read), SKIPPED_FILES_LOG (read)
# Arguments:
#   None
# Outputs:
#   Writes the summary to stdout.
#######################################
function print_last_run() {
  if [[ "$CURRENT_TEST" != "none" ]]; then
    echo "Last ran script: $CURRENT_TEST"
  fi
  echo "######## SKIPPED TESTS: ########"
  # Runs from the EXIT trap, so tolerate a log file that is no longer there.
  if [[ -f "${SKIPPED_FILES_LOG:-}" ]]; then
    cat -- "$SKIPPED_FILES_LOG"
  fi
}

# Print the summary on every exit path, including an early exit on a failure.
trap print_last_run EXIT

# Version banner printed by the `python` found on PATH (stderr is captured as
# well). It is later compared against the prefix "Python 3.10".
python_version=$(python --version 2>&1)

# Tests that are still executed, but whose exit code never fails the run; the
# code is only recorded in the skipped-tests log.
tests_to_ignore=(
  'notion.sh'
  'dropbox.sh'
)

#######################################
# Reports whether a string is exactly equal to one of the given items.
# Arguments:
#   $1 - string to look for
#   $2.. - candidate items
# Returns:
#   0 if an identical item exists, 1 otherwise.
#######################################
function contains_exact() {
  local needle=$1
  shift
  local item
  for item in "$@"; do
    if [[ "$item" == "$needle" ]]; then
      return 0
    fi
  done
  return 1
}

# Run every source-connector test in order, aborting on the first failure that
# is neither a "missing env var" skip nor listed in tests_to_ignore.
for test in "${all_tests[@]}"; do
  CURRENT_TEST="$test"
  # IF: python_version is not 3.10 (wildcarded to match any subminor version) AND the current test is not in full_python_matrix_tests
  # Note: membership is an exact comparison. A regex match of "$test" against
  # the joined array would also accept names that are only a substring of a
  # listed script and would treat the "." in the name as a wildcard.
  if [[ "$python_version" != "Python 3.10"* ]] &&
    ! contains_exact "$test" "${full_python_matrix_tests[@]}"; then
    echo "--------- SKIPPING SCRIPT $test ---------"
    continue
  fi
  echo "--------- RUNNING SCRIPT $test ---------"
  echo "Running ./test_unstructured_ingest/src/$test"
  ./test_unstructured_ingest/src/"$test"
  rc=$?
  if [[ $rc -eq 8 ]]; then
    # Exit code 8 is logged as a skip due to a missing env var, not a failure.
    echo "$test (skipped due to missing env var)" | tee -a "$SKIPPED_FILES_LOG"
  elif contains_exact "$test" "${tests_to_ignore[@]}"; then
    # Ignored tests are logged with their exit code, whatever it was.
    echo "$test (skipped checking error code: $rc)" | tee -a "$SKIPPED_FILES_LOG"
    continue
  elif [[ $rc -ne 0 ]]; then
    exit $rc
  fi
  echo "--------- FINISHED SCRIPT $test ---------"
done

# Make sure errexit is off so that one evaluation returning non-zero does not
# stop the remaining ones.
set +e

# Evaluation names passed, one at a time, to evaluation-metrics.sh.
all_eval=(
  'text-extraction'
  'element-type'
)
for eval in "${all_eval[@]}"; do
  # Keep the EXIT summary pointing at the evaluation being run.
  CURRENT_TEST="evaluation-metrics.sh $eval"
  echo "--------- RUNNING SCRIPT evaluation-metrics.sh $eval ---------"
  ./test_unstructured_ingest/evaluation-metrics.sh "$eval" "$EVAL_OUTPUT_ROOT"
  echo "--------- FINISHED SCRIPT evaluation-metrics.sh $eval ---------"
done