mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 01:54:25 +00:00 
			
		
		
		
	 bd8a74d686
			
		
	
	
		bd8a74d686
		
			
		
	
	
	
	
		
			
			Given the tendency for shell scripts to easily enter into a few levels of indentation and long line lengths, update the default to 2 spaces.
		
			
				
	
	
		
			124 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			124 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env bash
 | |
| 
 | |
| set -u -o pipefail
 | |
| 
 | |
| SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
 | |
| SKIPPED_FILES_LOG=$SCRIPT_DIR/skipped-files.txt
 | |
| # If the file already exists, reset it
 | |
| if [ -f "$SKIPPED_FILES_LOG" ]; then
 | |
|   rm "$SKIPPED_FILES_LOG"
 | |
| fi
 | |
| touch "$SKIPPED_FILES_LOG"
 | |
| cd "$SCRIPT_DIR"/.. || exit 1
 | |
| 
 | |
| # NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
 | |
| export OMP_THREAD_LIMIT=1
 | |
| 
 | |
| all_tests=(
 | |
|   's3.sh'
 | |
|   's3-minio.sh'
 | |
|   'azure.sh'
 | |
|   'biomed-api.sh'
 | |
|   'biomed-path.sh'
 | |
|   # NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files
 | |
|   'pdf-fast-reprocess.sh'
 | |
|   'salesforce.sh'
 | |
|   'box.sh'
 | |
|   'discord.sh'
 | |
|   'dropbox.sh'
 | |
|   'github.sh'
 | |
|   'gitlab.sh'
 | |
|   'google-drive.sh'
 | |
|   'wikipedia.sh'
 | |
|   'local.sh'
 | |
|   'slack.sh'
 | |
|   'against-api.sh'
 | |
|   'gcs.sh'
 | |
|   'onedrive.sh'
 | |
|   'outlook.sh'
 | |
|   'elasticsearch.sh'
 | |
|   'confluence-diff.sh'
 | |
|   'confluence-large.sh'
 | |
|   'airtable-diff.sh'
 | |
|   # NOTE(ryan): This test is disabled because it is triggering too many requests to the API
 | |
|   # 'airtable-large.sh'
 | |
|   'local-single-file.sh'
 | |
|   'local-single-file-with-encoding.sh'
 | |
|   'local-single-file-with-pdf-infer-table-structure.sh'
 | |
|   'notion.sh'
 | |
|   'delta-table.sh'
 | |
|   'jira.sh'
 | |
|   'sharepoint.sh'
 | |
|   'sharepoint-with-permissions.sh'
 | |
|   'hubspot.sh'
 | |
|   'local-embed.sh'
 | |
|   'sftp.sh'
 | |
| )
 | |
| 
 | |
| full_python_matrix_tests=(
 | |
|   'sharepoint.sh'
 | |
|   'local.sh'
 | |
|   'local-single-file.sh'
 | |
|   'local-single-file-with-encoding.sh'
 | |
|   'local-single-file-with-pdf-infer-table-structure.sh'
 | |
|   's3.sh'
 | |
|   'google-drive.sh'
 | |
|   'gcs.sh'
 | |
|   'azure.sh'
 | |
| )
 | |
| 
 | |
| CURRENT_TEST="none"
 | |
| 
 | |
| function print_last_run() {
 | |
|   if [ "$CURRENT_TEST" != "none" ]; then
 | |
|     echo "Last ran script: $CURRENT_TEST"
 | |
|   fi
 | |
|   echo "######## SKIPPED TESTS: ########"
 | |
|   cat "$SKIPPED_FILES_LOG"
 | |
| }
 | |
| 
 | |
| trap print_last_run EXIT
 | |
| 
 | |
| python_version=$(python --version 2>&1)
 | |
| 
 | |
| tests_to_ignore=(
 | |
|   'notion.sh'
 | |
|   'dropbox.sh'
 | |
| )
 | |
| 
 | |
| for test in "${all_tests[@]}"; do
 | |
|   CURRENT_TEST="$test"
 | |
|   # IF: python_version is not 3.10 (wildcarded to match any subminor version) AND the current test is not in full_python_matrix_tests
 | |
|   # Note: to test we expand the full_python_matrix_tests array to a string and then regex match the current test
 | |
|   if [[ "$python_version" != "Python 3.10"* ]] && [[ ! "${full_python_matrix_tests[*]}" =~ $test ]]; then
 | |
|     echo "--------- SKIPPING SCRIPT $test ---------"
 | |
|     continue
 | |
|   fi
 | |
|   echo "--------- RUNNING SCRIPT $test ---------"
 | |
|   echo "Running ./test_unstructured_ingest/$test"
 | |
|   ./test_unstructured_ingest/src/"$test"
 | |
|   rc=$?
 | |
|   if [[ $rc -eq 8 ]]; then
 | |
|     echo "$test (skipped due to missing env var)" | tee -a "$SKIPPED_FILES_LOG"
 | |
|   elif [[ "${tests_to_ignore[*]}" =~ $test ]]; then
 | |
|     echo "$test (skipped checking error code: $rc)" | tee -a "$SKIPPED_FILES_LOG"
 | |
|     continue
 | |
|   elif [[ $rc -ne 0 ]]; then
 | |
|     exit $rc
 | |
|   fi
 | |
|   echo "--------- FINISHED SCRIPT $test ---------"
 | |
| done
 | |
| 
 | |
| set +e
 | |
| 
 | |
| all_eval=(
 | |
|   'text-extraction'
 | |
|   'element-type'
 | |
| )
 | |
| for eval in "${all_eval[@]}"; do
 | |
|   CURRENT_TEST="evaluation-metrics.sh $eval"
 | |
|   echo "--------- RUNNING SCRIPT evaluation-metrics.sh $eval ---------"
 | |
|   ./test_unstructured_ingest/evaluation-metrics.sh "$eval"
 | |
|   echo "--------- FINISHED SCRIPT evaluation-metrics.sh $eval ---------"
 | |
| done
 |