#!/usr/bin/env bash

# Azure destination ingest test -- setup section.
#
# Environment variables read here:
#   AZURE_DEST_CONNECTION_STR  required; when unset or empty the script prints
#                              a skip message and exits with status 8.
#   OUTPUT_ROOT                optional; root for local output and work
#                              directories (defaults to SCRIPT_DIR).
#   MAX_PROCESSES              optional; defaults to the CPU count reported by
#                              python3's os.cpu_count().

set -e

# DEST_PATH is the directory containing this script; SCRIPT_DIR is its parent.
DEST_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$DEST_PATH")
# Relative paths used later in the script are resolved from the directory
# above SCRIPT_DIR.
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=azure-dest
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
# ':=' also assigns the computed default back to MAX_PROCESSES itself.
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}

# Without credentials there is nothing to test against; bail out before any
# remote state is created.
if [[ -z "${AZURE_DEST_CONNECTION_STR:-}" ]]; then
  echo "Skipping Azure destination ingest test because the AZURE_DEST_CONNECTION_STR env var is not set."
  exit 8
fi

CONTAINER=utic-ingest-test-fixtures-output
# A fresh UUID per run gives each run its own directory inside the container.
DIRECTORY=$(uuidgen)
REMOTE_URL_RAW="$CONTAINER/$DIRECTORY/"
REMOTE_URL="abfs://$REMOTE_URL_RAW"

# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
					
						
							|  |  |  | function cleanup() { | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   cleanup_dir "$OUTPUT_DIR" | 
					
						
							|  |  |  |   cleanup_dir "$WORK_DIR" | 
					
						
							| 
									
										
										
										
											2023-10-30 16:09:49 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   python "$SCRIPT_DIR"/python/test-azure-output.py down \
 | 
					
						
							|  |  |  |     --connection-string "$AZURE_DEST_CONNECTION_STR" \
 | 
					
						
							|  |  |  |     --container "$CONTAINER" \
 | 
					
						
							|  |  |  |     --blob-path "$DIRECTORY" | 
					
						
							| 
									
										
										
										
											2023-10-30 16:09:49 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | trap cleanup EXIT | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-02 16:41:56 -05:00
										 |  |  | RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} | 
					
						
							|  |  |  | PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
 | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   local \
 | 
					
						
							|  |  |  |   --num-processes "$max_processes" \
 | 
					
						
							|  |  |  |   --output-dir "$OUTPUT_DIR" \
 | 
					
						
							|  |  |  |   --strategy fast \
 | 
					
						
							|  |  |  |   --verbose \
 | 
					
						
							|  |  |  |   --reprocess \
 | 
					
						
							|  |  |  |   --input-path example-docs/fake-memo.pdf \
 | 
					
						
							|  |  |  |   --work-dir "$WORK_DIR" \
 | 
					
						
							|  |  |  |   azure \
 | 
					
						
							|  |  |  |   --overwrite \
 | 
					
						
							|  |  |  |   --remote-url "$REMOTE_URL" \
 | 
					
						
							|  |  |  |   --connection-string "$AZURE_DEST_CONNECTION_STR" | 
					
						
							| 
									
										
										
										
											2023-10-30 16:09:49 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Simply check the number of files uploaded | 
					
						
							| 
									
										
										
										
											2023-11-16 12:13:46 -08:00
										 |  |  | python "$SCRIPT_DIR"/python/test-azure-output.py check \
 | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   --expected-files 1 \
 | 
					
						
							|  |  |  |   --connection-string "$AZURE_DEST_CONNECTION_STR" \
 | 
					
						
							|  |  |  |   --container "$CONTAINER" \
 | 
					
						
							|  |  |  |   --blob-path "$DIRECTORY" |