| 
									
										
										
										
											2023-10-23 17:39:22 -04:00
										 |  |  | #!/usr/bin/env bash
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | set -e | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | SCRIPT_DIR=$(dirname "$(realpath "$0")") | 
					
						
							|  |  |  | cd "$SCRIPT_DIR"/.. || exit 1 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-17 12:21:15 +07:00
										 |  |  | EVAL_NAME="$1" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-23 17:39:22 -04:00
										 |  |  | # List all structured outputs to use in this evaluation | 
					
						
							| 
									
										
										
										
											2024-02-17 12:21:15 +07:00
										 |  |  | OUTPUT_ROOT=${2:-${OUTPUT_ROOT:-$SCRIPT_DIR}} | 
					
						
							| 
									
										
										
										
											2023-11-13 12:42:19 -06:00
										 |  |  | OUTPUT_DIR=$OUTPUT_ROOT/structured-output-eval | 
					
						
							| 
									
										
										
										
											2023-10-23 17:39:22 -04:00
										 |  |  | mkdir -p "$OUTPUT_DIR" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-01 11:58:23 -04:00
										 |  |  | if [ "$EVAL_NAME" == "text-extraction" ]; then | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   METRIC_STRATEGY="measure-text-extraction-accuracy-command" | 
					
						
							| 
									
										
										
										
											2023-11-01 11:58:23 -04:00
										 |  |  | elif [ "$EVAL_NAME" == "element-type" ]; then | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   METRIC_STRATEGY="measure-element-type-accuracy-command" | 
					
						
							| 
									
										
										
										
											2024-02-22 17:35:46 +01:00
										 |  |  | elif [ "$EVAL_NAME" == "table-structure" ]; then | 
					
						
							|  |  |  |   METRIC_STRATEGY="measure-table-structure-accuracy-command" | 
					
						
							| 
									
										
										
										
											2023-11-01 11:58:23 -04:00
										 |  |  | else | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   echo "Wrong metric evaluation strategy given. Expected one of [ text-extraction, element-type ]. Got [ $EVAL_NAME ]." | 
					
						
							|  |  |  |   exit 1 | 
					
						
							| 
									
										
										
										
											2023-11-01 11:58:23 -04:00
										 |  |  | fi | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-23 17:39:22 -04:00
										 |  |  | # Download cct test from s3 | 
					
						
							|  |  |  | BUCKET_NAME=utic-dev-tech-fixtures | 
					
						
							| 
									
										
										
										
											2023-10-27 00:36:36 -04:00
										 |  |  | FOLDER_NAME=small-eval-"$EVAL_NAME" | 
					
						
							| 
									
										
										
										
											2023-11-13 12:42:19 -06:00
										 |  |  | SOURCE_DIR=$OUTPUT_ROOT/gold-standard/$FOLDER_NAME | 
					
						
							| 
									
										
										
										
											2023-11-01 11:58:23 -04:00
										 |  |  | mkdir -p "$SOURCE_DIR" | 
					
						
							|  |  |  | aws s3 cp "s3://$BUCKET_NAME/$FOLDER_NAME" "$SOURCE_DIR" --recursive --no-sign-request --region us-east-2 | 
					
						
							| 
									
										
										
										
											2023-10-27 00:36:36 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-21 15:04:30 -05:00
										 |  |  | EXPORT_DIR=$OUTPUT_ROOT/metrics-tmp/$EVAL_NAME | 
					
						
							| 
									
										
										
										
											2023-10-23 17:39:22 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | # shellcheck disable=SC1091 | 
					
						
							|  |  |  | source "$SCRIPT_DIR"/cleanup.sh | 
					
						
							|  |  |  | function cleanup() { | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   cleanup_dir "$SOURCE_DIR" | 
					
						
							| 
									
										
										
										
											2023-10-23 17:39:22 -04:00
										 |  |  | } | 
					
						
							|  |  |  | trap cleanup EXIT | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-01 11:58:23 -04:00
										 |  |  | # build args | 
					
						
							|  |  |  | function generate_args() { | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   local argtype="$1" | 
					
						
							|  |  |  |   local dirpath="$2" | 
					
						
							|  |  |  |   local list=("${@:3}") | 
					
						
							| 
									
										
										
										
											2023-11-01 11:58:23 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   local -a args | 
					
						
							| 
									
										
										
										
											2023-11-01 11:58:23 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   args=("--${argtype}_dir" "$dirpath") | 
					
						
							|  |  |  |   for filename in "${list[@]}"; do | 
					
						
							|  |  |  |     args+=("--${argtype}_list" "$filename") | 
					
						
							|  |  |  |   done | 
					
						
							|  |  |  |   echo "${args[@]}" | 
					
						
							| 
									
										
										
										
											2023-11-01 11:58:23 -04:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # List selected output as a subset of OUTPUT_DIR, if any | 
					
						
							|  |  |  | OUTPUT_LIST=( | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | # List selected source as a subset of SOURCE_DIR, if any | 
					
						
							|  |  |  | SOURCE_LIST=( | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-11 20:04:15 -05:00
										 |  |  | read -ra output_args <<<"$(generate_args "output" "$OUTPUT_DIR" "${OUTPUT_LIST[@]}")" | 
					
						
							|  |  |  | read -ra source_args <<<"$(generate_args "source" "$SOURCE_DIR" "${SOURCE_LIST[@]}")" | 
					
						
							| 
									
										
										
										
											2023-10-27 00:36:36 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-25 10:46:05 -05:00
										 |  |  | # mkdir export_dir is handled in python script | 
					
						
							| 
									
										
										
										
											2023-10-23 17:39:22 -04:00
										 |  |  | PYTHONPATH=. ./unstructured/ingest/evaluate.py \
 | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   $METRIC_STRATEGY "${output_args[@]}" "${source_args[@]}" \
 | 
					
						
							|  |  |  |   --export_dir "$EXPORT_DIR" | 
					
						
							| 
									
										
										
										
											2023-11-16 00:02:43 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-21 15:04:30 -05:00
										 |  |  | "$SCRIPT_DIR"/check-diff-evaluation-metrics.sh "$EVAL_NAME" |