mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-30 17:38:13 +00:00 
			
		
		
		
	test: adds profiling script (#661)
This commit is contained in:
		
							parent
							
								
									c35fff2972
								
							
						
					
					
						commit
						bdef4fd398
					
				
							
								
								
									
										3
									
								
								.github/workflows/ci.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.github/workflows/ci.yml
									
									
									
									
										vendored
									
									
								
							| @ -31,11 +31,10 @@ jobs: | ||||
|       with: | ||||
|         python-version: ${{ matrix.python-version }} | ||||
|     - name: Setup virtual environment (no cache hit) | ||||
|       if: steps.virtualenv-cache.outputs.cache-hit != 'true' | ||||
|       run: | | ||||
|         python${{ matrix.python-version }} -m venv .venv | ||||
|         source .venv/bin/activate | ||||
|         mkdir "$NLTK_DATA" | ||||
|         [ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA" | ||||
|         make install-ci | ||||
| 
 | ||||
|   check-deps: | ||||
|  | ||||
							
								
								
									
										3
									
								
								scripts/performance/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								scripts/performance/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @ -0,0 +1,3 @@ | ||||
| docs | ||||
| benchmark_results | ||||
| profile_results | ||||
							
								
								
									
										39
									
								
								scripts/performance/README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										39
									
								
								scripts/performance/README.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,39 @@ | ||||
| # Performance | ||||
| This is a collection of tools helpful for inspecting and tracking performance of the Unstructured library.  | ||||
| 
 | ||||
| The benchmarking script allows a user to track performance time to partitioning results against a fixed set of test documents and store those results with indication of architecture, instance type, and git hash, in S3. | ||||
| 
 | ||||
| The profiling script allows a user to inspect how time and memory are spent across called functions when performing partitioning on a given document. | ||||
| 
 | ||||
| ## Install | ||||
| Benchmarking requires no additional dependencies and should work without any initial setup. | ||||
| Profiling has a few dependencies which can be installed with:  | ||||
| `pip install -r scripts/performance/requirements.txt` | ||||
| 
 | ||||
| Before running a test either populate the `docs` directory with test documents of interest or set environment variable SYNC_S3_DOCS=true | ||||
| 
 | ||||
| ## Run | ||||
| ### Benchmark | ||||
| Export / assign desired environment variable settings: | ||||
| - DOCKER_TEST: Set to true to run benchmark inside a Docker container (default: false) | ||||
| - NUM_ITERATIONS: Number of iterations for benchmark (e.g., 100) (default: 3) | ||||
| - INSTANCE_TYPE: Type of benchmark instance (e.g., "c5.xlarge") (default: unspecified) | ||||
| - PUBLISH_RESULTS: Set to true to publish results to S3 bucket (default: false) | ||||
| - SYNC_S3_DOCS: Set to true to sync test documents from S3 (default: false) | ||||
| Usage: `./scripts/performance/benchmark.sh` | ||||
| 
 | ||||
| ### Profile | ||||
| 
 | ||||
| Export / assign desired environment variable settings: | ||||
| - SYNC_S3_DOCS: Set to true to sync test documents from S3 (default: false) | ||||
| - DOCKER_TEST: Set to true to run profiling inside a Docker container (default: false) | ||||
| 
 | ||||
| Usage: `./scripts/performance/profile.sh` | ||||
| - Run the script and choose the profiling mode: 'run' or 'view'. | ||||
| - In the 'run' mode, you can profile custom files or select existing test files. | ||||
| - In the 'view' mode, you can view previously generated profiling results. | ||||
| - The script supports time profiling with cProfile and memory profiling with memray. | ||||
| - Users can choose different visualization options such as flamegraphs, tables, trees, summaries, and statistics. | ||||
| - Test documents are synced from an S3 bucket to a local directory before running the profiles | ||||
| 
 | ||||
							
								
								
									
										364
									
								
								scripts/performance/profile.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										364
									
								
								scripts/performance/profile.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,364 @@ | ||||
| #!/bin/bash | ||||
| 
 | ||||
| # Performance profiling and visualization of code using cProfile and memray. | ||||
| 
 | ||||
| # Environment Variables: | ||||
| #   - SYNC_S3_DOCS: Set to true to sync test documents from S3 (default: false) | ||||
| #   - DOCKER_TEST: Set to true to run profiling inside a Docker container (default: false) | ||||
| 
 | ||||
| # Usage:  | ||||
| # - Run the script and choose the profiling mode: 'run' or 'view'. | ||||
| # - In the 'run' mode, you can profile custom files or select existing test files. | ||||
| # - In the 'view' mode, you can view previously generated profiling results. | ||||
| # - The script supports time profiling with cProfile and memory profiling with memray. | ||||
| # - Users can choose different visualization options such as flamegraphs, tables, trees, summaries, and statistics. | ||||
| # - Test documents are (optionally) synced from an S3 bucket to a local directory before running the profiles. | ||||
| 
 | ||||
| # Dependencies: | ||||
| # - memray package for memory profiling and visualization. | ||||
| # - flameprof and snakeviz for time profiling and visualization. | ||||
| # - AWS CLI for syncing files from S3 (if applicable). | ||||
| 
 | ||||
| # Package dependencies can be installed with `pip install -r scripts/performance/requirements.txt` | ||||
| 
 | ||||
| # Usage example: | ||||
| # ./scripts/performance/profile.sh | ||||
| 
 | ||||
| # NOTE: because memray does not build wheels for ARM-Linux, this script can not run in an ARM Docker container on an M1 Mac (though emulated AMD would work). | ||||
| 
 | ||||
| 
 | ||||
| # Validate dependencies | ||||
# Verify that a Python module is importable; exit the script with a helpful
# message if it is not.
#   $1 - module name to check (e.g. "memray")
check_python_module() {
  if ! python3 -c "import $1" >/dev/null 2>&1; then
    # Fixed typo in the user-facing message: "depencies" -> "dependencies".
    echo "Error: Python module $1 is not installed. Please install required dependencies with 'pip install -r scripts/performance/requirements.txt'."
    exit 1
  fi
}
# Ensure every profiling/visualization dependency is importable before we
# start an interactive session.
validate_dependencies() {
  local module
  for module in memray flameprof; do
    check_python_module "$module"
  done
}
| 
 | ||||
# only validate in non-docker context (since we install dependencies on the fly in docker)
if [[ "$DOCKER_TEST" != "true" ]]; then
  validate_dependencies
fi

# Directory this script lives in, relative to the caller's CWD.
SCRIPT_DIR=$(dirname "$0")
# Convert the relative path to module notation (slashes -> dots) so the
# profilers can run run_partition via `python -m <module path>`.
MODULE_PATH=${SCRIPT_DIR////.}
# Remove the leading dot if it exists
MODULE_PATH=${MODULE_PATH#.}
# Remove the leading dot if it exists again
MODULE_PATH=${MODULE_PATH#\.}

# Where .prof (cProfile) and .bin (memray) outputs are written.
PROFILE_RESULTS_DIR="$SCRIPT_DIR/profile_results"

# S3 location of the shared performance-test document fixtures.
S3_BUCKET="utic-dev-tech-fixtures"
S3_DOCS_DIR="performance-test/docs"

# Create PROFILE_RESULTS_DIR if it doesn't exist
mkdir -p "$PROFILE_RESULTS_DIR"

if [[ "$SYNC_S3_DOCS" == "true" ]]; then
  # Sync files from S3 to the local "docs" directory
  aws s3 sync "s3://$S3_BUCKET/$S3_DOCS_DIR" "$SCRIPT_DIR/docs"
fi

# DOCKER_TEST=true: re-run this same script inside the dev container (after a
# warm-up partition so model load time doesn't pollute the profiles), then stop.
if [[ "$DOCKER_TEST" == "true" ]]; then
  SCRIPT_PARENT_DIR=$(dirname "$(dirname "$(realpath "$0")")")
  docker run -it --rm -v "$SCRIPT_PARENT_DIR:/home/unstructured/scripts" unstructured:dev /bin/bash -c "
  cd unstructured/
  pip install -r scripts/performance/requirements.txt
  echo \"Warming the Docker container by running a small partitioning job..\"
  python3 -c 'from unstructured.partition.auto import partition; partition(\"'""$SCRIPT_DIR/warmup.pdf'\", strategy=\"hi_res\")[1]'
  ./scripts/performance/profile.sh
  "
  exit 0
fi
| 
 | ||||
# Determine whether a graphical display is available, so the viewers know
# whether browser-based visualizations (flamegraph/table/snakeviz) can open.
# Returns 0 when a display is present, 1 in a headless context.
check_display() {
  # $DISPLAY covers X11 on Linux; system_profiler covers macOS, where
  # $DISPLAY is typically unset even with a monitor attached. The original
  # check used system_profiler alone, so Linux desktops were always
  # treated as headless.
  if [[ -n "$DISPLAY" ]] || system_profiler SPDisplaysDataType 2>/dev/null | grep -q "Display Type"; then
    return 0  # Display is present
  else
    return 1  # Display is not present (headless context)
  fi
}
| 
 | ||||
# Interactive viewer used when no display is available: only the text-based
# memray reports (tree/summary/stats) are offered, against the .bin result.
# Operates on the global result_file; unsets it to send the caller back to
# file selection.
view_profile_headless() {
  # Several of the visualization options require a graphical interface, so
  # headless mode is limited to the memory (.bin) profile's text reports.
  result_file="${result_file%.*}.bin"

  if [[ ! -f "$result_file" ]]; then
    unset result_file  # drop the selection so the caller re-prompts for a file
    echo "Result file not found. Please choose a different profile type or go back."
    return
  fi

  while true; do
    read -r -p "Choose visualization type: (1) tree (2) summary (3) stats (b) back, (q) quit: " -n 1 visualization_type
    echo

    case $visualization_type in
      b)
        unset result_file  # drop the selection so the caller re-prompts for a file
        return
        ;;
      q)
        exit 0
        ;;
      1)
        python3 -m memray tree "$result_file"
        ;;
      2)
        python3 -m memray summary "$result_file"
        ;;
      3)
        python3 -m memray stats "$result_file"
        ;;
      *)
        echo "Invalid visualization type. Please try again."
        ;;
    esac
  done
}
| 
 | ||||
# Interactive viewer used when a display is available. Prompts for a profile
# type (time .prof / memory .bin) on the global result_file, then offers the
# matching visualizations: memray reports for memory, flameprof/snakeviz for
# time. Unsets result_file when the user goes "back" so the caller re-prompts.
view_profile_with_head() {
  while true; do
    read -r -p "Choose profile type: (1) time (2) memory (b) back, (q) quit: " -n 1 profile_type
    echo

    if [[ $profile_type == "b" ]]; then
      unset result_file  # Unset the result_file variable to go back to the "Select a file" view
      break
    elif [[ $profile_type == "q" ]]; then
      exit 0
    fi

    # Map profile type to the result-file extension produced by run_profile.
    if [[ $profile_type == "1" ]]; then
      extension=".prof"
    elif [[ $profile_type == "2" ]]; then
      extension=".bin"
    else
      echo "Invalid profile type. Please try again."
      continue
    fi

    # Swap the extension on the selected result to the chosen profile type.
    result_file="${result_file%.*}$extension"

    if [[ ! -f "$result_file" ]]; then
      echo "Result file not found. Please choose a different profile type or go back."
      continue
    fi

    if [[ $profile_type == "2" ]]; then
      # Memory profile: memray visualizations.
      while true; do
        read -r -p "Choose visualization type: (1) flamegraph (2) table (3) tree (4) summary (5) stats (b) back, (q) quit: " -n 1 visualization_type
        echo

        if [[ $visualization_type == "b" ]]; then
          break
        elif [[ $visualization_type == "q" ]]; then
          exit 0
        fi

        case $visualization_type in
          "1")
            # Remove any stale report first: memray refuses to overwrite.
            rm -f "${result_file}.memray.html"
            python3 -m memray flamegraph -o "${result_file}.memray.html" "$result_file"
            open "${result_file}.memray.html"
            ;;
          "2")
            rm -f "${result_file}.table.html"
            python3 -m memray table -o "${result_file}.table.html" "$result_file"
            open "${result_file}.table.html"
            ;;
          "3")
            python3 -m memray tree "$result_file"
            ;;
          "4")
            python3 -m memray summary "$result_file"
            ;;
          "5")
            python3 -m memray stats "$result_file"
            ;;
          *)
            echo "Invalid visualization type. Please try again."
            ;;
        esac
      done
    else
      # Time profile: cProfile output viewed via flameprof or snakeviz.
      while true; do
        read -r -p "Choose visualization type: (1) flamegraph (2) snakeviz (b) back, (q) quit: " -n 1 visualization_type
        echo

        if [[ $visualization_type == "b" ]]; then
          break
        elif [[ $visualization_type == "q" ]]; then
          exit 0
        fi

        case $visualization_type in
          "1")
            flameprof_file="${result_file}.flameprof.svg"
            rm -f "$flameprof_file"
            python3 -m flameprof "$result_file" > "$flameprof_file"
            open "$flameprof_file"
            ;;
          "2")
            # snakeviz serves the profile and opens a browser tab itself.
            snakeviz "$result_file"
            ;;
          *)
            echo "Invalid visualization type. Please try again."
            ;;
        esac
      done
    fi

    break  # Return to the beginning
  done
}
| 
 | ||||
# Browse and view profiling results.
#   $1 (optional) - path to a specific result file to view directly; when
#   omitted, the user picks from the .bin files in $PROFILE_RESULTS_DIR.
# Dispatches to the head/headless viewer depending on display availability.
view_profile() {
  if [ -n "$1" ]; then
    result_file="$1"
  fi
  while true; do
    if [[ -z $result_file ]]; then
      echo "Available result files:"
      result_files=("$PROFILE_RESULTS_DIR"/*.bin)
      # nullglob is not set, so an unmatched glob leaves the literal pattern
      # in the array and the array length is never 0. Check that the first
      # entry actually exists; the original length-only check made
      # "No result files found" unreachable.
      if [[ ${#result_files[@]} -eq 0 || ! -e "${result_files[0]}" ]]; then
        echo "No result files found."
        return
      fi

      for ((i=0; i<${#result_files[@]}; i++)); do
        # Show just the document name (strip directory and .bin extension).
        filename="${result_files[$i]##*/}"
        filename="${filename%.*}"
        echo "$i. $filename"
      done

      read -r -p "Enter the number corresponding to the result file you want to view (b to go back, q to quit): " selection
      if [[ $selection == "b" ]]; then
        return
      elif [[ $selection == "q" ]]; then
        exit 0
      elif [[ ! $selection =~ ^[0-9]+$ || $selection -ge ${#result_files[@]} ]]; then
        # Reject non-numeric / out-of-range input instead of indexing with it.
        echo "Invalid selection. Please try again."
        continue
      fi

      result_file="${result_files[$selection]}"
    fi

    if check_display; then
      view_profile_with_head "$result_file"
    else
      view_profile_headless "$result_file"
    fi
  done
}
| 
 | ||||
# Interactively choose a document and partition strategy, run both profilers
# (cProfile for time, memray for memory) against run_partition.py, then jump
# straight into the results viewer.
run_profile() {
  while true; do
    read -r -p "Choose an option: 1) Existing test file, (2) Custom file, (b) back, (q) quit: " -n 1 option
    echo

    if [[ $option == "b" ]]; then
      return
    elif [[ $option == "q" ]]; then
      exit 0
    fi

    if [[ $option == "1" ]]; then
      echo "Available test files:"
      test_files=("$SCRIPT_DIR/docs"/*)
      # nullglob is not set, so an empty docs/ leaves the literal pattern in
      # the array and the array length is never 0. Check the first entry
      # exists; the original length-only check made "No test files found"
      # unreachable.
      if [[ ${#test_files[@]} -eq 0 || ! -e "${test_files[0]}" ]]; then
        echo "No test files found."
        return
      fi

      for ((i=0; i<${#test_files[@]}; i++)); do
        echo "$i. ${test_files[$i]}"
      done

      read -r -p "Enter the number corresponding to the test file you want to run followed by return (b to go back, q to quit): " selection
      if [[ $selection == "b" ]]; then
        return
      elif [[ $selection == "q" ]]; then
        exit 0
      elif [[ ! $selection =~ ^[0-9]+$ || $selection -ge ${#test_files[@]} ]]; then
        # Reject non-numeric / out-of-range input instead of indexing with it.
        echo "Invalid selection. Please try again."
        continue
      fi

      test_file="${test_files[$selection]}"
    elif [[ $option == "2" ]]; then
      read -r -p "Enter the path to the custom file: " test_file
    else
      echo "Invalid option. Please try again."
      continue
    fi

    # Delete the output files if they exist
    rm -f "$PROFILE_RESULTS_DIR/${test_file##*/}.prof"
    rm -f "$PROFILE_RESULTS_DIR/${test_file##*/}.bin"

    # Pick the strategy
    while true; do
      read -r -p "Choose a strategy: 1) auto, (2) fast, (3) hi_res, (b) back, (q) quit: " -n 1 strategy_option
      echo

      if [[ $strategy_option == "b" ]]; then
        return
      elif [[ $strategy_option == "q" ]]; then
        exit 0
      fi

      case $strategy_option in
        "1")
          strategy="auto"
          break
          ;;
        "2")
          strategy="fast"
          break
          ;;
        "3")
          strategy="hi_res"
          break
          ;;
        *)
          echo "Invalid strategy option. Please try again."
          ;;
      esac
    done

    echo "Running time profile..."
    python3 -m cProfile -s cumulative -o "$PROFILE_RESULTS_DIR/${test_file##*/}.prof" -m "$MODULE_PATH.run_partition" "$test_file" "$strategy"
    echo "Running memory profile..."
    python3 -m memray run -o "$PROFILE_RESULTS_DIR/${test_file##*/}.bin" -m "$MODULE_PATH.run_partition" "$test_file" "$strategy"
    echo "Profiling completed."
    echo "Viewing results for $test_file"
    result_file=$PROFILE_RESULTS_DIR/$(basename "$test_file")
    view_profile "${result_file}.bin" # Go directly to view mode
  done
}
| 
 | ||||
# Entry point: loop between "run" (1) and "view" (2) modes until quit.
# An optional positional argument preselects the mode for the first
# iteration; the original code assigned it to mode but the interactive
# prompt immediately clobbered it (result_file is unset on entry), so the
# CLI argument never took effect. Consume it once and skip the prompt.
while true; do
  if [[ -n "$1" ]]; then
    mode="$1"
    shift  # use the CLI argument exactly once, then fall back to prompting
  elif [[ -z $result_file ]]; then
    read -r -p "Choose mode: (1) run, (2) view, (q) quit: " -n 1 mode
    echo
  fi

  if [[ $mode == "1" ]]; then
    run_profile
  elif [[ $mode == "2" ]]; then
    unset result_file  # Unset the result_file variable before entering the "View" mode
    view_profile
  elif [[ $mode == "q" ]]; then
    exit 0
  else
    echo "Invalid mode. Please choose 'view', 'run', or 'quit'."
  fi
done
							
								
								
									
										3
									
								
								scripts/performance/requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								scripts/performance/requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,3 @@ | ||||
| flameprof>=0.4 | ||||
| memray>=1.7.0 | ||||
| snakeviz>=2.2.0 | ||||
							
								
								
									
										17
									
								
								scripts/performance/run_partition.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								scripts/performance/run_partition.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,17 @@ | ||||
"""CLI driver used by the profiling scripts.

Partitions a document with a given strategy so cProfile/memray can measure
the call: ``python -m ...run_partition <file_path> <strategy>``.
"""
import sys

from unstructured.partition.auto import partition

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print(
            "Please provide the path to the file as the first argument and the strategy as the "
            "second argument.",
        )
        sys.exit(1)

    file_path = sys.argv[1]
    strategy = sys.argv[2]
    result = partition(file_path, strategy=strategy)
    # Fail loudly if partitioning produced no elements. The original indexed
    # result[1], which also raised IndexError on valid single-element
    # documents and was a no-op expression otherwise.
    if not result:
        print(f"No elements were extracted from {file_path}.")
        sys.exit(1)
							
								
								
									
										
											BIN
										
									
								
								scripts/performance/warmup.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								scripts/performance/warmup.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 ryannikolaidis
						ryannikolaidis