test: adds profiling script (#661)

2025-12-04 19:16:03 +00:00 · 2023-06-01 14:26:05 -07:00 · 2023-06-01 14:26:05 -07:00 · bdef4fd398
commit bdef4fd398
parent c35fff2972
7 changed files with 427 additions and 2 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -31,11 +31,10 @@ jobs:
      with:
        python-version: ${{ matrix.python-version }}
    - name: Setup virtual environment (no cache hit)
-      if: steps.virtualenv-cache.outputs.cache-hit != 'true'
      run: |
        python${{ matrix.python-version }} -m venv .venv
        source .venv/bin/activate
-        mkdir "$NLTK_DATA"
+        [ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
        make install-ci

  check-deps:
--- a/scripts/performance/.gitignore
+++ b/scripts/performance/.gitignore
@ -0,0 +1,3 @@
+docs
+benchmark_results
+profile_results
--- a/scripts/performance/README.md
+++ b/scripts/performance/README.md
@ -0,0 +1,39 @@
+# Performance
+This is a collection of tools helpful for inspecting and tracking performance of the Unstructured library. 
+
+The benchmarking script allows a user to track the time taken to partition a fixed set of test documents and to store those results in S3, tagged with architecture, instance type, and git hash.
+
+The profiling script allows a user to inspect how time and memory are spent across called functions when performing partitioning on a given document.
+
+## Install
+Benchmarking requires no additional dependencies and should work without any initial setup.
+Profiling has a few dependencies which can be installed with: 
+`pip install -r scripts/performance/requirements.txt`
+
+Before running a test, either populate the `docs` directory with test documents of interest or set the environment variable `SYNC_S3_DOCS=true`.
+
+## Run
+### Benchmark
+Export / assign desired environment variable settings:
+- DOCKER_TEST: Set to true to run benchmark inside a Docker container (default: false)
+- NUM_ITERATIONS: Number of iterations for benchmark (e.g., 100) (default: 3)
+- INSTANCE_TYPE: Type of benchmark instance (e.g., "c5.xlarge") (default: unspecified)
+- PUBLISH_RESULTS: Set to true to publish results to S3 bucket (default: false)
+- SYNC_S3_DOCS: Set to true to sync test documents from S3 (default: false)
+
+Usage: `./scripts/performance/benchmark.sh`
+
+### Profile
+
+Export / assign desired environment variable settings:
+- SYNC_S3_DOCS: Set to true to sync test documents from S3 (default: false)
+- DOCKER_TEST: Set to true to run profiling inside a Docker container (default: false)
+
+Usage: `./scripts/performance/profile.sh`
+- Run the script and choose the profiling mode: 'run' or 'view'.
+- In the 'run' mode, you can profile custom files or select existing test files.
+- In the 'view' mode, you can view previously generated profiling results.
+- The script supports time profiling with cProfile and memory profiling with memray.
+- Users can choose different visualization options such as flamegraphs, tables, trees, summaries, and statistics.
+- Test documents are (optionally) synced from an S3 bucket to a local directory before running the profiles.
+
--- a/scripts/performance/profile.sh
+++ b/scripts/performance/profile.sh
@ -0,0 +1,364 @@
+#!/bin/bash
+
+# Performance profiling and visualization of code using cProfile and memray.
+
+# Environment Variables:
+#   - SYNC_S3_DOCS: Set to true to sync test documents from S3 (default: false)
+#   - DOCKER_TEST: Set to true to run profiling inside a Docker container (default: false)
+
+# Usage: 
+# - Run the script and choose the profiling mode: 'run' or 'view'.
+# - In the 'run' mode, you can profile custom files or select existing test files.
+# - In the 'view' mode, you can view previously generated profiling results.
+# - The script supports time profiling with cProfile and memory profiling with memray.
+# - Users can choose different visualization options such as flamegraphs, tables, trees, summaries, and statistics.
+# - Test documents are (optionally) synced from an S3 bucket to a local directory before running the profiles.
+
+# Dependencies:
+# - memray package for memory profiling and visualization.
+# - flameprof and snakeviz for time profiling and visualization.
+# - AWS CLI for syncing files from S3 (if applicable).
+
+# Package dependencies can be installed with `pip install -r scripts/performance/requirements.txt`
+
+# Usage example:
+# ./scripts/performance/profile.sh
+
+# NOTE: because memray does not build wheels for ARM-Linux, this script cannot run in an ARM Docker container on an M1 Mac (though emulated AMD would work).
+
+
+# Validate dependencies
+# Abort the script unless the given Python module can be imported by python3.
+# Arguments:
+#   $1 - module name, interpolated into `python3 -c "import <name>"`
+# Outputs:
+#   an installation hint on stderr when the import fails
+# Exits:
+#   with status 1 when the import fails; otherwise returns normally
+check_python_module() {
+  # The import's own output is discarded; only its exit status matters here
+  if ! python3 -c "import $1" >/dev/null 2>&1; then
+    echo "Error: Python module $1 is not installed. Please install required dependencies with 'pip install -r scripts/performance/requirements.txt'." >&2
+    exit 1
+  fi
+}
+validate_dependencies() {
+  check_python_module memray 
+  check_python_module flameprof
+}
+
+# only validate in non-docker context (since we install dependencies on the fly in docker)
+if [[ "$DOCKER_TEST" != "true" ]]; then
+  validate_dependencies
+fi
+
+SCRIPT_DIR=$(dirname "$0")
+# Convert the relative path to module notation
+MODULE_PATH=${SCRIPT_DIR////.}
+# Remove the leading dot if it exists
+MODULE_PATH=${MODULE_PATH#.}
+# Remove the leading dot if it exists again
+MODULE_PATH=${MODULE_PATH#\.}
+
+PROFILE_RESULTS_DIR="$SCRIPT_DIR/profile_results"
+
+S3_BUCKET="utic-dev-tech-fixtures"
+S3_DOCS_DIR="performance-test/docs"
+
+# Create PROFILE_RESULTS_DIR if it doesn't exist
+mkdir -p "$PROFILE_RESULTS_DIR"
+
+if [[ "$SYNC_S3_DOCS" == "true" ]]; then
+  # Sync files from S3 to the local "docs" directory
+  aws s3 sync "s3://$S3_BUCKET/$S3_DOCS_DIR" "$SCRIPT_DIR/docs"
+fi
+
+if [[ "$DOCKER_TEST" == "true" ]]; then
+  SCRIPT_PARENT_DIR=$(dirname "$(dirname "$(realpath "$0")")")
+  docker run -it --rm -v "$SCRIPT_PARENT_DIR:/home/unstructured/scripts" unstructured:dev /bin/bash -c "
+  cd unstructured/
+  pip install -r scripts/performance/requirements.txt
+  echo \"Warming the Docker container by running a small partitioning job..\"
+  python3 -c 'from unstructured.partition.auto import partition; partition(\"'""$SCRIPT_DIR/warmup.pdf'\", strategy=\"hi_res\")[1]'
+  ./scripts/performance/profile.sh
+  "
+  exit 0
+fi
+
+check_display() {
+  if system_profiler SPDisplaysDataType 2>/dev/null | grep -q "Display Type"; then
+    return 0  # Display is present
+  else
+    return 1  # Display is not present (headless context)
+  fi
+}
+
+view_profile_headless() {
+  # Several of the visualization options require a graphical interface. If DISPLAY is not set, we can't use those options.
+
+  extension=".bin"
+  result_file="${result_file%.*}$extension"
+
+  if [[ ! -f "$result_file" ]]; then
+    unset result_file  # Unset the result_file variable to go back to the "Select a file" view
+    echo "Result file not found. Please choose a different profile type or go back."
+  else
+    while true; do
+      read -r -p "Choose visualization type: (1) tree (2) summary (3) stats (b) back, (q) quit: " -n 1 visualization_type
+      echo
+
+      if [[ $visualization_type == "b" ]]; then
+        unset result_file  # Unset the result_file variable to go back to the "Select a file" view
+        break
+      elif [[ $visualization_type == "q" ]]; then
+        exit 0
+      fi
+
+      case $visualization_type in
+        "1")
+          python3 -m memray tree "$result_file"
+          ;;
+        "2")
+          python3 -m memray summary "$result_file"
+          ;;
+        "3")
+          python3 -m memray stats "$result_file"
+          ;;
+        *)
+          echo "Invalid visualization type. Please try again."
+          ;;
+      esac
+    done
+  fi
+}
+
+view_profile_with_head() {
+  while true; do
+    read -r -p "Choose profile type: (1) time (2) memory (b) back, (q) quit: " -n 1 profile_type
+    echo
+
+    if [[ $profile_type == "b" ]]; then
+      unset result_file  # Unset the result_file variable to go back to the "Select a file" view
+      break
+    elif [[ $profile_type == "q" ]]; then
+      exit 0
+    fi
+
+    if [[ $profile_type == "1" ]]; then
+      extension=".prof"
+    elif [[ $profile_type == "2" ]]; then
+      extension=".bin"
+    else
+      echo "Invalid profile type. Please try again."
+      continue
+    fi
+
+    result_file="${result_file%.*}$extension"
+
+    if [[ ! -f "$result_file" ]]; then
+      echo "Result file not found. Please choose a different profile type or go back."
+      continue
+    fi
+
+    if [[ $profile_type == "2" ]]; then
+      while true; do
+        read -r -p "Choose visualization type: (1) flamegraph (2) table (3) tree (4) summary (5) stats (b) back, (q) quit: " -n 1 visualization_type
+        echo
+
+        if [[ $visualization_type == "b" ]]; then
+          break
+        elif [[ $visualization_type == "q" ]]; then
+          exit 0
+        fi
+
+        case $visualization_type in
+          "1")
+            rm -f "${result_file}.memray.html"
+            python3 -m memray flamegraph -o "${result_file}.memray.html" "$result_file"
+            open "${result_file}.memray.html"
+            ;;
+          "2")
+            rm -f "${result_file}.table.html"
+            python3 -m memray table -o "${result_file}.table.html" "$result_file"
+            open "${result_file}.table.html"
+            ;;
+          "3")
+            python3 -m memray tree "$result_file"
+            ;;
+          "4")
+            python3 -m memray summary "$result_file"
+            ;;
+          "5")
+            python3 -m memray stats "$result_file"
+            ;;
+          *)
+            echo "Invalid visualization type. Please try again."
+            ;;
+        esac
+      done
+    else
+      while true; do
+        read -r -p "Choose visualization type: (1) flamegraph (2) snakeviz (b) back, (q) quit: " -n 1 visualization_type
+        echo
+
+        if [[ $visualization_type == "b" ]]; then
+          break
+        elif [[ $visualization_type == "q" ]]; then
+          exit 0
+        fi
+
+        case $visualization_type in
+          "1")
+            flameprof_file="${result_file}.flameprof.svg"
+            rm -f "$flameprof_file"
+            python3 -m flameprof "$result_file" > "$flameprof_file"
+            open "$flameprof_file"
+            ;;
+          "2")
+            snakeviz "$result_file"
+            ;;
+          *)
+            echo "Invalid visualization type. Please try again."
+            ;;
+        esac
+      done
+    fi
+
+    break  # Return to the beginning
+  done
+}
+
+view_profile() {
+
+  if [ -n "$1" ]; then
+    result_file="$1"
+  fi
+  while true; do
+    if [[ -z $result_file ]]; then
+      echo "Available result files:"
+      result_files=("$PROFILE_RESULTS_DIR"/*.bin)
+      if [[ ${#result_files[@]} -eq 0 ]]; then
+        echo "No result files found."
+        return
+      fi
+
+      for ((i=0; i<${#result_files[@]}; i++)); do
+        filename="${result_files[$i]##*/}"
+        filename="${filename%.*}"
+        echo "$i. $filename"
+      done
+
+      read -r -p "Enter the number corresponding to the result file you want to view (b to go back, q to quit): " selection
+      if [[ $selection == "b" ]]; then
+        return
+      elif [[ $selection == "q" ]]; then
+        exit 0
+      fi
+
+      result_file="${result_files[$selection]}"
+    fi
+
+    if check_display; then
+      view_profile_with_head "$result_file"
+    else
+      view_profile_headless "$result_file"
+    fi
+  done
+}
+
+run_profile() {
+  while true; do
+    read -r -p "Choose an option: 1) Existing test file, (2) Custom file, (b) back, (q) quit: " -n 1 option
+    echo
+
+    if [[ $option == "b" ]]; then
+      return
+    elif [[ $option == "q" ]]; then
+      exit 0
+    fi
+
+    if [[ $option == "1" ]]; then
+      echo "Available test files:"
+      test_files=("$SCRIPT_DIR/docs"/*)
+      if [[ ${#test_files[@]} -eq 0 ]]; then
+        echo "No test files found."
+        return
+      fi
+
+      for ((i=0; i<${#test_files[@]}; i++)); do
+        echo "$i. ${test_files[$i]}"
+      done
+
+      read -r -p "Enter the number corresponding to the test file you want to run followed by return (b to go back, q to quit): " selection
+      if [[ $selection == "b" ]]; then
+        return
+      elif [[ $selection == "q" ]]; then
+        exit 0
+      fi
+
+      test_file="${test_files[$selection]}"
+    elif [[ $option == "2" ]]; then
+      read -r -p "Enter the path to the custom file: " test_file
+    else
+      echo "Invalid option. Please try again."
+      continue
+    fi
+
+    # Delete the output files if they exist
+    rm -f "$PROFILE_RESULTS_DIR/${test_file##*/}.prof"
+    rm -f "$PROFILE_RESULTS_DIR/${test_file##*/}.bin"
+
+    # Pick the strategy
+    while true; do
+      read -r -p "Choose a strategy: 1) auto, (2) fast, (3) hi_res, (b) back, (q) quit: " -n 1 strategy_option
+      echo
+
+      if [[ $strategy_option == "b" ]]; then
+        return
+      elif [[ $strategy_option == "q" ]]; then
+        exit 0
+      fi
+
+      case $strategy_option in
+        "1")
+          strategy="auto"
+          break
+          ;;
+        "2")
+          strategy="fast"
+          break
+          ;;
+        "3")
+          strategy="hi_res"
+          break
+          ;;
+        *)
+          echo "Invalid strategy option. Please try again."
+          ;;
+      esac
+    done
+
+    echo "Running time profile..."
+    python3 -m cProfile -s cumulative -o "$PROFILE_RESULTS_DIR/${test_file##*/}.prof" -m "$MODULE_PATH.run_partition" "$test_file" "$strategy"
+    echo "Running memory profile..."
+    python3 -m memray run -o "$PROFILE_RESULTS_DIR/${test_file##*/}.bin" -m "$MODULE_PATH.run_partition" "$test_file" "$strategy"
+    echo "Profiling completed."
+    echo "Viewing results for $test_file"
+    result_file=$PROFILE_RESULTS_DIR/$(basename "$test_file")
+    view_profile "${result_file}.bin" # Go directly to view mode
+  done
+}
+
+while true; do
+  if [[ -n "$1" ]]; then
+    mode="$1"
+  fi
+  
+  if [[ -z $result_file ]]; then
+    read -r -p "Choose mode: (1) run, (2) view, (q) quit: " -n 1 mode
+    echo
+  fi
+
+  if [[ $mode == "1" ]]; then
+    run_profile
+  elif [[ $mode == "2" ]]; then
+    unset result_file  # Unset the result_file variable before entering the "View" mode
+    view_profile
+  elif [[ $mode == "q" ]]; then
+    exit 0
+  else
+    echo "Invalid mode. Please choose 'view', 'run', or 'quit'."
+  fi
+done
--- a/scripts/performance/requirements.txt
+++ b/scripts/performance/requirements.txt
@ -0,0 +1,3 @@
+flameprof>=0.4
+memray>=1.7.0
+snakeviz>=2.2.0
--- a/scripts/performance/run_partition.py
+++ b/scripts/performance/run_partition.py
@ -0,0 +1,17 @@
+import sys
+
+from unstructured.partition.auto import partition
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print(
+            "Please provide the path to the file as the first argument and the strategy as the "
+            "second argument.",
+        )
+        sys.exit(1)
+
+    file_path = sys.argv[1]
+    strategy = sys.argv[2]
+    result = partition(file_path, strategy=strategy)
+    # access element in the return value to make sure we got something back, otherwise error
+    result[1]
--- a/scripts/performance/warmup.pdf
+++ b/scripts/performance/warmup.pdf