unstructured/scripts/performance/profile.sh

#!/bin/bash

# Performance profiling and visualization of code using cProfile and memray.

# Environment Variables:
#   - DOCKER_TEST: Set to true to run profiling inside a Docker container (default: false)

# Usage: 
# - Run the script and choose the profiling mode at the prompt: (1) run or (2) view.
# - In the 'run' mode, you can profile custom files or select existing test files.
# - In the 'view' mode, you can view previously generated profiling results.
# - The script supports time profiling with cProfile and memory profiling with memray.
# - Users can choose different visualization options such as flamegraphs, tables, trees, summaries, and statistics.
# - Test documents are (optionally) synced from an S3 bucket to a local directory before running the profiles.

# Dependencies:
# - memray package for memory profiling and visualization.
# - flameprof and snakeviz for time profiling and visualization.
# - AWS CLI for syncing files from S3 (if applicable).

# Package dependencies can be installed with `pip install -r scripts/performance/requirements.txt`

# Usage example:
# ./scripts/performance/profile.sh

# NOTE: because memray does not build wheels for ARM-Linux, this script cannot run in an ARM Docker container on an M1 Mac (though emulated AMD would work).


# Validate dependencies
# Verify that python3 can import the given module; abort the script otherwise.
# Arguments: $1 - name of the Python module to import (e.g. memray)
# Outputs:   on failure, an error message on stderr (nothing on stdout)
# Returns:   0 when the import succeeds; exits the script with status 1 when it fails
check_python_module() {
  local module_name=$1

  if ! python3 -c "import $module_name" >/dev/null 2>&1; then
    # Diagnostics go to stderr so they never end up in captured or piped stdout.
    echo "Error: Python module $module_name is not installed. Please install required dependencies with 'pip install -r scripts/performance/requirements.txt'." >&2
    exit 1
  fi
}
# Check, in order, each Python module this script validates up front.
# Every module is handed to check_python_module, which decides what happens
# when an import is not possible.
validate_dependencies() {
  local required_module

  for required_module in memray flameprof; do
    check_python_module "$required_module"
  done
}

# Only validate dependencies when DOCKER_TEST is not "true"; in the Docker path the
# requirements are installed inside the container instead (see the docker run below).
if [[ "$DOCKER_TEST" != "true" ]]; then
  validate_dependencies
fi

# Directory portion of the path this script was invoked with (may be relative,
# e.g. "./scripts/performance").
SCRIPT_DIR=$(dirname "$0")
# Convert the path to Python module notation by replacing every "/" with "."
# (e.g. "./scripts/performance" becomes "..scripts.performance").
MODULE_PATH=${SCRIPT_DIR////.}
# Remove the leading dot if it exists
MODULE_PATH=${MODULE_PATH#.}
# Remove a second leading dot if one remains (e.g. the one produced by a "./" prefix),
# leaving a value such as "scripts.performance" that is later passed to `python3 -m`.
MODULE_PATH=${MODULE_PATH#\.}

# All .prof/.bin outputs are written to, and listed from, this directory.
PROFILE_RESULTS_DIR="$SCRIPT_DIR/profile_results"

# Create PROFILE_RESULTS_DIR if it doesn't exist
mkdir -p "$PROFILE_RESULTS_DIR"

# Docker mode: re-run this script inside the unstructured:dev image and stop here.
if [[ "$DOCKER_TEST" == "true" ]]; then
  # Parent of the directory that contains this script (resolved to an absolute path);
  # it is bind-mounted at /home/unstructured/scripts inside the container.
  SCRIPT_PARENT_DIR=$(dirname "$(dirname "$(realpath "$0")")")
  # The command string is double-quoted, so $SCRIPT_DIR in the warm-up line is expanded
  # by this (host) shell before the container starts. Inside the container the string
  # installs the requirements, runs one hi_res partition as a warm-up, then starts
  # profile.sh again.
  # NOTE(review): DOCKER_TEST is not forwarded to the container, so the nested run
  # presumably takes the non-Docker path — confirm against the image's environment.
  docker run -it --rm -v "$SCRIPT_PARENT_DIR:/home/unstructured/scripts" unstructured:dev /bin/bash -c "
  cd unstructured/
  pip install -r scripts/performance/requirements.txt
  echo \"Warming the Docker container by running a small partitioning job..\"
  python3 -c 'from unstructured.partition.auto import partition; partition(\"'""$SCRIPT_DIR/warmup_docs/warmup.pdf'\", strategy=\"hi_res\")[1]'
  ./scripts/performance/profile.sh
  "
  # The host-side script always ends with status 0 here, whatever docker run returned.
  exit 0
fi

# Report whether a display is attached, judged from system_profiler's output.
# Returns: 0 when the SPDisplaysDataType report contains "Display Type";
#          1 otherwise (headless context, or no usable system_profiler output)
check_display() {
  if ! system_profiler SPDisplaysDataType 2>/dev/null | grep -q "Display Type"; then
    return 1  # No display reported: treat the session as headless
  fi

  return 0  # A display was reported
}

# Interactive viewer for sessions without a display: only memray's terminal
# reports (tree, summary, stats) are offered, all of which read the .bin capture.
# Globals:   result_file (read; its extension is rewritten to .bin; unset when the
#            file is missing or the user chooses "back"), extension and
#            visualization_type (written)
# Arguments: none are read (the caller passes a path, but the body only uses result_file)
# Returns:   to the caller on "back" or when the file is missing; exits the script
#            with status 0 on "q"
view_profile_headless() {
  # Several of the visualization options require a graphical interface, so the
  # headless menu is limited to the text-based memray reports.
  extension=".bin"
  result_file="${result_file%.*}${extension}"

  if ! [[ -f "$result_file" ]]; then
    echo "Result file not found. Please choose a different profile type or go back."
    unset result_file  # Sends the caller back to the "Select a file" view
    return
  fi

  while :; do
    read -r -p "Choose visualization type: (1) tree (2) summary (3) stats (b) back, (q) quit: " -n 1 visualization_type
    echo

    case "$visualization_type" in
      b)
        unset result_file  # Sends the caller back to the "Select a file" view
        break
        ;;
      q)
        exit 0
        ;;
      1)
        python3 -m memray tree "$result_file"
        ;;
      2)
        python3 -m memray summary "$result_file"
        ;;
      3)
        python3 -m memray stats "$result_file"
        ;;
      *)
        echo "Invalid visualization type. Please try again."
        ;;
    esac
  done
}

# Interactive viewer for sessions that have a display attached.
# Asks for a profile type, points result_file at the file with the matching
# extension (.prof for time, .bin for memory), then offers the visualizations
# available for that type until the user goes back or quits.
# Globals:   result_file (read; extension rewritten; unset only when the user
#            backs out of the profile-type prompt), profile_type, extension,
#            visualization_type, flameprof_file (written)
# Arguments: none are read (the caller passes a path, but the body only uses result_file)
# Returns:   to the caller after "back"; exits the script with status 0 on "q"
view_profile_with_head() {
  local html_report

  while :; do
    read -r -p "Choose profile type: (1) time (2) memory (b) back, (q) quit: " -n 1 profile_type
    echo

    case "$profile_type" in
      b)
        unset result_file  # Sends the caller back to the "Select a file" view
        break
        ;;
      q)
        exit 0
        ;;
      1)
        extension=".prof"
        ;;
      2)
        extension=".bin"
        ;;
      *)
        echo "Invalid profile type. Please try again."
        continue
        ;;
    esac

    # Swap the current extension for the one belonging to the chosen profile type.
    result_file="${result_file%.*}${extension}"

    if ! [[ -f "$result_file" ]]; then
      echo "Result file not found. Please choose a different profile type or go back."
      continue
    fi

    if [[ $profile_type != "2" ]]; then
      # Time profile (.prof): cProfile output rendered by flameprof or snakeviz.
      while :; do
        read -r -p "Choose visualization type: (1) flamegraph (2) snakeviz (b) back, (q) quit: " -n 1 visualization_type
        echo

        case "$visualization_type" in
          b)
            break
            ;;
          q)
            exit 0
            ;;
          1)
            flameprof_file="${result_file}.flameprof.svg"
            rm -f "$flameprof_file"
            python3 -m flameprof "$result_file" > "$flameprof_file"
            open "$flameprof_file"
            ;;
          2)
            snakeviz "$result_file"
            ;;
          *)
            echo "Invalid visualization type. Please try again."
            ;;
        esac
      done
    else
      # Memory profile (.bin): memray reports, two of them written as HTML and opened.
      while :; do
        read -r -p "Choose visualization type: (1) flamegraph (2) table (3) tree (4) summary (5) stats (b) back, (q) quit: " -n 1 visualization_type
        echo

        case "$visualization_type" in
          b)
            break
            ;;
          q)
            exit 0
            ;;
          1)
            html_report="${result_file}.memray.html"
            rm -f "$html_report"  # Remove any previous report before regenerating it
            python3 -m memray flamegraph -o "$html_report" "$result_file"
            open "$html_report"
            ;;
          2)
            html_report="${result_file}.table.html"
            rm -f "$html_report"  # Remove any previous report before regenerating it
            python3 -m memray table -o "$html_report" "$result_file"
            open "$html_report"
            ;;
          3)
            python3 -m memray tree "$result_file"
            ;;
          4)
            python3 -m memray summary "$result_file"
            ;;
          5)
            python3 -m memray stats "$result_file"
            ;;
          *)
            echo "Invalid visualization type. Please try again."
            ;;
        esac
      done
    fi

    # Leaving a visualization menu hands control back to the caller with
    # result_file still set.
    break
  done
}

# "View" mode: pick a result file (unless one is supplied) and hand it to the
# viewer that matches the current session (with or without a display).
# Globals:   PROFILE_RESULTS_DIR (read), result_file (read/written; the viewer
#            functions unset it to request a new selection)
# Arguments: $1 - optional path of a result file to view straight away
# Outputs:   the list of available results and prompts on stdout
# Returns:   to the caller on "b" or when no result files exist; exits the
#            script with status 0 on "q" or when input ends
view_profile() {
  local -a result_files
  local candidate filename selection i

  if [[ -n "$1" ]]; then
    result_file="$1"
  fi

  while true; do
    if [[ -z $result_file ]]; then
      echo "Available result files:"

      # Without nullglob an unmatched pattern is kept as a literal string, so only
      # entries that really exist are collected; this makes the empty check reliable.
      result_files=()
      for candidate in "$PROFILE_RESULTS_DIR"/*.bin; do
        if [[ -e "$candidate" ]]; then
          result_files+=("$candidate")
        fi
      done

      if [[ ${#result_files[@]} -eq 0 ]]; then
        echo "No result files found."
        return
      fi

      for ((i = 0; i < ${#result_files[@]}; i++)); do
        filename="${result_files[$i]##*/}"
        filename="${filename%.*}"
        echo "$i. $filename"
      done

      if ! read -r -p "Enter the number corresponding to the result file you want to view (b to go back, q to quit): " selection && [[ -z $selection ]]; then
        # End of input: no further choice can be made, so behave like "q"
        # instead of prompting forever.
        echo
        exit 0
      fi

      if [[ $selection == "b" ]]; then
        return
      elif [[ $selection == "q" ]]; then
        exit 0
      fi

      # Array subscripts are evaluated arithmetically, so only a plain in-range
      # number may be used as an index (10# keeps "08" from being read as octal).
      if ! [[ $selection =~ ^[0-9]{1,9}$ ]] || ((10#$selection >= ${#result_files[@]})); then
        echo "Invalid selection. Please try again."
        continue
      fi

      result_file="${result_files[10#$selection]}"
    fi

    if check_display; then
      view_profile_with_head "$result_file"
    else
      view_profile_headless "$result_file"
    fi
  done
}

# "Run" mode: choose a document and a partitioning strategy, profile the
# run_partition module once with cProfile and once with memray, then open the
# viewer on the fresh results.
# Globals:   SCRIPT_DIR, MODULE_PATH, PROFILE_RESULTS_DIR (read);
#            result_file (written before view_profile is called)
# Arguments: none
# Outputs:   menus, progress messages and profiler output on stdout
# Returns:   to the caller on "b" or when no test files exist; exits the script
#            with status 0 on "q" or when input ends
run_profile() {
  local -a test_files
  local option selection strategy_option strategy test_file candidate output_base i

  while true; do
    if ! read -r -p "Choose an option: 1) Existing test file, (2) Custom file, (b) back, (q) quit: " -n 1 option && [[ -z $option ]]; then
      # End of input: behave like "q" instead of prompting forever.
      echo
      exit 0
    fi
    echo

    if [[ $option == "b" ]]; then
      return
    elif [[ $option == "q" ]]; then
      exit 0
    fi

    if [[ $option == "1" ]]; then
      echo "Available test files:"

      # Without nullglob an unmatched pattern is kept as a literal string, so only
      # entries that really exist are collected; this makes the empty check reliable.
      test_files=()
      for candidate in "$SCRIPT_DIR/docs"/*; do
        if [[ -e "$candidate" ]]; then
          test_files+=("$candidate")
        fi
      done

      if [[ ${#test_files[@]} -eq 0 ]]; then
        echo "No test files found."
        return
      fi

      for ((i = 0; i < ${#test_files[@]}; i++)); do
        echo "$i. ${test_files[$i]}"
      done

      if ! read -r -p "Enter the number corresponding to the test file you want to run followed by return (b to go back, q to quit): " selection && [[ -z $selection ]]; then
        echo
        exit 0
      fi

      if [[ $selection == "b" ]]; then
        return
      elif [[ $selection == "q" ]]; then
        exit 0
      fi

      # Array subscripts are evaluated arithmetically, so only a plain in-range
      # number may be used as an index (10# keeps "08" from being read as octal).
      if ! [[ $selection =~ ^[0-9]{1,9}$ ]] || ((10#$selection >= ${#test_files[@]})); then
        echo "Invalid selection. Please try again."
        continue
      fi

      test_file="${test_files[10#$selection]}"
    elif [[ $option == "2" ]]; then
      if ! read -r -p "Enter the path to the custom file: " test_file && [[ -z $test_file ]]; then
        echo
        exit 0
      fi
    else
      echo "Invalid option. Please try again."
      continue
    fi

    # One base path for everything derived from this document, so the files that
    # are written and the file handed to the viewer can never disagree.
    output_base="$PROFILE_RESULTS_DIR/${test_file##*/}"

    # Delete the output files if they exist
    rm -f "${output_base}.prof"
    rm -f "${output_base}.bin"

    # Pick the strategy
    while true; do
      if ! read -r -p "Choose a strategy: 1) auto, (2) fast, (3) hi_res, (b) back, (q) quit: " -n 1 strategy_option && [[ -z $strategy_option ]]; then
        echo
        exit 0
      fi
      echo

      if [[ $strategy_option == "b" ]]; then
        return
      elif [[ $strategy_option == "q" ]]; then
        exit 0
      fi

      case $strategy_option in
        "1")
          strategy="auto"
          break
          ;;
        "2")
          strategy="fast"
          break
          ;;
        "3")
          strategy="hi_res"
          break
          ;;
        *)
          echo "Invalid strategy option. Please try again."
          ;;
      esac
    done

    echo "Running time profile..."
    python3 -m cProfile -s cumulative -o "${output_base}.prof" -m "$MODULE_PATH.run_partition" "$test_file" "$strategy"
    echo "Running memory profile..."
    python3 -m memray run -o "${output_base}.bin" -m "$MODULE_PATH.run_partition" "$test_file" "$strategy"
    echo "Profiling completed."
    echo "Viewing results for $test_file"
    result_file="$output_base"
    view_profile "${result_file}.bin" # Go directly to view mode
  done
}

# Main menu loop.
# A mode given as the first command-line argument is used for the first pass
# only; it used to be overwritten by the prompt before it was ever acted on.
# After that (or when no argument is given) the mode is read interactively.
# Accepted modes: "1"/"run", "2"/"view", "q"/"quit" (the words are only
# reachable from the command line, since the prompt reads a single key).
# The loop ends only through exit: on quit, or when input ends.
requested_mode="${1:-}"

while true; do
  if [[ -n "$requested_mode" ]]; then
    mode="$requested_mode"
    requested_mode=""  # Consume the argument so later passes show the menu
  else
    if ! read -r -p "Choose mode: (1) run, (2) view, (q) quit: " -n 1 mode; then
      # End of input: no further choice can be made, so quit instead of
      # printing the invalid-mode message forever.
      echo
      exit 0
    fi
    echo
  fi

  case "$mode" in
    1 | run)
      run_profile
      ;;
    2 | view)
      unset result_file  # Unset the result_file variable before entering the "View" mode
      view_profile
      ;;
    q | quit)
      exit 0
      ;;
    *)
      echo "Invalid mode. Please choose '1' (run), '2' (view), or 'q' (quit)."
      ;;
  esac
done
test: adds profiling script (#661) 2023-06-01 14:26:05 -07:00			`#!/bin/bash`

			`# Performance profiling and visualization of code using cProfile and memray.`

			`# Environment Variables:`
			`# - DOCKER_TEST: Set to true to run profiling inside a Docker container (default: false)`

			`# Usage:`
			`# - Run the script and choose the profiling mode: 'run' or 'view'.`
			`# - In the 'run' mode, you can profile custom files or select existing test files.`
			`# - In the 'view' mode, you can view previously generated profiling results.`
			`# - The script supports time profiling with cProfile and memory profiling with memray.`
			`# - Users can choose different visualization options such as flamegraphs, tables, trees, summaries, and statistics.`
			`# - Test documents are (optionally) synced from an S3 bucket to a local directory before running the profiles.`

			`# Dependencies:`
			`# - memray package for memory profiling and visualization.`
			`# - flameprof and snakeviz for time profiling and visualization.`
			`# - AWS CLI for syncing files from S3 (if applicable).`

			# Package dependencies can be installed with `pip install -r scripts/performance/requirements.txt`

			`# Usage example:`
			`# ./scripts/performance/profile.sh`

			`# NOTE: because memray does not build wheels for ARM-Linux, this script can not run in an ARM Docker container on an M1 Mac (though emulated AMD would work).`


			`# Validate dependencies`
			`check_python_module() {`
			`if ! python3 -c "import $1" >/dev/null 2>&1; then`
			`echo "Error: Python module $1 is not installed. Please install required depencies with 'pip install -r scripts/performance/requirements.txt'."`
			`exit 1`
			`fi`
			`}`
			`validate_dependencies() {`
			`check_python_module memray`
			`check_python_module flameprof`
			`}`

			`# only validate in non-docker context (since we install dependencies on the fly in docker)`
			`if [[ "$DOCKER_TEST" != "true" ]]; then`
			`validate_dependencies`
			`fi`

			`SCRIPT_DIR=$(dirname "$0")`
			`# Convert the relative path to module notation`
			`MODULE_PATH=${SCRIPT_DIR////.}`
			`# Remove the leading dot if it exists`
			`MODULE_PATH=${MODULE_PATH#.}`
			`# Remove the leading dot if it exists again`
			`MODULE_PATH=${MODULE_PATH#\.}`

			`PROFILE_RESULTS_DIR="$SCRIPT_DIR/profile_results"`

			`# Create PROFILE_RESULTS_DIR if it doesn't exist`
			`mkdir -p "$PROFILE_RESULTS_DIR"`

			`if [[ "$DOCKER_TEST" == "true" ]]; then`
			`SCRIPT_PARENT_DIR=$(dirname "$(dirname "$(realpath "$0")")")`
			`docker run -it --rm -v "$SCRIPT_PARENT_DIR:/home/unstructured/scripts" unstructured:dev /bin/bash -c "`
			`cd unstructured/`
			`pip install -r scripts/performance/requirements.txt`
			`echo \"Warming the Docker container by running a small partitioning job..\"`
test: add benchmark script (#638) 2023-06-05 09:14:43 -07:00			`python3 -c 'from unstructured.partition.auto import partition; partition(\"'""$SCRIPT_DIR/warmup_docs/warmup.pdf'\", strategy=\"hi_res\")[1]'`
test: adds profiling script (#661) 2023-06-01 14:26:05 -07:00			`./scripts/performance/profile.sh`
			`"`
			`exit 0`
			`fi`

			`check_display() {`
			`if system_profiler SPDisplaysDataType 2>/dev/null \| grep -q "Display Type"; then`
			`return 0 # Display is present`
			`else`
			`return 1 # Display is not present (headless context)`
			`fi`
			`}`

			`view_profile_headless() {`
			`# Several of the visualization options require a graphical interface. If DISPLAY is not set, we can't use those options.`

			`extension=".bin"`
			`result_file="${result_file%.*}$extension"`

			`if [[ ! -f "$result_file" ]]; then`
			`unset result_file # Unset the result_file variable to go back to the "Select a file" view`
			`echo "Result file not found. Please choose a different profile type or go back."`
			`else`
			`while true; do`
			`read -r -p "Choose visualization type: (1) tree (2) summary (3) stats (b) back, (q) quit: " -n 1 visualization_type`
			`echo`

			`if [[ $visualization_type == "b" ]]; then`
			`unset result_file # Unset the result_file variable to go back to the "Select a file" view`
			`break`
			`elif [[ $visualization_type == "q" ]]; then`
			`exit 0`
			`fi`

			`case $visualization_type in`
			`"1")`
			`python3 -m memray tree "$result_file"`
			`;;`
			`"2")`
			`python3 -m memray summary "$result_file"`
			`;;`
			`"3")`
			`python3 -m memray stats "$result_file"`
			`;;`
			`*)`
			`echo "Invalid visualization type. Please try again."`
			`;;`
			`esac`
			`done`
			`fi`
			`}`

			`view_profile_with_head() {`
			`while true; do`
			`read -r -p "Choose profile type: (1) time (2) memory (b) back, (q) quit: " -n 1 profile_type`
			`echo`

			`if [[ $profile_type == "b" ]]; then`
			`unset result_file # Unset the result_file variable to go back to the "Select a file" view`
			`break`
			`elif [[ $profile_type == "q" ]]; then`
			`exit 0`
			`fi`

			`if [[ $profile_type == "1" ]]; then`
			`extension=".prof"`
			`elif [[ $profile_type == "2" ]]; then`
			`extension=".bin"`
			`else`
			`echo "Invalid profile type. Please try again."`
			`continue`
			`fi`

			`result_file="${result_file%.*}$extension"`

			`if [[ ! -f "$result_file" ]]; then`
			`echo "Result file not found. Please choose a different profile type or go back."`
			`continue`
			`fi`

			`if [[ $profile_type == "2" ]]; then`
			`while true; do`
			`read -r -p "Choose visualization type: (1) flamegraph (2) table (3) tree (4) summary (5) stats (b) back, (q) quit: " -n 1 visualization_type`
			`echo`

			`if [[ $visualization_type == "b" ]]; then`
			`break`
			`elif [[ $visualization_type == "q" ]]; then`
			`exit 0`
			`fi`

			`case $visualization_type in`
			`"1")`
			`rm -f "${result_file}.memray.html"`
			`python3 -m memray flamegraph -o "${result_file}.memray.html" "$result_file"`
			`open "${result_file}.memray.html"`
			`;;`
			`"2")`
			`rm -f "${result_file}.table.html"`
			`python3 -m memray table -o "${result_file}.table.html" "$result_file"`
			`open "${result_file}.table.html"`
			`;;`
			`"3")`
			`python3 -m memray tree "$result_file"`
			`;;`
			`"4")`
			`python3 -m memray summary "$result_file"`
			`;;`
			`"5")`
			`python3 -m memray stats "$result_file"`
			`;;`
			`*)`
			`echo "Invalid visualization type. Please try again."`
			`;;`
			`esac`
			`done`
			`else`
			`while true; do`
			`read -r -p "Choose visualization type: (1) flamegraph (2) snakeviz (b) back, (q) quit: " -n 1 visualization_type`
			`echo`

			`if [[ $visualization_type == "b" ]]; then`
			`break`
			`elif [[ $visualization_type == "q" ]]; then`
			`exit 0`
			`fi`

			`case $visualization_type in`
			`"1")`
			`flameprof_file="${result_file}.flameprof.svg"`
			`rm -f "$flameprof_file"`
			`python3 -m flameprof "$result_file" > "$flameprof_file"`
			`open "$flameprof_file"`
			`;;`
			`"2")`
			`snakeviz "$result_file"`
			`;;`
			`*)`
			`echo "Invalid visualization type. Please try again."`
			`;;`
			`esac`
			`done`
			`fi`

			`break # Return to the beginning`
			`done`
			`}`

			`view_profile() {`

			`if [ -n "$1" ]; then`
			`result_file="$1"`
			`fi`
			`while true; do`
			`if [[ -z $result_file ]]; then`
			`echo "Available result files:"`
			`result_files=("$PROFILE_RESULTS_DIR"/*.bin)`
			`if [[ ${#result_files[@]} -eq 0 ]]; then`
			`echo "No result files found."`
			`return`
			`fi`

			`for ((i=0; i<${#result_files[@]}; i++)); do`
			`filename="${result_files[$i]##*/}"`
			`filename="${filename%.*}"`
			`echo "$i. $filename"`
			`done`

			`read -r -p "Enter the number corresponding to the result file you want to view (b to go back, q to quit): " selection`
			`if [[ $selection == "b" ]]; then`
			`return`
			`elif [[ $selection == "q" ]]; then`
			`exit 0`
			`fi`

			`result_file="${result_files[$selection]}"`
			`fi`

			`if check_display; then`
			`view_profile_with_head "$result_file"`
			`else`
			`view_profile_headless "$result_file"`
			`fi`
			`done`
			`}`

			`run_profile() {`
			`while true; do`
			`read -r -p "Choose an option: 1) Existing test file, (2) Custom file, (b) back, (q) quit: " -n 1 option`
			`echo`

			`if [[ $option == "b" ]]; then`
			`return`
			`elif [[ $option == "q" ]]; then`
			`exit 0`
			`fi`

			`if [[ $option == "1" ]]; then`
			`echo "Available test files:"`
			`test_files=("$SCRIPT_DIR/docs"/*)`
			`if [[ ${#test_files[@]} -eq 0 ]]; then`
			`echo "No test files found."`
			`return`
			`fi`

			`for ((i=0; i<${#test_files[@]}; i++)); do`
			`echo "$i. ${test_files[$i]}"`
			`done`

			`read -r -p "Enter the number corresponding to the test file you want to run followed by return (b to go back, q to quit): " selection`
			`if [[ $selection == "b" ]]; then`
			`return`
			`elif [[ $selection == "q" ]]; then`
			`exit 0`
			`fi`

			`test_file="${test_files[$selection]}"`
			`elif [[ $option == "2" ]]; then`
			`read -r -p "Enter the path to the custom file: " test_file`
			`else`
			`echo "Invalid option. Please try again."`
			`continue`
			`fi`

			`# Delete the output files if they exist`
			`rm -f "$PROFILE_RESULTS_DIR/${test_file##*/}.prof"`
			`rm -f "$PROFILE_RESULTS_DIR/${test_file##*/}.bin"`

			`# Pick the strategy`
			`while true; do`
			`read -r -p "Choose a strategy: 1) auto, (2) fast, (3) hi_res, (b) back, (q) quit: " -n 1 strategy_option`
			`echo`

			`if [[ $strategy_option == "b" ]]; then`
			`return`
			`elif [[ $strategy_option == "q" ]]; then`
			`exit 0`
			`fi`

			`case $strategy_option in`
			`"1")`
			`strategy="auto"`
			`break`
			`;;`
			`"2")`
			`strategy="fast"`
			`break`
			`;;`
			`"3")`
			`strategy="hi_res"`
			`break`
			`;;`
			`*)`
			`echo "Invalid strategy option. Please try again."`
			`;;`
			`esac`
			`done`

			`echo "Running time profile..."`
			`python3 -m cProfile -s cumulative -o "$PROFILE_RESULTS_DIR/${test_file##*/}.prof" -m "$MODULE_PATH.run_partition" "$test_file" "$strategy"`
			`echo "Running memory profile..."`
			`python3 -m memray run -o "$PROFILE_RESULTS_DIR/${test_file##*/}.bin" -m "$MODULE_PATH.run_partition" "$test_file" "$strategy"`
			`echo "Profiling completed."`
			`echo "Viewing results for $test_file"`
			`result_file=$PROFILE_RESULTS_DIR/$(basename "$test_file")`
			`view_profile "${result_file}.bin" # Go directly to view mode`
			`done`
			`}`

			`while true; do`
			`if [[ -n "$1" ]]; then`
			`mode="$1"`
			`fi`

			`if [[ -z $result_file ]]; then`
			`read -r -p "Choose mode: (1) run, (2) view, (q) quit: " -n 1 mode`
			`echo`
			`fi`

			`if [[ $mode == "1" ]]; then`
			`run_profile`
			`elif [[ $mode == "2" ]]; then`
			`unset result_file # Unset the result_file variable before entering the "View" mode`
			`view_profile`
			`elif [[ $mode == "q" ]]; then`
			`exit 0`
			`else`
			`echo "Invalid mode. Please choose 'view', 'run', or 'quit'."`
			`fi`
			`done`