mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
test: adds profiling script (#661)
This commit is contained in:
parent
c35fff2972
commit
bdef4fd398
3
.github/workflows/ci.yml
vendored
3
.github/workflows/ci.yml
vendored
@ -31,11 +31,10 @@ jobs:
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Setup virtual environment (no cache hit)
|
||||
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
python${{ matrix.python-version }} -m venv .venv
|
||||
source .venv/bin/activate
|
||||
mkdir "$NLTK_DATA"
|
||||
[ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
|
||||
make install-ci
|
||||
|
||||
check-deps:
|
||||
|
3
scripts/performance/.gitignore
vendored
Normal file
3
scripts/performance/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
docs
|
||||
benchmark_results
|
||||
profile_results
|
39
scripts/performance/README.md
Normal file
39
scripts/performance/README.md
Normal file
@ -0,0 +1,39 @@
|
||||
# Performance
|
||||
This is a collection of tools helpful for inspecting and tracking performance of the Unstructured library.
|
||||
|
||||
The benchmarking script allows a user to track performance time to partitioning results against a fixed set of test documents and store those results with indication of architecture, instance type, and git hash, in S3.
|
||||
|
||||
The profiling script allows a user to inspect how time and memory are spent across called functions when performing partitioning on a given document.
|
||||
|
||||
## Install
|
||||
Benchmarking requires no additional dependencies and should work without any initial setup.
|
||||
Profiling has a few dependencies which can be installed with:
|
||||
`pip install -r scripts/performance/requirements.txt`
|
||||
|
||||
Before running a test either populate the `docs` directory with test documents of interest or set environment variable SYNC_S3_DOCS=true
|
||||
|
||||
## Run
|
||||
### Benchmark
|
||||
Export / assign desired environment variable settings:
|
||||
- DOCKER_TEST: Set to true to run benchmark inside a Docker container (default: false)
|
||||
- NUM_ITERATIONS: Number of iterations for benchmark (e.g., 100) (default: 3)
|
||||
- INSTANCE_TYPE: Type of benchmark instance (e.g., "c5.xlarge") (default: unspecified)
|
||||
- PUBLISH_RESULTS: Set to true to publish results to S3 bucket (default: false)
|
||||
- SYNC_S3_DOCS: Set to true to sync test documents from S3 (default: false)
|
||||
-
|
||||
Usage: `./scripts/performance/benchmark.sh`
|
||||
|
||||
### Profile
|
||||
|
||||
Export / assign desired environment variable settings:
|
||||
- SYNC_S3_DOCS: Set to true to sync test documents from S3 (default: false)
|
||||
- DOCKER_TEST: Set to true to run profiling inside a Docker container (default: false)
|
||||
|
||||
Usage: `./scripts/performance/profile.sh`
|
||||
- Run the script and choose the profiling mode: 'run' or 'view'.
|
||||
- In the 'run' mode, you can profile custom files or select existing test files.
|
||||
- In the 'view' mode, you can view previously generated profiling results.
|
||||
- The script supports time profiling with cProfile and memory profiling with memray.
|
||||
- Users can choose different visualization options such as flamegraphs, tables, trees, summaries, and statistics.
|
||||
- Test documents are synced from an S3 bucket to a local directory before running the profiles
|
||||
|
364
scripts/performance/profile.sh
Executable file
364
scripts/performance/profile.sh
Executable file
@ -0,0 +1,364 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Performance profiling and visualization of code using cProfile and memray.
|
||||
|
||||
# Environment Variables:
|
||||
# - SYNC_S3_DOCS: Set to true to sync test documents from S3 (default: false)
|
||||
# - DOCKER_TEST: Set to true to run profiling inside a Docker container (default: false)
|
||||
|
||||
# Usage:
|
||||
# - Run the script and choose the profiling mode: 'run' or 'view'.
|
||||
# - In the 'run' mode, you can profile custom files or select existing test files.
|
||||
# - In the 'view' mode, you can view previously generated profiling results.
|
||||
# - The script supports time profiling with cProfile and memory profiling with memray.
|
||||
# - Users can choose different visualization options such as flamegraphs, tables, trees, summaries, and statistics.
|
||||
# - Test documents are (optionally) synced from an S3 bucket to a local directory before running the profiles.
|
||||
|
||||
# Dependencies:
|
||||
# - memray package for memory profiling and visualization.
|
||||
# - flameprof and snakeviz for time profiling and visualization.
|
||||
# - AWS CLI for syncing files from S3 (if applicable).
|
||||
|
||||
# Package dependencies can be installed with `pip install -r scripts/performance/requirements.txt`
|
||||
|
||||
# Usage example:
|
||||
# ./scripts/performance/profile.sh
|
||||
|
||||
# NOTE: because memray does not build wheels for ARM-Linux, this script can not run in an ARM Docker container on an M1 Mac (though emulated AMD would work).
|
||||
|
||||
|
||||
# Validate dependencies
|
||||
# Verify that a Python module is importable by python3; exit the script with
# an actionable message if it is not.
# Arguments:
#   $1 - name of the Python module to check
check_python_module() {
  if ! python3 -c "import $1" >/dev/null 2>&1; then
    # Fixed typo ("depencies") and route the diagnostic to stderr.
    echo "Error: Python module $1 is not installed. Please install required dependencies with 'pip install -r scripts/performance/requirements.txt'." >&2
    exit 1
  fi
}
|
||||
# Ensure every Python module needed for profiling/visualization is importable;
# check_python_module exits the script with an error message on the first miss.
validate_dependencies() {
  local module
  for module in memray flameprof; do
    check_python_module "$module"
  done
}
|
||||
|
||||
# only validate in non-docker context (since we install dependencies on the fly in docker)
if [[ "$DOCKER_TEST" != "true" ]]; then
  validate_dependencies
fi

SCRIPT_DIR=$(dirname "$0")
# Convert the relative path to Python module notation (slashes -> dots) so the
# helper can be launched with `python3 -m "$MODULE_PATH.run_partition"`.
MODULE_PATH=${SCRIPT_DIR////.}
# A "./"-style path becomes "..scripts.…" above, so strip up to two leading dots.
MODULE_PATH=${MODULE_PATH#.}
MODULE_PATH=${MODULE_PATH#.}

PROFILE_RESULTS_DIR="$SCRIPT_DIR/profile_results"

S3_BUCKET="utic-dev-tech-fixtures"
S3_DOCS_DIR="performance-test/docs"

# Create PROFILE_RESULTS_DIR if it doesn't exist
mkdir -p "$PROFILE_RESULTS_DIR"

if [[ "$SYNC_S3_DOCS" == "true" ]]; then
  # Sync test documents from S3 to the local "docs" directory
  aws s3 sync "s3://$S3_BUCKET/$S3_DOCS_DIR" "$SCRIPT_DIR/docs"
fi

if [[ "$DOCKER_TEST" == "true" ]]; then
  SCRIPT_PARENT_DIR=$(dirname "$(dirname "$(realpath "$0")")")
  # Re-run this same script inside the dev container. The warmup path is
  # expanded by the host shell before docker runs; the previous quoting
  # ('""$SCRIPT_DIR…') was imbalanced and broke on paths with special
  # characters — a single escaped double quote on each side is sufficient here.
  docker run -it --rm -v "$SCRIPT_PARENT_DIR:/home/unstructured/scripts" unstructured:dev /bin/bash -c "
    cd unstructured/
    pip install -r scripts/performance/requirements.txt
    echo \"Warming the Docker container by running a small partitioning job..\"
    python3 -c 'from unstructured.partition.auto import partition; partition(\"$SCRIPT_DIR/warmup.pdf\", strategy=\"hi_res\")[1]'
    ./scripts/performance/profile.sh
    "
  exit 0
fi
|
||||
|
||||
# Detect whether a graphical display is present.
# NOTE(review): `system_profiler` is a macOS tool; elsewhere it is missing and
# (with stderr discarded) grep sees no input, so this reports headless — which
# is the safe fallback.
# Returns: 0 when a display is present, 1 in a headless context.
check_display() {
  system_profiler SPDisplaysDataType 2>/dev/null | grep -q "Display Type"
}
|
||||
|
||||
# Interactive result browser for headless contexts (no graphical display).
# Only memray's terminal reporters (tree/summary/stats) are offered, and only
# the memory (.bin) result is viewable — the HTML/SVG reporters need a display.
# Operates on the global result_file; unsets it to signal "go back to file
# selection" to the caller (view_profile).
# NOTE(review): the caller passes the file as $1 but this function reads the
# global result_file instead — it works because the caller assigns the same
# global; confirm this coupling is intentional.
view_profile_headless() {
  # Several of the visualization options require a graphical interface. If DISPLAY is not set, we can't use those options.

  # Headless viewing only supports the memory profile (.bin).
  extension=".bin"
  result_file="${result_file%.*}$extension"

  if [[ ! -f "$result_file" ]]; then
    unset result_file # Unset the result_file variable to go back to the "Select a file" view
    echo "Result file not found. Please choose a different profile type or go back."
  else
    while true; do
      read -r -p "Choose visualization type: (1) tree (2) summary (3) stats (b) back, (q) quit: " -n 1 visualization_type
      echo

      if [[ $visualization_type == "b" ]]; then
        unset result_file # Unset the result_file variable to go back to the "Select a file" view
        break
      elif [[ $visualization_type == "q" ]]; then
        exit 0
      fi

      # Dispatch to the matching memray terminal reporter.
      case $visualization_type in
        "1")
          python3 -m memray tree "$result_file"
          ;;
        "2")
          python3 -m memray summary "$result_file"
          ;;
        "3")
          python3 -m memray stats "$result_file"
          ;;
        *)
          echo "Invalid visualization type. Please try again."
          ;;
      esac
    done
  fi
}
|
||||
|
||||
# Interactive result browser for contexts WITH a graphical display.
# Operates on the global result_file (set by view_profile): swaps its extension
# to match the chosen profile type (.prof = cProfile time data, .bin = memray
# memory data) and renders it with the selected tool.
# NOTE(review): uses `open` to launch HTML/SVG output — macOS-specific (Linux
# would need xdg-open); consistent with the macOS-only check_display above.
view_profile_with_head() {
  while true; do
    read -r -p "Choose profile type: (1) time (2) memory (b) back, (q) quit: " -n 1 profile_type
    echo

    if [[ $profile_type == "b" ]]; then
      unset result_file # Unset the result_file variable to go back to the "Select a file" view
      break
    elif [[ $profile_type == "q" ]]; then
      exit 0
    fi

    # Map the chosen profile type to the on-disk result extension.
    if [[ $profile_type == "1" ]]; then
      extension=".prof"
    elif [[ $profile_type == "2" ]]; then
      extension=".bin"
    else
      echo "Invalid profile type. Please try again."
      continue
    fi

    # Point at the sibling result file carrying the chosen extension.
    result_file="${result_file%.*}$extension"

    if [[ ! -f "$result_file" ]]; then
      echo "Result file not found. Please choose a different profile type or go back."
      continue
    fi

    if [[ $profile_type == "2" ]]; then
      # Memory profile: memray offers HTML reporters (flamegraph/table, opened
      # in the browser) and terminal reporters (tree/summary/stats).
      while true; do
        read -r -p "Choose visualization type: (1) flamegraph (2) table (3) tree (4) summary (5) stats (b) back, (q) quit: " -n 1 visualization_type
        echo

        if [[ $visualization_type == "b" ]]; then
          break
        elif [[ $visualization_type == "q" ]]; then
          exit 0
        fi

        case $visualization_type in
          "1")
            # Remove any stale output before regenerating it.
            rm -f "${result_file}.memray.html"
            python3 -m memray flamegraph -o "${result_file}.memray.html" "$result_file"
            open "${result_file}.memray.html"
            ;;
          "2")
            rm -f "${result_file}.table.html"
            python3 -m memray table -o "${result_file}.table.html" "$result_file"
            open "${result_file}.table.html"
            ;;
          "3")
            python3 -m memray tree "$result_file"
            ;;
          "4")
            python3 -m memray summary "$result_file"
            ;;
          "5")
            python3 -m memray stats "$result_file"
            ;;
          *)
            echo "Invalid visualization type. Please try again."
            ;;
        esac
      done
    else
      # Time profile: render the cProfile dump as a static SVG flamegraph or
      # serve it interactively with snakeviz.
      while true; do
        read -r -p "Choose visualization type: (1) flamegraph (2) snakeviz (b) back, (q) quit: " -n 1 visualization_type
        echo

        if [[ $visualization_type == "b" ]]; then
          break
        elif [[ $visualization_type == "q" ]]; then
          exit 0
        fi

        case $visualization_type in
          "1")
            flameprof_file="${result_file}.flameprof.svg"
            # Remove any stale output before regenerating it.
            rm -f "$flameprof_file"
            python3 -m flameprof "$result_file" > "$flameprof_file"
            open "$flameprof_file"
            ;;
          "2")
            snakeviz "$result_file"
            ;;
          *)
            echo "Invalid visualization type. Please try again."
            ;;
        esac
      done
    fi

    break # Return to the beginning
  done
}
|
||||
|
||||
# Interactive viewer for previously generated profiling results.
# Arguments:
#   $1 - (optional) result file to pre-select, used by run_profile to jump
#        straight into viewing; otherwise the user picks from the .bin files
#        found in PROFILE_RESULTS_DIR.
# Globals: reads PROFILE_RESULTS_DIR; sets/clears result_file (the headed and
# headless viewers unset it to return to this selection menu).
view_profile() {

  if [ -n "$1" ]; then
    result_file="$1"
  fi
  while true; do
    if [[ -z $result_file ]]; then
      echo "Available result files:"
      result_files=("$PROFILE_RESULTS_DIR"/*.bin)
      # BUGFIX: an unmatched glob leaves the literal pattern in the array, so
      # the array is never empty — also verify the first entry really exists.
      if [[ ${#result_files[@]} -eq 0 || ! -e "${result_files[0]}" ]]; then
        echo "No result files found."
        return
      fi

      for ((i = 0; i < ${#result_files[@]}; i++)); do
        # Display basenames without the extension for a cleaner menu.
        filename="${result_files[$i]##*/}"
        filename="${filename%.*}"
        echo "$i. $filename"
      done

      read -r -p "Enter the number corresponding to the result file you want to view (b to go back, q to quit): " selection
      if [[ $selection == "b" ]]; then
        return
      elif [[ $selection == "q" ]]; then
        exit 0
      fi

      # Reject anything that is not a valid index into the list.
      if ! [[ $selection =~ ^[0-9]+$ ]] || ((selection >= ${#result_files[@]})); then
        echo "Invalid selection. Please try again."
        continue
      fi

      result_file="${result_files[$selection]}"
    fi

    # Pick the viewer appropriate for the current environment.
    if check_display; then
      view_profile_with_head "$result_file"
    else
      view_profile_headless "$result_file"
    fi
  done
}
|
||||
|
||||
# Interactively select a document and partition strategy, then run both a
# cProfile (time) and a memray (memory) profile of partitioning it, writing
# results into PROFILE_RESULTS_DIR and dropping straight into the viewer.
# Globals: reads SCRIPT_DIR, MODULE_PATH, PROFILE_RESULTS_DIR; sets result_file.
run_profile() {
  while true; do
    read -r -p "Choose an option: 1) Existing test file, (2) Custom file, (b) back, (q) quit: " -n 1 option
    echo

    if [[ $option == "b" ]]; then
      return
    elif [[ $option == "q" ]]; then
      exit 0
    fi

    if [[ $option == "1" ]]; then
      echo "Available test files:"
      test_files=("$SCRIPT_DIR/docs"/*)
      # BUGFIX: an unmatched glob leaves the literal pattern in the array, so
      # the array is never empty — also verify the first entry really exists.
      if [[ ${#test_files[@]} -eq 0 || ! -e "${test_files[0]}" ]]; then
        echo "No test files found."
        return
      fi

      for ((i = 0; i < ${#test_files[@]}; i++)); do
        echo "$i. ${test_files[$i]}"
      done

      read -r -p "Enter the number corresponding to the test file you want to run followed by return (b to go back, q to quit): " selection
      if [[ $selection == "b" ]]; then
        return
      elif [[ $selection == "q" ]]; then
        exit 0
      fi

      # Reject anything that is not a valid index into the list.
      if ! [[ $selection =~ ^[0-9]+$ ]] || ((selection >= ${#test_files[@]})); then
        echo "Invalid selection. Please try again."
        continue
      fi

      test_file="${test_files[$selection]}"
    elif [[ $option == "2" ]]; then
      read -r -p "Enter the path to the custom file: " test_file
    else
      echo "Invalid option. Please try again."
      continue
    fi

    # Delete the output files if they exist
    rm -f "$PROFILE_RESULTS_DIR/${test_file##*/}.prof"
    rm -f "$PROFILE_RESULTS_DIR/${test_file##*/}.bin"

    # Pick the strategy
    while true; do
      read -r -p "Choose a strategy: 1) auto, (2) fast, (3) hi_res, (b) back, (q) quit: " -n 1 strategy_option
      echo

      if [[ $strategy_option == "b" ]]; then
        return
      elif [[ $strategy_option == "q" ]]; then
        exit 0
      fi

      case $strategy_option in
        "1")
          strategy="auto"
          break
          ;;
        "2")
          strategy="fast"
          break
          ;;
        "3")
          strategy="hi_res"
          break
          ;;
        *)
          echo "Invalid strategy option. Please try again."
          ;;
      esac
    done

    # run_partition is invoked as a module so both profilers wrap the same
    # entry point; results are keyed by the document's basename.
    echo "Running time profile..."
    python3 -m cProfile -s cumulative -o "$PROFILE_RESULTS_DIR/${test_file##*/}.prof" -m "$MODULE_PATH.run_partition" "$test_file" "$strategy"
    echo "Running memory profile..."
    python3 -m memray run -o "$PROFILE_RESULTS_DIR/${test_file##*/}.bin" -m "$MODULE_PATH.run_partition" "$test_file" "$strategy"
    echo "Profiling completed."
    echo "Viewing results for $test_file"
    result_file=$PROFILE_RESULTS_DIR/$(basename "$test_file")
    view_profile "${result_file}.bin" # Go directly to view mode
  done
}
|
||||
|
||||
# Top-level interactive loop: choose between running a new profile (1) and
# viewing existing results (2) until the user quits.
while true; do
  # NOTE(review): "$1" is copied into mode here, but the prompt below is gated
  # on result_file (which is unset on startup), so a command-line mode argument
  # is immediately overwritten by the prompt on the first iteration — confirm
  # whether this gate should test $mode instead.
  if [[ -n "$1" ]]; then
    mode="$1"
  fi

  # After run_profile finishes, result_file is still set, so the previous mode
  # is reused without re-prompting.
  if [[ -z $result_file ]]; then
    read -r -p "Choose mode: (1) run, (2) view, (q) quit: " -n 1 mode
    echo
  fi

  if [[ $mode == "1" ]]; then
    run_profile
  elif [[ $mode == "2" ]]; then
    unset result_file # Unset the result_file variable before entering the "View" mode
    view_profile
  elif [[ $mode == "q" ]]; then
    exit 0
  else
    echo "Invalid mode. Please choose 'view', 'run', or 'quit'."
  fi
done
|
3
scripts/performance/requirements.txt
Normal file
3
scripts/performance/requirements.txt
Normal file
@ -0,0 +1,3 @@
|
||||
flameprof>=0.4
|
||||
memray>=1.7.0
|
||||
snakeviz>=2.2.0
|
17
scripts/performance/run_partition.py
Normal file
17
scripts/performance/run_partition.py
Normal file
@ -0,0 +1,17 @@
|
||||
"""Partition a single document with a given strategy.

Invoked as a module by scripts/performance/profile.sh under cProfile and
memray so both profilers measure the same partitioning entry point.

Usage: python -m scripts.performance.run_partition <file_path> <strategy>
"""
import sys

from unstructured.partition.auto import partition


if __name__ == "__main__":
    if len(sys.argv) < 3:
        # Usage errors belong on stderr so profiler output stays clean.
        print(
            "Please provide the path to the file as the first argument and the strategy as the "
            "second argument.",
            file=sys.stderr,
        )
        sys.exit(1)

    file_path = sys.argv[1]
    strategy = sys.argv[2]
    result = partition(file_path, strategy=strategy)
    # access element in the return value to make sure we got something back, otherwise error
    # NOTE(review): this assumes at least two elements; a 1-element document
    # would raise IndexError even though partitioning succeeded — confirm intent.
    result[1]
|
BIN
scripts/performance/warmup.pdf
Normal file
BIN
scripts/performance/warmup.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user