unstructured/scripts/user/process-pdf-parallel-through-api.sh

126 lines
4.2 KiB
Bash
Raw Normal View History

feat: utility script to process large PDFs through the API by script (#3591) Adds the bash script `process-pdf-parallel-through-api.sh` that allows splitting up a PDF into smaller parts (splits) to be processed through the API concurrently, and is re-entrant. If any of the splits fail to process, one can attempt reprocessing those split(s) by rerunning the script. Note: requires the `qpdf` command line utility. The below command line output shows the scenario where just one split had to be reprocessed through the API to create the final `layout-parser-paper_combined.json` output. ``` $ BATCH_SIZE=20 PDF_SPLIT_PAGE_SIZE=6 STRATEGY=hi_res \ ./scripts/user/process-pdf-parallel-through-api.sh example-docs/pdf/layout-parser-paper.pdf > % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0 Skipping processing for /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_1_to_6.json as it already exists. Skipping processing for /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_7_to_12.json as it already exists. Valid JSON output created: /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_13_to_16.json Processing complete. Combined JSON saved to /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_combined.json ``` Bonus change to `unstructured-get-json.sh` to point to the standard hosted Serverless API, but allow using the Free API with --freemium.
2024-09-10 11:40:35 -07:00
#!/usr/bin/env bash
# Usage: ./process-pdf-parallel-through-api.sh filename.pdf
set -eu -o pipefail

# Require exactly one argument (the PDF to process). The usage text is a
# diagnostic, so send it to stderr rather than stdout and exit non-zero.
if [ $# -ne 1 ]; then
  {
    echo "Processes a single PDF through the Unstructured API by breaking it into smaller splits that are processed concurrently."
    echo
    echo "Usage: $0 <pdf_filename>"
    echo "Please provide a PDF filename as the first argument."
    echo
    echo "Optionally, set the following env vars: "
    echo
    echo "* STRATEGY (default hi_res)"
    echo "* BATCH_SIZE (default 30) as the number of parts (AKA splits) to process in parallel"
    echo "* PDF_SPLIT_PAGE_SIZE (default 10) as the number of pages per split"
    echo
    echo "BATCH_SIZE=20 PDF_SPLIT_PAGE_SIZE=6 STRATEGY=hi_res ./process-pdf-parallel-through-api.sh example-docs/pdf/layout-parser-paper.pdf"
  } >&2
  exit 1
fi
ALLOWED_STRATEGIES=("hi_res" "fast" "auto")

# Return 0 iff $1 is exactly one of ALLOWED_STRATEGIES.
# The previous regex test matched substrings (e.g. STRATEGY=res slipped
# through because "res" occurs inside " hi_res fast auto "), so compare
# against each allowed value whole-word instead.
is_allowed_strategy() {
  local candidate="$1" allowed
  for allowed in "${ALLOWED_STRATEGIES[@]}"; do
    if [ "$candidate" = "$allowed" ]; then
      return 0
    fi
  done
  return 1
}

# Validate STRATEGY environment variable if it's set
if [ -n "${STRATEGY:-}" ] && ! is_allowed_strategy "${STRATEGY}"; then
  echo "Error: STRATEGY must be one of ${ALLOWED_STRATEGIES[*]}" >&2
  exit 1
fi
# Check if UNST_API_KEY is set. Use the ${VAR:-} form so that under
# `set -u` an unset key produces this friendly error message instead of
# an "unbound variable" crash before the check can run.
if [ -z "${UNST_API_KEY:-}" ]; then
  echo "Error: UNST_API_KEY is not set or is empty" >&2
  exit 1
fi
PDF_FILE="$1"
DEFAULT_SPLIT_SIZE=10
SPLIT_SIZE=${PDF_SPLIT_PAGE_SIZE:-$DEFAULT_SPLIT_SIZE}
# Apply the documented default (hi_res) here, once: previously an unset
# STRATEGY tripped `set -u` when building PDF_OUTPUT_DIR below, because
# the default was only applied at curl time.
STRATEGY="${STRATEGY:-hi_res}"
PDF_NAME=$(basename "$PDF_FILE" .pdf)
DEFAULT_DIR="$HOME/tmp/pdf-splits"
PDF_SPLITS_DIR="${PDF_SPLITS_DIR:-$DEFAULT_DIR}"
# Key the split and output directories on the file's checksum so the
# script is re-entrant per distinct input PDF.
MD5_SUM=$(md5sum "$PDF_FILE" | awk '{ print $1 }')
PDF_DIR="$PDF_SPLITS_DIR/$PDF_NAME-${MD5_SUM}_split-${SPLIT_SIZE}"
PDF_OUTPUT_DIR="$PDF_SPLITS_DIR/${PDF_NAME}-output-${MD5_SUM}_split-${SPLIT_SIZE}_strat-${STRATEGY}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Split the PDF into parts (companion script; requires qpdf) unless the
# splits from an earlier run already exist.
if [ ! -d "$PDF_DIR" ]; then
  "$SCRIPT_DIR/split-pdf.sh" "$PDF_FILE"
fi
# Create output directory if it does not exist
mkdir -p "$PDF_OUTPUT_DIR"
incomplete=0 # Flag to track incomplete processing
# Process a single PDF part file through the API.
#   $1 - path to the PDF split
#   $2 - absolute page number of the split's first page
#   $3 - destination JSON path
# Skips work when the output already exists (this is what makes the
# script re-entrant). Returns non-zero when the response is not a valid
# JSON element array, after deleting the bad output so a rerun retries
# this split. The return status matters: this function runs as a
# background job, so assignments made here never reach the parent shell
# and the caller must check the job's exit status via `wait` instead.
process_file_part() {
  local file="$1"
  local starting_page_number="$2"
  local output_json="$3"
  if [ -f "$output_json" ]; then
    echo "Skipping processing for $output_json as it already exists."
    return 0
  fi
  curl -q -X POST https://api.unstructuredapp.io/general/v0/general \
    -H "unstructured-api-key: $UNST_API_KEY" \
    -H 'accept: application/json' \
    -H 'Content-Type: multipart/form-data' \
    -F strategy="${STRATEGY:-hi_res}" \
    -F 'skip_infer_table_types="[]"' \
    -F starting_page_number="$starting_page_number" \
    -F files=@"$file;filename=$PDF_FILE" \
    -o "$output_json"
  # Verify JSON content: the API may return an error object or non-JSON
  # on failure; accept only an array of objects (or an empty array).
  if ! jq -e 'if type=="array" then all(.[]; type=="object" or length==0) else empty end' "$output_json" >/dev/null; then
    echo "Invalid JSON structure in $output_json (contents below), deleting file."
    cat "$output_json"
    rm "$output_json"
    incomplete=1 # no effect in a background job; kept for foreground callers
    return 1
  fi
  echo "Valid JSON output created: $output_json"
}
# Process a batch of PDF part files concurrently, one background API
# request per part, then wait for all of them. A failed part marks the
# run as incomplete so the combined output is named accordingly and the
# part can be retried by rerunning the script.
process_batch() {
  local part pids=()
  for part in "$@"; do
    local start_page end_page output_json
    # The page range is encoded in the split filename:
    # <name>_pages_<start>_to_<end>.pdf
    start_page=$(echo "$part" | sed -n 's/.*_pages_\([0-9]*\)_to_[0-9]*\.pdf/\1/p')
    end_page=$(echo "$part" | sed -n 's/.*_pages_[0-9]*_to_\([0-9]*\)\.pdf/\1/p')
    output_json="$PDF_OUTPUT_DIR/${PDF_NAME}_pages_${start_page}_to_${end_page}.json"
    process_file_part "$part" "$start_page" "$output_json" &
    pids+=("$!")
  done
  # A bare `wait` always returns 0 and assignments made inside the
  # background subshells (e.g. incomplete=1) never propagate back here,
  # so collect each job's exit status individually to detect failures.
  local pid
  for pid in "${pids[@]}"; do
    wait "$pid" || incomplete=1
  done
}
# Read PDF parts into an array, NUL-delimited so any filename is safe.
mapfile -d '' -t pdf_parts < <(find "$PDF_DIR" -name '*.pdf' -print0)
# Process PDF parts in batches of 30, by default
batch_size=${BATCH_SIZE:-30}
for ((i = 0; i < ${#pdf_parts[@]}; i += batch_size)); do
  process_batch "${pdf_parts[@]:i:batch_size}"
done
# Determine the output filename based on whether processing was incomplete
if [ "${incomplete:-0}" -eq 1 ]; then
  combined_output_filename="${PDF_NAME}_incomplete_combined.json"
  echo "WARNING! not all json parts were successfully processed. you may rerun this script"
  echo "to attempt reprocessing those (failed to process) parts."
else
  combined_output_filename="${PDF_NAME}_combined.json"
fi
# Combine the per-split JSON arrays in page (version-sort) order.
# Exclude any previously combined outputs so a rerun does not fold an
# earlier *_combined.json / *_incomplete_combined.json back into the new
# result, and skip jq entirely (-r) when there is nothing to combine.
find "$PDF_OUTPUT_DIR" -name '*.json' ! -name '*_combined.json' -print0 |
  sort -zV |
  xargs -0 -r jq -s 'add' >"$PDF_OUTPUT_DIR/$combined_output_filename"
echo "Processing complete. Combined JSON saved to $PDF_OUTPUT_DIR/$combined_output_filename"