mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 12:03:15 +00:00 
			
		
		
		
	
		
			
	
	
		
			126 lines
		
	
	
		
			4.2 KiB
		
	
	
	
		
			Bash
		
	
	
	
	
	
		
		
			
		
	
	
			126 lines
		
	
	
		
			4.2 KiB
		
	
	
	
		
			Bash
		
	
	
	
	
	
| 
								 | 
							
								#!/usr/bin/env bash
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Usage: ./process-pdf-parallel-through-api.sh filename.pdf
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								set -eu -o pipefail
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								if [ $# -ne 1 ]; then
							 | 
						||
| 
								 | 
							
								  echo "Processes a single PDF through the Unstructured API by breaking it into smaller splits that are processed concurrently."
							 | 
						||
| 
								 | 
							
								  echo
							 | 
						||
| 
								 | 
							
								  echo "Usage: $0 <pdf_filename>"
							 | 
						||
| 
								 | 
							
								  echo "Please provide a PDF filename as the first argument."
							 | 
						||
| 
								 | 
							
								  echo
							 | 
						||
| 
								 | 
							
								  echo "Optionally, set the following env vars: "
							 | 
						||
| 
								 | 
							
								  echo
							 | 
						||
| 
								 | 
							
								  echo "* STRATEGY (default hi_res)"
							 | 
						||
| 
								 | 
							
								  echo "* BATCH_SIZE (default 30) as the number of parts (AKA splits) to process in parallel"
							 | 
						||
| 
								 | 
							
								  echo "* PDF_SPLIT_PAGE_SIZE (default 10) as the number of pages per split"
							 | 
						||
| 
								 | 
							
								  echo
							 | 
						||
| 
								 | 
							
								  echo "BATCH_SIZE=20 PDF_SPLIT_PAGE_SIZE=6 STRATEGY=hi_res ./process-pdf-parallel-through-api.sh example-docs/pdf/layout-parser-paper.pdf"
							 | 
						||
| 
								 | 
							
								  exit 1
							 | 
						||
| 
								 | 
							
								fi
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								ALLOWED_STRATEGIES=("hi_res" "fast" "auto")
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Validate STRATEGY environment variable if it's set
							 | 
						||
| 
								 | 
							
								if [ -n "${STRATEGY:-}" ] && [[ ! " ${ALLOWED_STRATEGIES[*]} " =~ ${STRATEGY} ]]; then
							 | 
						||
| 
								 | 
							
								  echo "Error: STRATEGY must be one of ${ALLOWED_STRATEGIES[*]}" >&2
							 | 
						||
| 
								 | 
							
								  exit 1
							 | 
						||
| 
								 | 
							
								fi
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Check if UNST_API_KEY is set
							 | 
						||
| 
								 | 
							
								if [ -z "${UNST_API_KEY}" ]; then
							 | 
						||
| 
								 | 
							
								  echo "Error: UNST_API_KEY is not set or is empty" >&2
							 | 
						||
| 
								 | 
							
								  exit 1
							 | 
						||
| 
								 | 
							
								fi
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								PDF_FILE="$1"
							 | 
						||
| 
								 | 
							
								DEFAULT_SPLIT_SIZE=10
							 | 
						||
| 
								 | 
							
								SPLIT_SIZE=${PDF_SPLIT_PAGE_SIZE:-$DEFAULT_SPLIT_SIZE}
							 | 
						||
| 
								 | 
							
								PDF_NAME=$(basename "$PDF_FILE" .pdf)
							 | 
						||
| 
								 | 
							
								DEFAULT_DIR="$HOME/tmp/pdf-splits"
							 | 
						||
| 
								 | 
							
								PDF_SPLITS_DIR="${PDF_SPLITS_DIR:-$DEFAULT_DIR}"
							 | 
						||
| 
								 | 
							
								MD5_SUM=$(md5sum "$PDF_FILE" | awk '{ print $1 }')
							 | 
						||
| 
								 | 
							
								PDF_DIR="$PDF_SPLITS_DIR/$PDF_NAME-${MD5_SUM}_split-${SPLIT_SIZE}"
							 | 
						||
| 
								 | 
							
								PDF_OUTPUT_DIR="$PDF_SPLITS_DIR/${PDF_NAME}-output-${MD5_SUM}_split-${SPLIT_SIZE}_strat-${STRATEGY}"
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Check if PDF parts directory exists
							 | 
						||
| 
								 | 
							
								if [ ! -d "$PDF_DIR" ]; then
							 | 
						||
| 
								 | 
							
								  "$SCRIPT_DIR/split-pdf.sh" "$PDF_FILE"
							 | 
						||
| 
								 | 
							
								fi
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Create output directory if it does not exist
							 | 
						||
| 
								 | 
							
								mkdir -p "$PDF_OUTPUT_DIR"
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								incomplete=0 # Flag to track incomplete processing
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Function to process a single PDF part file
							 | 
						||
| 
								 | 
							
								process_file_part() {
							 | 
						||
| 
								 | 
							
								  local file="$1"
							 | 
						||
| 
								 | 
							
								  local STARTING_PAGE_NUMBER="$2"
							 | 
						||
| 
								 | 
							
								  local OUTPUT_JSON="$3"
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  if [ -f "$OUTPUT_JSON" ]; then
							 | 
						||
| 
								 | 
							
								    echo "Skipping processing for $OUTPUT_JSON as it already exists."
							 | 
						||
| 
								 | 
							
								    return
							 | 
						||
| 
								 | 
							
								  fi
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  curl -q -X POST https://api.unstructuredapp.io/general/v0/general \
							 | 
						||
| 
								 | 
							
								    -H "unstructured-api-key: $UNST_API_KEY" \
							 | 
						||
| 
								 | 
							
								    -H 'accept: application/json' \
							 | 
						||
| 
								 | 
							
								    -H 'Content-Type: multipart/form-data' \
							 | 
						||
| 
								 | 
							
								    -F strategy="${STRATEGY:-hi_res}" \
							 | 
						||
| 
								 | 
							
								    -F 'skip_infer_table_types="[]"' \
							 | 
						||
| 
								 | 
							
								    -F starting_page_number="$STARTING_PAGE_NUMBER" \
							 | 
						||
| 
								 | 
							
								    -F files=@"$file;filename=$PDF_FILE" \
							 | 
						||
| 
								 | 
							
								    -o "$OUTPUT_JSON"
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  # Verify JSON content
							 | 
						||
| 
								 | 
							
								  if ! jq -e 'if type=="array" then all(.[]; type=="object" or length==0) else empty end' "$OUTPUT_JSON" >/dev/null; then
							 | 
						||
| 
								 | 
							
								    echo "Invalid JSON structure in $OUTPUT_JSON (contents below), deleting file."
							 | 
						||
| 
								 | 
							
								    cat "$OUTPUT_JSON"
							 | 
						||
| 
								 | 
							
								    rm "$OUTPUT_JSON"
							 | 
						||
| 
								 | 
							
								    incomplete=1
							 | 
						||
| 
								 | 
							
								  else
							 | 
						||
| 
								 | 
							
								    echo "Valid JSON output created: $OUTPUT_JSON"
							 | 
						||
| 
								 | 
							
								  fi
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Function to process a batch of files
							 | 
						||
| 
								 | 
							
								process_batch() {
							 | 
						||
| 
								 | 
							
								  for file in "$@"; do
							 | 
						||
| 
								 | 
							
								    local START_PAGE
							 | 
						||
| 
								 | 
							
								    START_PAGE=$(echo "$file" | sed -n 's/.*_pages_\([0-9]*\)_to_[0-9]*.pdf/\1/p')
							 | 
						||
| 
								 | 
							
								    local END_PAGE=
							 | 
						||
| 
								 | 
							
								    END_PAGE=$(echo "$file" | sed -n 's/.*_pages_[0-9]*_to_\([0-9]*\).pdf/\1/p')
							 | 
						||
| 
								 | 
							
								    local OUTPUT_JSON="$PDF_OUTPUT_DIR/${PDF_NAME}_pages_${START_PAGE}_to_${END_PAGE}.json"
							 | 
						||
| 
								 | 
							
								    process_file_part "$file" "$START_PAGE" "$OUTPUT_JSON" &
							 | 
						||
| 
								 | 
							
								  done
							 | 
						||
| 
								 | 
							
								  wait
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Read PDF parts into an array
							 | 
						||
| 
								 | 
							
								mapfile -t pdf_parts < <(find "$PDF_DIR" -name '*.pdf' -print)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Process PDF parts in batches of 30, by default
							 | 
						||
| 
								 | 
							
								batch_size=${BATCH_SIZE:-30}
							 | 
						||
| 
								 | 
							
								for ((i = 0; i < ${#pdf_parts[@]}; i += batch_size)); do
							 | 
						||
| 
								 | 
							
								  process_batch "${pdf_parts[@]:i:batch_size}"
							 | 
						||
| 
								 | 
							
								done
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Determine the output filename based on whether processing was incomplete
							 | 
						||
| 
								 | 
							
								if [ "$incomplete" -eq 1 ]; then
							 | 
						||
| 
								 | 
							
								  combined_output_filename="${PDF_NAME}_incomplete_combined.json"
							 | 
						||
| 
								 | 
							
								  echo "WARNING! not all json parts were successfully processed. you may rerun this script"
							 | 
						||
| 
								 | 
							
								  echo "to attempt reprocessing those (failed to process) parts."
							 | 
						||
| 
								 | 
							
								else
							 | 
						||
| 
								 | 
							
								  combined_output_filename="${PDF_NAME}_combined.json"
							 | 
						||
| 
								 | 
							
								fi
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Combine JSON outputs in numerical order
							 | 
						||
| 
								 | 
							
								find "$PDF_OUTPUT_DIR" -name '*.json' -print0 | sort -zV | xargs -0 jq -s 'add' >"$PDF_OUTPUT_DIR/$combined_output_filename"
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								echo "Processing complete. Combined JSON saved to $PDF_OUTPUT_DIR/$combined_output_filename"
							 |