unstructured/scripts/user/split-pdf.sh

#!/usr/bin/env bash

# Split a PDF into consecutive chunks of at most N pages each, using qpdf.
#
# Usage: ./split-pdf.sh filename.pdf
#
# Environment variables:
#   PDF_SPLIT_PAGE_SIZE  Pages per chunk; positive integer (default: 5).
#   PDF_SPLITS_DIR       Parent directory for the output directory
#                        (default: $HOME/tmp/pdf-splits).
#
# Chunks are written to
#   $PDF_SPLITS_DIR/<name>-<md5 of input>_split-<size>/
# as <name>_pages_<start>_to_<end>.pdf, where <name> is the input file name
# without its directory and without a trailing ".pdf".
#
# Progress messages go to stdout; errors go to stderr with a non-zero exit.
# Requires the qpdf and md5sum command line utilities.

# pipefail matters here: without it a failing md5sum is masked by awk and the
# output directory would silently be named with an empty checksum.
set -euo pipefail

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

PDF_FILE="${1:-}"
DEFAULT_SPLIT_SIZE=5
SPLIT_SIZE=${PDF_SPLIT_PAGE_SIZE:-$DEFAULT_SPLIT_SIZE}

[[ -n "$PDF_FILE" ]] || die "Usage: $0 filename.pdf"
[[ -f "$PDF_FILE" && -r "$PDF_FILE" ]] \
  || die "Error: cannot read PDF file: $PDF_FILE"

for tool in qpdf md5sum; do
  command -v "$tool" >/dev/null 2>&1 \
    || die "Error: required command not found: $tool"
done

# Validate that SPLIT_SIZE is a positive integer. The 10# prefix forces
# base 10 so values with leading zeros (e.g. "08") are not parsed as octal.
if ! [[ "$SPLIT_SIZE" =~ ^[0-9]+$ ]] || ((10#$SPLIT_SIZE < 1)); then
  die "Error: PDF_SPLIT_PAGE_SIZE must be a positive integer."
fi
PAGES_PER_SPLIT=$((10#$SPLIT_SIZE))

DEFAULT_DIR="${HOME:-}/tmp/pdf-splits"
PDF_SPLITS_DIR="${PDF_SPLITS_DIR:-$DEFAULT_DIR}"
PDF_NAME=$(basename -- "$PDF_FILE" .pdf)
# Hash via stdin so unusual characters in the file name cannot alter the
# format of md5sum's output line.
MD5_SUM=$(md5sum <"$PDF_FILE" | awk '{ print $1 }')
[[ -n "$MD5_SUM" ]] || die "Error: could not compute MD5 of $PDF_FILE"
# The directory name uses SPLIT_SIZE exactly as given, so the path is the
# same one earlier versions of this script produced for the same setting.
PDF_DIR="$PDF_SPLITS_DIR/$PDF_NAME-${MD5_SUM}_split-${SPLIT_SIZE}"

# Total number of pages (checked before creating anything on disk)
TOTAL_PAGES=$(qpdf --show-npages "$PDF_FILE")
[[ "$TOTAL_PAGES" =~ ^[0-9]+$ ]] \
  || die "Error: could not determine page count of $PDF_FILE"

# Create directory if it does not exist
mkdir -p -- "$PDF_DIR"

# Split PDF into $PAGES_PER_SPLIT-page chunks; the last chunk may be shorter.
START_PAGE=1
while ((START_PAGE <= TOTAL_PAGES)); do
  END_PAGE=$((START_PAGE + PAGES_PER_SPLIT - 1))
  if ((END_PAGE > TOTAL_PAGES)); then
    END_PAGE=$TOTAL_PAGES
  fi
  OUTPUT_FILE="$PDF_DIR/${PDF_NAME}_pages_${START_PAGE}_to_${END_PAGE}.pdf"
  # "." after --pages refers back to the primary input file.
  qpdf "$PDF_FILE" --pages . "$START_PAGE"-"$END_PAGE" -- "$OUTPUT_FILE"
  printf '%s\n' "Created $OUTPUT_FILE"
  START_PAGE=$((END_PAGE + 1))
done

printf '%s\n' "All parts have been saved to $PDF_DIR"
feat: utility script to process large PDFs through the API by script (#3591) Adds the bash script `process-pdf-parallel-through-api.sh` that allows splitting up a PDF into smaller parts (splits) to be processed through the API concurrently, and is re-entrant. If any of the splits fail to process, one can attempt reprocessing those split(s) by rerunning the script. Note: requires the `qpdf` command line utility. The command line output below shows the scenario where just one split had to be reprocessed through the API to create the final `layout-parser-paper_combined.json` output. ``` $ BATCH_SIZE=20 PDF_SPLIT_PAGE_SIZE=6 STRATEGY=hi_res \ ./scripts/user/process-pdf-parallel-through-api.sh example-docs/pdf/layout-parser-paper.pdf > % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0 Skipping processing for /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-pars\ er-paper_pages_1_to_6.json as it already exists. Skipping processing for /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_7_to_12.json as it already exists. Valid JSON output created: /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_13_to_16.json Processing complete. Combined JSON saved to /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_combined.json ``` Bonus change to `unstructured-get-json.sh` to point to the standard hosted Serverless API, but allow using the Free API with --freemium. 2024-09-10 11:40:35 -07:00			`#!/usr/bin/env bash`

			`# Usage: ./split_pdf.sh filename.pdf`

			`set -e`

			`PDF_FILE="$1"`
			`DEFAULT_SPLIT_SIZE=5`
			`SPLIT_SIZE=${PDF_SPLIT_PAGE_SIZE:-$DEFAULT_SPLIT_SIZE}`

			`# Validate that SPLIT_SIZE is an integer`
			`if ! [[ "$SPLIT_SIZE" =~ ^[0-9]+$ ]]; then`
			`echo "Error: PDF_SPLIT_PAGE_SIZE must be an integer."`
			`exit 1`
			`fi`

			`DEFAULT_DIR="$HOME/tmp/pdf-splits"`
			`PDF_SPLITS_DIR="${PDF_SPLITS_DIR:-$DEFAULT_DIR}"`
			`PDF_NAME=$(basename "$PDF_FILE" .pdf)`
			`MD5_SUM=$(md5sum "$PDF_FILE" \| awk '{ print $1 }')`
			`PDF_DIR="$PDF_SPLITS_DIR/$PDF_NAME-${MD5_SUM}_split-${SPLIT_SIZE}"`

			`# Create directory if it does not exist`
			`mkdir -p "$PDF_DIR"`

			`# Total number of pages`
			`TOTAL_PAGES=$(qpdf --show-npages "$PDF_FILE")`

			`# Split PDF into $SPLIT_SIZE-page chunks`
			`START_PAGE=1`
			`while [ "$START_PAGE" -le "$TOTAL_PAGES" ]; do`
			`END_PAGE=$((START_PAGE + SPLIT_SIZE - 1))`
			`if [ "$END_PAGE" -gt "$TOTAL_PAGES" ]; then`
			`END_PAGE=$TOTAL_PAGES`
			`fi`
			`OUTPUT_FILE="$PDF_DIR/${PDF_NAME}_pages_${START_PAGE}_to_${END_PAGE}.pdf"`
			`qpdf "$PDF_FILE" --pages . "$START_PAGE"-"$END_PAGE" -- "$OUTPUT_FILE"`
			`echo "Created $OUTPUT_FILE"`
			`START_PAGE=$((END_PAGE + 1))`
			`done`

			`echo "All parts have been saved to $PDF_DIR"`