#!/usr/bin/env bash

# Post a document (local file or http(s) URL) to the Unstructured partition
# API with curl and save the JSON response under $TMP_OUTPUTS_DIR.
# Optionally converts the result to HTML (--write-html) and/or uploads the
# JSON to S3 (--s3).
#
# Environment variables read:
#   UNST_API_KEY                  API key (alternative to --api-key)
#   UNST_API_ENDPOINT             endpoint used when --freemium is not given
#   UNST_SCRIPT_DOWNLOADS_DIR     where URL inputs are downloaded
#   UNST_SCRIPT_JSON_OUTPUTS_DIR  where .json (and .html) results are written
#   UNST_S3_JSON_OUTPUT_URI       s3://bucket/path/ prefix used by --s3
#   UNST_S3_JSON_OUTPUT_REGION    region used to build the https link for --s3
#   UNST_WRITE_HTML               "true" behaves like --write-html
#   UNST_AUTO_OPEN_HTML           "true" behaves like --open-html
#   IMAGE_BLOCK_TYPES             element types sent with --images
#
# External tools used: curl, jq; python3 for non-VLM HTML conversion;
# aws and sha256sum/shasum for --s3; pbcopy/open on macOS.
#
# TODO's
# * ability to set file type so that is not inferred by the unstructured api service
#   e.g. "-F 'files=@foo.pdf;type=application/pdf'
#

set -e

# shellcheck disable=SC2016
USAGE_MESSAGE="Usage: $0 [options] <file>"'

Options:
  --api-key KEY   Specify the API key for authentication. Set the env var $UNST_API_KEY to skip providing this option.
  --freemium      Use the free API rather than the paid API
  --hi-res        hi_res strategy: Enable high-resolution processing, with layout segmentation and OCR
  --fast          fast strategy: No OCR, just extract embedded text
  --ocr-only      ocr_only strategy: Perform OCR (Optical Character Recognition) only. No layout segmentation.
  --vlm           vlm strategy: Use Vision Language Model for processing
  --vlm-provider  Specify the VLM model provider
                  (see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy)
  --vlm-model     Specify the VLM model when using
                  (see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy)
  --tables        Enable table extraction: tables are represented as html in metadata
  --images        Include base64images in json
  --coordinates   Include coordinates in the output
  --trace         Enable trace logging for debugging, useful to cut and paste the executed curl call
  --verbose       Enable verbose logging including printing first 8 elements to stdout
  --s3            Write the resulting output to s3 (like a pastebin)
  --write-html    Convert JSON output to HTML. Set the env var $UNST_WRITE_HTML to skip providing this option.
  --open-html     Automatically open HTML output in browser (macOS only) if --write-html.
                  Set the env var UNST_AUTO_OPEN_HTML=true to skip providing this option.
  --help          Display this help and exit.

Arguments:
  <file>          File to send to the API.

If running against an API instance other than hosted Unstructured paid API (or --freemium),
set the environment variable UNST_API_ENDPOINT.

The script requires a <file>, the document to post to the Unstructured API.
The .json result is written to ~/tmp/unst-outputs/ -- this path is echoed and copied to your clipboard.
'

if [ "$#" -eq 0 ]; then
  echo "$USAGE_MESSAGE"
  exit 1
fi

IMAGE_BLOCK_TYPES=${IMAGE_BLOCK_TYPES:-'"image", "table"'}
API_KEY=${UNST_API_KEY:-""}
TMP_DOWNLOADS_DIR=${UNST_SCRIPT_DOWNLOADS_DIR:-"$HOME/tmp/unst-downloads"}
TMP_OUTPUTS_DIR=${UNST_SCRIPT_JSON_OUTPUTS_DIR:-"$HOME/tmp/unst-outputs"}
# only applicable if writing .json output files to S3 when using --s3, e.g. s3://bucket-name/path/
S3_URI_PREFIX=${UNST_S3_JSON_OUTPUT_URI:-""}
# e.g. us-east-2, used to provide http links for above location
S3_REGION=${UNST_S3_JSON_OUTPUT_REGION:-""}

mkdir -p "$TMP_DOWNLOADS_DIR"
mkdir -p "$TMP_OUTPUTS_DIR"

# Copy all arguments, joined into one string, to the system clipboard.
# Only macOS (pbcopy) is supported; on other systems this is a no-op.
copy_to_clipboard() {
  if [ "$(uname)" == "Darwin" ]; then
    # Join all arguments into a single string and copy to clipboard
    echo "$*" | pbcopy
    echo "copied to clipboard!"
  fi
  # TODO: add clipboard support for other OS's
}

# Abort unless an option that takes a value was actually given one.
# Arguments: $1 - option name (for the message), $2 - candidate value
# A candidate that is empty or starts with "-" counts as missing.
require_option_value() {
  if [ -z "$2" ] || [ "${2:0:1}" == "-" ]; then
    echo "Error: Argument for $1 is missing" >&2
    exit 1
  fi
}

HI_RES=false
FAST=false
OCR_ONLY=false
VLM=false
STRATEGY=""
VERBOSE=false
TRACE=false
COORDINATES=false
FREEMIUM=false
TABLES=true
IMAGES=false
S3=""
WRITE_HTML=${UNST_WRITE_HTML:-false}
OPEN_HTML=${UNST_AUTO_OPEN_HTML:-false}
VLM_PROVIDER=""
VLM_MODEL=""
# Initialised explicitly so an INPUT inherited from the environment cannot
# stand in for a missing command-line argument.
INPUT=""

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --hi-res)
      HI_RES=true
      shift
      ;;
    --fast)
      FAST=true
      shift
      ;;
    --ocr-only)
      OCR_ONLY=true
      shift
      ;;
    --vlm)
      VLM=true
      shift
      ;;
    --vlm-provider)
      require_option_value "$1" "$2"
      VLM_PROVIDER=$2
      shift 2
      ;;
    --vlm-model)
      require_option_value "$1" "$2"
      VLM_MODEL=$2
      shift 2
      ;;
    --trace)
      TRACE=true
      shift
      ;;
    --verbose)
      VERBOSE=true
      shift
      ;;
    --s3)
      S3=true
      shift
      ;;
    --write-html)
      WRITE_HTML=true
      shift
      ;;
    --open-html)
      OPEN_HTML=true
      shift
      ;;
    --tables)
      TABLES=true
      shift
      ;;
    --images)
      IMAGES=true
      shift
      ;;
    --coordinates)
      COORDINATES=true
      shift
      ;;
    --freemium)
      FREEMIUM=true
      shift
      ;;
    --api-key)
      require_option_value "$1" "$2"
      API_KEY=$2
      shift 2
      ;;
    --help)
      echo "$USAGE_MESSAGE"
      exit 0
      ;;
    *)
      # Anything unrecognised is taken as the input; the last one wins.
      INPUT="$1"
      shift
      ;;
  esac
done

if [ -z "$INPUT" ]; then
  echo "Error: File or URL argument is missing." >&2
  exit 1
fi

# Check for strategy conflicts after all arguments are processed
STRATEGY_COUNT=0
$HI_RES && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
$FAST && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
$OCR_ONLY && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
$VLM && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))

if [ "$STRATEGY_COUNT" -gt 1 ]; then
  echo "Error: Only one strategy option (--hi-res, --fast, --ocr-only, --vlm) can be specified at a time." >&2
  exit 1
fi

# Check if vlm-provider or vlm-model are provided without --vlm
if { [ -n "$VLM_PROVIDER" ] || [ -n "$VLM_MODEL" ]; } && ! $VLM; then
  echo "Error: --vlm-provider or --vlm-model can only be used with --vlm strategy." >&2
  exit 1
fi

if $TRACE; then
  # Note: the trace includes the full curl command line, API key header included.
  set -x
fi

FILENAME=$(basename "$INPUT")
if [[ "$INPUT" =~ ^https?:// ]]; then
  # Remote input: download it first, then post the local copy.
  INPUT_FILEPATH=${TMP_DOWNLOADS_DIR}/${FILENAME}
  if $VERBOSE; then echo "Downloading $FILENAME $INPUT to ${INPUT_FILEPATH}"; fi
  curl -q -o "${INPUT_FILEPATH}" "$INPUT"
  echo "Downloaded file to ${INPUT_FILEPATH}"
else
  INPUT_FILEPATH=${INPUT}
  # Fail early with a clear message instead of a curl read error.
  if [ ! -r "$INPUT_FILEPATH" ] || [ -d "$INPUT_FILEPATH" ]; then
    echo "Error: Cannot read input file: ${INPUT_FILEPATH}" >&2
    exit 1
  fi
fi

if $FREEMIUM; then
  API_ENDPOINT="https://api.unstructured.io/general/v0/general"
else
  API_ENDPOINT=${UNST_API_ENDPOINT:-"https://api.unstructuredapp.io/general/v0/general"}
fi

# Translate the chosen strategy into curl form fields (CURL_STRATEGY) and a
# filename suffix (STRATEGY) that keeps outputs of different runs apart.
if $HI_RES; then
  if $VERBOSE; then echo "Sending API request with hi_res strategy"; fi
  STRATEGY="-hi-res"
  CURL_STRATEGY=(-F "strategy=hi_res")
elif $FAST; then
  if $VERBOSE; then echo "Sending API request with fast strategy"; fi
  STRATEGY="-fast"
  CURL_STRATEGY=(-F "strategy=fast")
elif $OCR_ONLY; then
  if $VERBOSE; then echo "Sending API request with ocr_only strategy"; fi
  STRATEGY="-ocr-only"
  CURL_STRATEGY=(-F "strategy=ocr_only")
elif $VLM; then
  if $VERBOSE; then echo "Sending API request with vlm strategy"; fi
  STRATEGY="-vlm"
  # Add provider and model to filename if specified
  if [ -n "$VLM_PROVIDER" ] && [ -n "$VLM_MODEL" ]; then
    STRATEGY="-vlm-${VLM_PROVIDER}-${VLM_MODEL}"
  elif [ -n "$VLM_PROVIDER" ]; then
    STRATEGY="-vlm-${VLM_PROVIDER}"
  elif [ -n "$VLM_MODEL" ]; then
    STRATEGY="-vlm-model-${VLM_MODEL}"
  fi
  CURL_STRATEGY=(-F "strategy=vlm")
  if [ -n "$VLM_PROVIDER" ]; then
    CURL_STRATEGY+=(-F "vlm_model_provider=$VLM_PROVIDER")
  fi
  if [ -n "$VLM_MODEL" ]; then
    CURL_STRATEGY+=(-F "vlm_model=$VLM_MODEL")
  fi
else
  if $VERBOSE; then echo "Sending API request WITHOUT a strategy"; fi
  CURL_STRATEGY=()
fi
JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json

# Optional request pieces are kept in arrays so that an unused option
# contributes no arguments at all to the curl command line.
CURL_API_KEY=()
if [[ -n "$API_KEY" ]]; then
  CURL_API_KEY=(-H "unstructured-api-key: $API_KEY")
fi
CURL_COORDINATES=()
if [[ "$COORDINATES" == "true" ]]; then
  CURL_COORDINATES=(-F "coordinates=true")
fi
CURL_TABLES=()
if [[ "$TABLES" == "true" ]]; then
  # NOTE(review): the single quotes are sent literally as part of the field
  # value ('[]'); left unchanged -- confirm this is what the API expects.
  CURL_TABLES=(-F "skip_infer_table_types='[]'")
fi
CURL_IMAGES=()
if [[ "$IMAGES" == "true" ]]; then
  CURL_IMAGES=(-F "extract_image_block_types=[$IMAGE_BLOCK_TYPES]")
fi

curl -q -X 'POST' \
  "$API_ENDPOINT" \
  "${CURL_API_KEY[@]}" -H 'accept: application/json' \
  -H 'Content-Type: multipart/form-data' \
  "${CURL_STRATEGY[@]}" "${CURL_COORDINATES[@]}" "${CURL_TABLES[@]}" "${CURL_IMAGES[@]}" -F "files=@${INPUT_FILEPATH}" \
  -o "${JSON_OUTPUT_FILEPATH}"

# A response shorter than 10 bytes is treated as "no elements returned".
JSON_FILE_SIZE=$(wc -c <"${JSON_OUTPUT_FILEPATH}")
if [ "$JSON_FILE_SIZE" -lt 10 ]; then
  echo "Error: JSON file ${JSON_OUTPUT_FILEPATH} has no elements." >&2
  cat "$JSON_OUTPUT_FILEPATH" >&2
  exit 1
else
  if $VERBOSE; then
    echo "first 8 elements: "
    jq '.[0:8]' "${JSON_OUTPUT_FILEPATH}"
  fi
  # shellcheck disable=SC2046
  echo "total number of elements: " $(jq 'length' "${JSON_OUTPUT_FILEPATH}")
fi
echo "JSON Output file: ${JSON_OUTPUT_FILEPATH}"

# Convert JSON to HTML if requested
if [ "$WRITE_HTML" = true ]; then
  HTML_OUTPUT_FILEPATH=${JSON_OUTPUT_FILEPATH%.json}.html

  if $VLM; then
    # VLM output has all metadata.text_as_html fields defined, so
    # create HTML directly from the metadata.text_as_html fields,
    # wrapped in a minimal HTML document.
    {
      echo "<!DOCTYPE html>"
      echo "<html>"
      echo "<head>"
      echo "  <meta charset=\"UTF-8\">"
      echo "  <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">"
      echo "  <title>${FILENAME}</title>"
      echo "</head>"
      echo "<body>"
      jq -r 'map(.metadata.text_as_html) | join("\n")' "${JSON_OUTPUT_FILEPATH}"
      echo "</body>"
      echo "</html>"
    } >"${HTML_OUTPUT_FILEPATH}"
    echo "HTML written directly from metadata.text_as_html fields to: ${HTML_OUTPUT_FILEPATH}"
  else
    # most elements will not have metadata.text_as_html defined (by design only Table elements do),
    # so use the unstructured library's python script for the conversion.
    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    PYTHONPATH="${SCRIPT_DIR}/../.." python3 "${SCRIPT_DIR}/../html/elements_json_to_html.py" "${JSON_OUTPUT_FILEPATH}" --outdir "${TMP_OUTPUTS_DIR}"
    echo "HTML written using Python script to: ${HTML_OUTPUT_FILEPATH}"
  fi

  # Open HTML file in browser if requested and on macOS
  if [ "$OPEN_HTML" = true ] && [ "$(uname)" == "Darwin" ]; then
    open "${HTML_OUTPUT_FILEPATH}"
  fi
fi

# write .json output to s3 location
if [ -n "$S3" ]; then
  # The local JSON file has already been written at this point; a missing S3
  # setting only skips the upload and the script still exits 0.
  if [ -z "$S3_URI_PREFIX" ]; then
    echo
    echo "You must define your s3 output location in the env var UNST_S3_JSON_OUTPUT_URI"
    echo "e.g. UNST_S3_JSON_OUTPUT_URI='s3://bucket/path/'"
    exit 0
  elif [ -z "$S3_REGION" ]; then
    echo
    echo "You must define your s3 region in the env var UNST_S3_JSON_OUTPUT_REGION"
    echo "e.g. UNST_S3_JSON_OUTPUT_REGION=us-west-2"
    exit 0
  fi

  # sha256sum is not available everywhere (e.g. stock macOS); shasum -a 256
  # prints the same hex digest first, so the 7-character prefix is identical.
  if command -v sha256sum >/dev/null 2>&1; then
    SHA_SUM_PREFIX=$(sha256sum "${JSON_OUTPUT_FILEPATH}" | cut -c1-7)
  else
    SHA_SUM_PREFIX=$(shasum -a 256 "${JSON_OUTPUT_FILEPATH}" | cut -c1-7)
  fi
  CURRENT_TIMESTAMP=$(date -u +%s)
  # Epoch seconds for 2023-04-27 00:00:00 UTC. Kept as a constant because
  # `date -d "<string>"` is GNU-specific and fails with BSD/macOS date.
  APR27_2023_TIMESTAMP=1682553600
  TENS_OF_SECS_SINCE_APR27_2023=$(((CURRENT_TIMESTAMP - APR27_2023_TIMESTAMP) / 10))
  # Normalise the prefix to exactly one trailing slash so a value given
  # without it still yields a well-formed key.
  S3_UPLOAD_PATH="${S3_URI_PREFIX%/}/${TENS_OF_SECS_SINCE_APR27_2023}-${SHA_SUM_PREFIX}${STRATEGY}/${FILENAME}.json"
  if $VERBOSE; then echo "Uploading JSON to S3"; fi
  aws s3 cp "${JSON_OUTPUT_FILEPATH}" "$S3_UPLOAD_PATH"

  # s3://bucket/key... -> field 3 is the bucket, fields 4.. are the key
  BUCKET=$(echo "$S3_UPLOAD_PATH" | cut -d/ -f3)
  KEY=$(echo "$S3_UPLOAD_PATH" | cut -d/ -f4-)
  HTTPS_URL="https://${BUCKET}.s3.${S3_REGION}.amazonaws.com/${KEY}"

  echo "s3 location: ${S3_UPLOAD_PATH}"
  echo "link: $HTTPS_URL"
  copy_to_clipboard "$HTTPS_URL"
else
  copy_to_clipboard "${JSON_OUTPUT_FILEPATH}"
fi