From 238f985ddaaa04952ac089c6e94e592de2bda9b6 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Mon, 27 Jan 2025 16:09:13 -0800 Subject: [PATCH] feat: add --images support to unstructured-get-json.sh (#3888) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For example, you can now run: ```bash # extracts base64-encoded image data for `Table` and `Image` elements $ unstructured-get-json.sh --trace --verbose --images /t/docs/Captur-1317-5_ENG-p5.pdf # also extracts `Title` elements (see screenshot) $ IMAGE_BLOCK_TYPES='"title","table","image"' unstructured-get-json.sh --trace --verbose --images /t/docs/Captur-1317-5_ENG-p5.pdf ``` Testing showed that "narrativetext" does not work as a block type, probably because of the camel casing of `NarrativeText` 😬 ![image](https://github.com/user-attachments/assets/e6414a57-81e1-4560-b1b2-dce3b1c2c804) --- scripts/user/unstructured-get-json.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/scripts/user/unstructured-get-json.sh b/scripts/user/unstructured-get-json.sh index bd2221d51..74ea03139 100755 --- a/scripts/user/unstructured-get-json.sh +++ b/scripts/user/unstructured-get-json.sh @@ -17,6 +17,7 @@ Options: --fast fast strategy: No OCR, just extract embedded text --ocr-only ocr_only strategy: Perform OCR (Optical Character Recognition) only. No layout segmentation. 
--tables Enable table extraction: tables are represented as html in metadata + --images Include base64images in json --coordinates Include coordinates in the output --trace Enable trace logging for debugging, useful to cut and paste the executed curl call --verbose Enable verbose logging including printing first 8 elements to stdout @@ -39,6 +40,7 @@ if [ "$#" -eq 0 ]; then exit 1 fi +IMAGE_BLOCK_TYPES=${IMAGE_BLOCK_TYPES:-'"image", "table"'} API_KEY=${UNST_API_KEY:-""} TMP_DOWNLOADS_DIR="$HOME/tmp/unst-downloads" TMP_OUTPUTS_DIR="$HOME/tmp/unst-outputs" @@ -68,6 +70,7 @@ TRACE=false COORDINATES=false FREEMIUM=false TABLES=true +IMAGES=false S3="" while [[ "$#" -gt 0 ]]; do @@ -100,6 +103,10 @@ while [[ "$#" -gt 0 ]]; do TABLES=true shift ;; + --images) + IMAGES=true + shift + ;; --coordinates) COORDINATES=true shift @@ -180,12 +187,14 @@ CURL_COORDINATES=() [[ "$COORDINATES" == "true" ]] && CURL_COORDINATES=(-F "coordinates=true") CURL_TABLES=() [[ "$TABLES" == "true" ]] && CURL_TABLES=(-F "skip_infer_table_types='[]'") +CURL_IMAGES=() +[[ "$IMAGES" == "true" ]] && CURL_IMAGES=(-F "extract_image_block_types=[$IMAGE_BLOCK_TYPES]") curl -q -X 'POST' \ "$API_ENDPOINT" \ "${CURL_API_KEY[@]}" -H 'accept: application/json' \ -H 'Content-Type: multipart/form-data' \ - "${CURL_STRATEGY[@]}" "${CURL_COORDINATES[@]}" "${CURL_TABLES[@]}" -F "files=@${INPUT_FILEPATH}" \ + "${CURL_STRATEGY[@]}" "${CURL_COORDINATES[@]}" "${CURL_TABLES[@]}" "${CURL_IMAGES[@]}" -F "files=@${INPUT_FILEPATH}" \ -o "${JSON_OUTPUT_FILEPATH}" JSON_FILE_SIZE=$(wc -c <"${JSON_OUTPUT_FILEPATH}")