mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 03:53:45 +00:00 
			
		
		
		
	Given the tendency for shell scripts to easily enter into a few levels of indentation and long line lengths, update the default to 2 spaces.
		
			
				
	
	
		
			59 lines
		
	
	
		
			2.8 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			59 lines
		
	
	
		
			2.8 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/bin/env bash
 | 
						|
 | 
						|
# Structured .json output from PDFs or images may differ subtly (or not so subtly)
 | 
						|
# based on the version of tesseract, its dependencies, and chip architecture.
 | 
						|
#
 | 
						|
# To update ingest-test expected outputs (structured .json files), this script:
 | 
						|
#   * builds an ubuntu image that
 | 
						|
#      * matches CI with respect to tesseract and OS deps
 | 
						|
#      * installs python dependencies from the local requirements/ directory
 | 
						|
#   * runs each test ingest script with OVERWRITE_FIXTURES=true
 | 
						|
#      * so updates are written to test_unstructured_ingest/expected-structured-output/
 | 
						|
#      * using local unstructured/ directory (i.e. from local git branch)
 | 
						|
#
 | 
						|
# It is recommended to run this script on x86_64 hardware.
 | 
						|
 | 
						|
# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Locate the directory holding this script, then work from its parent
# (the root of the repository).
SCRIPT_DIR=$(
  cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd
)
cd "${SCRIPT_DIR}/.." || exit 1

ARCHITECTURE=$(uname -m)

# Only warn on a non-x86_64 host; the script still proceeds.
case "$ARCHITECTURE" in
  x86_64) ;;
  *)
    echo "Warning: This script is designed to run on x86_64 hardware, but you're running on $ARCHITECTURE."
    ;;
esac

# Build the ubuntu image used by the docker commands later in this script.
./scripts/docker-build-ubuntu.sh
 | 
						|
 | 
						|
# Warn the user if they have an old image
IMAGE_NAME="unstructured-ubuntu:latest"

#######################################
# Convert a timestamp such as 2023-08-01T12:34:56.123456789Z to epoch seconds.
# Arguments: $1 - timestamp string
# Outputs:   epoch seconds on stdout
# Returns:   0 on success, 1 if no available `date` flavor can parse it
#######################################
_timestamp_to_epoch() {
  local timestamp=$1
  local epoch
  # GNU date parses the full timestamp via -d. BSD/macOS date does not, so
  # fall back to -j -f on the leading "YYYY-MM-DDTHH:MM:SS" part.
  # NOTE(review): the fallback treats the timestamp as UTC — confirm that
  # docker always reports .Created in UTC.
  epoch=$(date -d "$timestamp" +%s 2>/dev/null) ||
    epoch=$(date -j -u -f '%Y-%m-%dT%H:%M:%S' "${timestamp:0:19}" +%s 2>/dev/null) ||
    return 1
  printf '%s\n' "$epoch"
}

#######################################
# Print a warning when a docker image was created 7 or more days ago.
# The age check is advisory: if the creation time cannot be parsed, a note is
# written to stderr and the function succeeds, so the script keeps going.
# Arguments: $1 - image name (e.g. unstructured-ubuntu:latest)
# Outputs:   warning text on stdout when the image is old
# Returns:   0 normally; the status of `docker inspect` if the inspect fails
#######################################
warn_if_image_stale() {
  local image=$1
  local created created_epoch now_epoch age_days

  # A failing inspect is still treated as an error (under `set -e` the caller
  # aborts), exactly as before.
  created=$(docker inspect --format='{{.Created}}' "$image") || return

  if ! created_epoch=$(_timestamp_to_epoch "$created"); then
    echo "Note: could not parse creation time \"$created\" of image \"$image\"; skipping the image age check." >&2
    return 0
  fi

  now_epoch=$(date +%s)
  age_days=$(((now_epoch - created_epoch) / 86400))
  if ((age_days > 6)); then
    echo "WARNING: The image \"$image\" is more than 7 days old ($age_days days)."
    echo "You may want to 'docker rmi $image' and rerun this script if it is not current."
  fi
}

warn_if_image_stale "$IMAGE_NAME"
 | 
						|
 | 
						|
# Forward each credential that is set and non-empty in the caller's
# environment into the container; unset or empty ones are left out entirely.
env_args=()
for var_name in \
  DISCORD_TOKEN \
  SLACK_TOKEN \
  CONFLUENCE_USER_EMAIL \
  CONFLUENCE_API_TOKEN \
  GH_READ_ONLY_ACCESS_TOKEN; do
  if [ -n "${!var_name:-}" ]; then
    env_args+=(-e "$var_name=${!var_name}")
  fi
done

# Command executed inside the container: enable fixture overwriting, activate
# the python environment, print the tesseract version, then run the ingest
# test scripts one after another. The && chain stops at the first failure.
ingest_command="export OVERWRITE_FIXTURES=true && source ~/.bashrc && pyenv activate unstructured && tesseract --version &&
               ./test_unstructured_ingest/test-ingest-azure.sh &&
               ./test_unstructured_ingest/test-ingest-discord.sh &&
               ./test_unstructured_ingest/test-ingest-github.sh &&
               ./test_unstructured_ingest/test-ingest-biomed-api.sh &&
               ./test_unstructured_ingest/test-ingest-biomed-path.sh &&
               ./test_unstructured_ingest/test-ingest-s3.sh &&
               ./test_unstructured_ingest/test-ingest-slack.sh &&
               ./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh &&
               ./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh"

# The local unstructured/ and test_unstructured_ingest/ directories are
# bind-mounted under /root in the container.
# The ${arr[@]+...} form keeps an empty array from tripping `set -u` on older
# bash releases.
docker run --rm \
  -v "${SCRIPT_DIR}/../unstructured:/root/unstructured" \
  -v "${SCRIPT_DIR}/../test_unstructured_ingest:/root/test_unstructured_ingest" \
  ${env_args[@]+"${env_args[@]}"} \
  -w /root "$IMAGE_NAME" \
  bash -c "$ingest_command"
 |