mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 10:03:07 +00:00 
			
		
		
		
	 bd8a74d686
			
		
	
	
		bd8a74d686
		
			
		
	
	
	
	
		
			
			Given the tendency for shell scripts to easily enter into a few levels of indentation and long line lengths, update the default to 2 spaces.
		
			
				
	
	
		
			69 lines
		
	
	
		
			1.9 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			69 lines
		
	
	
		
			1.9 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env bash
 | |
| 
 | |
| # TODO:
 | |
| # * include the page number in output html filename, maybe all tables per page
 | |
| #   in one html page.
 | |
| 
 | |
| set -e
 | |
| 
 | |
| USAGE_MESSAGE="Usage: $0 <file>
 | |
| 
 | |
| Requires an unstructured output .json file as the only argument.
 | |
| 
 | |
| Each table in the file is saved as an individual html file and
 | |
| opened in Safari (if running on mac), providing a quick and easy
 | |
| way to see the structure and content of each Table element.
 | |
| "
 | |
| 
 | |
| # Check for the presence of an argument
 | |
| if [ "$#" -ne 1 ]; then
 | |
|   echo "$USAGE_MESSAGE"
 | |
|   exit 1
 | |
| fi
 | |
| 
 | |
| # The JSON file to be processed is the first argument
 | |
| JSON_FILE="$1"
 | |
| 
 | |
| # Extract the basename without the extension
 | |
| BASE_NAME=$(basename "$JSON_FILE" .json)
 | |
| 
 | |
| # Directory where the files will be saved
 | |
| TMP_TABLES_OUTPUTS_DIR="$HOME/tmp/tables-out"
 | |
| 
 | |
| # Check if the directory exists, if not create it
 | |
| if [ ! -d "$TMP_TABLES_OUTPUTS_DIR" ]; then
 | |
|   mkdir -p "$TMP_TABLES_OUTPUTS_DIR"
 | |
| fi
 | |
| 
 | |
| # Counter for the table files
 | |
| COUNTER=1
 | |
| 
 | |
| # Parsing the JSON and creating HTML files
 | |
| jq -c '.[] | select(.type == "Table") | .metadata.text_as_html' "$JSON_FILE" | while read -r HTML_CONTENT; do
 | |
|   # Remove leading and trailing quotes from the JSON string
 | |
|   HTML_CONTENT=${HTML_CONTENT#\"}
 | |
|   HTML_CONTENT=${HTML_CONTENT%\"}
 | |
|   # add a border and padding to clearly see cell definition
 | |
|   # shellcheck disable=SC2001
 | |
|   HTML_CONTENT=$(echo "$HTML_CONTENT" | sed 's/<table>/<table border="1" cellpadding="10">/')
 | |
|   # add newlines for readability in the html
 | |
|   # shellcheck disable=SC2001
 | |
|   HTML_CONTENT=$(echo "$HTML_CONTENT" | sed 's/>\s*</>\n</g')
 | |
| 
 | |
|   # Create filename based on the basename and counter
 | |
|   HTML_FILENAME="$TMP_TABLES_OUTPUTS_DIR/${BASE_NAME}-${COUNTER}.html"
 | |
| 
 | |
|   # Increment the counter
 | |
|   COUNTER=$((COUNTER + 1))
 | |
| 
 | |
|   # Save the HTML content to a file
 | |
|   echo "$HTML_CONTENT" >"$HTML_FILENAME"
 | |
| 
 | |
|   if [ "$(uname)" == "Darwin" ]; then
 | |
|     # Open the file in a new browser window
 | |
|     open -a "Safari" "$HTML_FILENAME" &
 | |
|   else
 | |
|     echo "$HTML_FILENAME"
 | |
|   fi
 | |
| done
 |