mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 10:03:07 +00:00 
			
		
		
		
	 5052e6cb3b
			
		
	
	
		5052e6cb3b
		
			
		
	
	
	
	
		
			
			This PR adds a comparison, during the ingest tests, of the files' content as plain text (i.e., without the JSON formatting)
		
			
				
	
	
		
			14 lines
		
	
	
		
			414 B
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			14 lines
		
	
	
		
			414 B
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/bin/env bash

# Extract the "text" field of every element in a JSON file generated by the
# unstructured library and store the result as a plain-text file wrapped at
# 80 columns. The output is written to the $2 folder under the same name as
# the original file, with .txt appended as a suffix.
#
# Each text value is written exactly as jq prints it by default, i.e. as a
# JSON-encoded string (surrounding double quotes and escapes are kept).
#
# Arguments:
# - $1 path to the file to clean
# - $2 path to the folder to store the result (it is not created here, so it
#      must already exist)
#
# Exit status:
# - 0 on success
# - 2 when fewer than two arguments are given
# - non-zero when jq or fold fails (e.g. invalid JSON, unreadable input)

set -euo pipefail

if (($# < 2)); then
  printf 'Usage: %s <json-file> <output-dir>\n' "${0##*/}" >&2
  exit 2
fi

BASE=$(basename -- "$1")
DEST="$2/$BASE.txt"

# With pipefail, a jq failure aborts the script instead of being hidden by
# fold's successful exit status.
jq '.[].text' <"$1" | fold -w 80 -s >"$DEST"