mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-10-31 10:04:26 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			37 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Bash
		
	
	
	
	
	
			
		
		
	
	
			37 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Bash
		
	
	
	
	
	
#!/bin/bash
#
# Build a list of S3 PDF locations from the `metadata.sha1` field of every
# JSON record stored in the gzip-compressed files under INPUT_DIR.
#
# Every record whose .metadata.sha1 is non-null yields one line of the form
#   s3://ai2-s2-pdfs/<first 4 chars of sha1>/<remaining chars of sha1>.pdf
# and all lines are collected in OUTPUT_FILE (in the current directory).
#
# Inputs are processed in parallel; each input gets its own partial output
# file inside a temporary directory that is removed on exit.
#
# Exit status: 0 on success; non-zero if a prerequisite is missing or if any
# input could not be fully processed (OUTPUT_FILE is still written with
# whatever was extracted, but may be incomplete).
#
# Requires: find, xargs (-0 / -P / -r), sort -z, gunzip, jq.

set -euo pipefail

# Directory that is searched recursively for *.gz inputs.
readonly INPUT_DIR='split=train'
# Final list of s3:// PDF paths.
readonly OUTPUT_FILE="s2orc_pdfs_v2.txt"
# Number of inputs decompressed/filtered concurrently.
readonly PARALLEL_JOBS=30

# jq program applied to every record of the decompressed stream.
readonly JQ_FILTER='
            select(.metadata.sha1 != null) |
            "s3://ai2-s2-pdfs/" + (.metadata.sha1[:4]) + "/" + (.metadata.sha1[4:]) + ".pdf"
        '

# Print an error message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

#######################################
# Extract the S3 paths from one gzip-compressed input.
# Globals:   temp_output_dir (read), JQ_FILTER (read)
# Arguments: $1 - path of the .gz file, as printed by find
# Outputs:   writes "$temp_output_dir/<path>.txt"; warning on stderr on error
# Returns:   0 on success, non-zero if gunzip or jq failed
#######################################
extract_sha1_paths() {
  local gz_file=$1
  # Mirror the input's relative path below the temp dir so that two inputs
  # with the same basename in different directories never share (and
  # concurrently write to) one partial file.
  local partial_output="$temp_output_dir/$gz_file.txt"

  mkdir -p "${partial_output%/*}" || return

  # Stream uncompressed data directly into jq and format the output.
  if ! gunzip -c "$gz_file" | jq -r "$JQ_FILTER" > "$partial_output"; then
    printf 'warning: failed to process %s\n' "$gz_file" >&2
    return 1
  fi
}

# Verify prerequisites before touching OUTPUT_FILE, so a usage error does not
# clobber an existing result.
for tool in find xargs sort gunzip jq; do
  command -v "$tool" > /dev/null || die "required tool not found: $tool"
done
[[ -d "$INPUT_DIR" ]] || die "input directory not found: $INPUT_DIR"

# Clear the output file if it already exists.
: > "$OUTPUT_FILE" || die "cannot write to $OUTPUT_FILE"

# Create a temporary directory for partial outputs and make sure it is
# cleaned up on exit or error.
temp_output_dir=$(mktemp -d) || die "mktemp failed"
trap 'rm -rf -- "$temp_output_dir"' EXIT

# The worker shells started by xargs need the temp dir, the jq program and
# the worker function itself.
export temp_output_dir JQ_FILTER
export -f extract_sha1_paths

echo "temp dir $temp_output_dir"

# Find all .gz files recursively below INPUT_DIR and process them in
# parallel. NUL-delimited names keep whitespace, quotes and backslashes in
# paths intact. `set` options are not inherited by `bash -c`, so pipefail is
# enabled explicitly in each worker to surface gunzip failures.
status=0
find "$INPUT_DIR" -type f -name '*.gz' -print0 \
  | xargs -0 -r -P "$PARALLEL_JOBS" -n 1 \
      bash -c 'set -o pipefail; extract_sha1_paths "$1"' _ \
  || status=$?

# Concatenate all partial outputs into the final output file. Going through
# find/xargs (instead of a shell glob) copes with zero partial files and with
# more files than fit on one command line; the byte-wise sort by relative
# path makes the line order reproducible.
find "$temp_output_dir" -type f -name '*.txt' -print0 \
  | LC_ALL=C sort -z \
  | xargs -0 -r cat >> "$OUTPUT_FILE" \
  || die "failed to assemble $OUTPUT_FILE"

if ((status != 0)); then
  printf 'error: some inputs could not be processed (exit status %d); %s may be incomplete\n' \
    "$status" "$OUTPUT_FILE" >&2
  exit "$status"
fi

echo "All metadata.sha1 fields have been extracted to $OUTPUT_FILE."
