diff --git a/scripts/s2orc_extractor.sh b/scripts/s2orc_extractor.sh new file mode 100644 index 0000000..9c1e82b --- /dev/null +++ b/scripts/s2orc_extractor.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Define the output file for the metadata.sha1 fields +OUTPUT_FILE="s2orc_pdfs_v2.txt" + +# Clear the output file if it already exists +> "$OUTPUT_FILE" + +# Create a temporary directory for partial outputs +temp_output_dir=$(mktemp -d) + +# Ensure the temporary directory is cleaned up on exit or error +trap 'rm -rf "$temp_output_dir"' EXIT + +# Export the temporary output directory variable for use in xargs +export temp_output_dir + +echo "temp dir $temp_output_dir" + +# Find all .gz files recursively from the current directory +find 'split=train' -type f -name "*.gz" | \ + xargs -P 30 -I{} bash -c ' + gz_file="$1" + partial_output="$temp_output_dir/$(basename "$gz_file").txt" + + # Stream uncompressed data directly into jq and format the output + gunzip -c "$gz_file" | jq -r '"'"' + select(.metadata.sha1 != null) | + "s3://ai2-s2-pdfs/" + (.metadata.sha1[:4]) + "/" + (.metadata.sha1[4:]) + ".pdf" + '"'"' >> "$partial_output" + ' _ {} + +# Concatenate all partial outputs into the final output file +cat "$temp_output_dir"/*.txt >> "$OUTPUT_FILE" + +echo "All metadata.sha1 fields have been extracted to $OUTPUT_FILE."