S2orc dataset extractor

2025-12-13 00:09:42 +00:00 · 2024-10-21 21:28:44 +00:00 · 2024-10-21 21:28:44 +00:00 · 31becaf7e4
commit 31becaf7e4
parent 302eee3da5
1 changed files with 36 additions and 0 deletions
--- a/scripts/s2orc_extractor.sh
+++ b/scripts/s2orc_extractor.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Define the output file for the metadata.sha1 fields
+OUTPUT_FILE="s2orc_pdfs_v2.txt"
+
+# Clear the output file if it already exists
+> "$OUTPUT_FILE"
+
+# Create a temporary directory for partial outputs
+temp_output_dir=$(mktemp -d)
+
+# Ensure the temporary directory is cleaned up on exit or error
+trap 'rm -rf "$temp_output_dir"' EXIT
+
+# Export the temporary output directory variable for use in xargs
+export temp_output_dir
+
+echo "temp dir $temp_output_dir"
+
+# Find all .gz files recursively from the current directory
+find 'split=train' -type f -name "*.gz" | \
+    xargs -P 30 -I{} bash -c '
+        gz_file="$1"
+        partial_output="$temp_output_dir/$(basename "$gz_file").txt"
+
+        # Stream uncompressed data directly into jq and format the output
+        gunzip -c "$gz_file" | jq -r '"'"'
+            select(.metadata.sha1 != null) |
+            "s3://ai2-s2-pdfs/" + (.metadata.sha1[:4]) + "/" + (.metadata.sha1[4:]) + ".pdf"
+        '"'"' >> "$partial_output"
+    ' _ {}
+
+# Concatenate all partial outputs into the final output file
+cat "$temp_output_dir"/*.txt >> "$OUTPUT_FILE"
+
+echo "All metadata.sha1 fields have been extracted to $OUTPUT_FILE."