mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-05 03:31:13 +00:00
S2orc dataset extractor
This commit is contained in:
parent
302eee3da5
commit
31becaf7e4
36
scripts/s2orc_extractor.sh
Normal file
36
scripts/s2orc_extractor.sh
Normal file
@ -0,0 +1,36 @@
|
||||
#!/bin/bash
#
# Build a list of S3 PDF locations from the metadata.sha1 field of S2ORC shards.
#
# Usage: s2orc_extractor.sh [INPUT_DIR [OUTPUT_FILE]]
#   INPUT_DIR    directory searched recursively for *.gz shards
#                (default: split=train)
#   OUTPUT_FILE  file that receives one s3:// URL per line; it is truncated
#                at start-up (default: s2orc_pdfs_v2.txt)
#
# Every shard is decompressed and streamed through jq. Each JSON record whose
# .metadata.sha1 is non-null yields one line of the form
#   s3://ai2-s2-pdfs/<first 4 chars of sha1>/<rest of sha1>.pdf
#
# Requires: bash, find, xargs (with -0 and -P support), gunzip, jq, cksum.

set -euo pipefail

readonly INPUT_DIR="${1:-split=train}"
readonly OUTPUT_FILE="${2:-s2orc_pdfs_v2.txt}"
# Number of shards processed concurrently by xargs.
readonly PARALLEL_JOBS=30

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

#######################################
# Convert one gzip-compressed shard into a partial list of PDF URLs.
# Globals:   temp_output_dir (read) - directory receiving the partial output
# Arguments: $1 - path to the .gz shard
# Outputs:   writes "<basename>.<path checksum>.txt" into temp_output_dir
# Returns:   non-zero if checksum, decompression or jq fails
#######################################
extract_pdf_paths() {
  local gz_file=$1
  local path_sum partial_output

  # Shards in different subdirectories may share a basename; a checksum of the
  # full path keeps their partial outputs apart so parallel workers never
  # write to the same file.
  path_sum=$(printf '%s' "$gz_file" | cksum)
  path_sum=${path_sum%% *}
  partial_output="$temp_output_dir/$(basename -- "$gz_file").${path_sum}.txt"

  # Stream uncompressed data directly into jq and format the output.
  gunzip -c -- "$gz_file" | jq -r '
    select(.metadata.sha1 != null) |
    "s3://ai2-s2-pdfs/" + (.metadata.sha1[:4]) + "/" + (.metadata.sha1[4:]) + ".pdf"
  ' > "$partial_output"
}

command -v jq > /dev/null || die "jq is required but was not found in PATH"
[[ -d "$INPUT_DIR" ]] || die "input directory not found: $INPUT_DIR"

# Clear the output file if it already exists (also fails early if unwritable).
: > "$OUTPUT_FILE"

# Create a temporary directory for partial outputs and make sure it is removed
# on every exit path.
temp_output_dir=$(mktemp -d) || die "could not create a temporary directory"
trap 'rm -rf -- "${temp_output_dir:?}"' EXIT

# The worker shells started by xargs need both the directory and the function.
export temp_output_dir
export -f extract_pdf_paths

echo "temp dir $temp_output_dir"

# Find all .gz files recursively below INPUT_DIR and process them in parallel.
# NUL-delimited paths survive spaces, quotes and newlines in file names.
# "$BASH" guarantees the workers run the same interpreter that exported the
# function; strict mode is re-enabled there because shell options are not
# inherited by child shells.
find "$INPUT_DIR" -type f -name '*.gz' -print0 \
  | xargs -0 -P "$PARALLEL_JOBS" -I{} \
    "$BASH" -c 'set -euo pipefail; extract_pdf_paths "$1"' _ {} \
  || die "one or more shards could not be processed"

# Concatenate all partial outputs into the final output file. nullglob keeps
# the pattern from being passed literally to cat when no shard was found.
shopt -s nullglob
partial_outputs=("$temp_output_dir"/*.txt)
shopt -u nullglob
if ((${#partial_outputs[@]} > 0)); then
  cat -- "${partial_outputs[@]}" >> "$OUTPUT_FILE"
fi

echo "All metadata.sha1 fields have been extracted to $OUTPUT_FILE."
|
||||
Loading…
x
Reference in New Issue
Block a user