olmocr/scripts/remove_paths_from_olmocrmix.py
2025-09-30 17:37:10 +00:00

176 lines
6.5 KiB
Python

# This code takes in a path to an olmocr-mix processed data folder
# and removes any documents that come from a premade deny list
#
# Example deny list, txt file:
# s3://ai2-s2-pdfs/4444/111111111111111111111111111111111111.pdf
# s3://ai2-s2-pdfs/5555/111111111111111111111111111111111112.pdf
# s3://ai2-s2-pdfs/6666/111111111111111111111111111111111113.pdf
#
# Should match paths like
#
# processed_00_documents_eval_s2pdf/4444/111111111111111111111111111111111111-1.md
# processed_00_documents_eval_s2pdf/4444/111111111111111111111111111111111111-1.pdf
# Where the path to processed_00_documents_eval_s2pdf is provided as an argument
# What it should do is move the bad files to a rejected folder, match both .md and .pdf
import argparse
import re
import shutil
from pathlib import Path
from typing import Set, Tuple
from tqdm import tqdm
def parse_deny_list(deny_list_file: Path) -> Set[Tuple[str, str]]:
"""Parse deny list file and extract subdirectory and base filename patterns."""
patterns = set()
with open(deny_list_file, "r") as f:
for line in tqdm(f):
line = line.strip()
if not line or line.startswith("#"):
continue
# Extract the subdirectory (e.g., "4444") and base filename from S3 path
# Pattern: s3://bucket-name/subdirectory/filename.pdf
match = re.match(r"s3://[^/]+/(\S+)/([^/]+)\.pdf$", line)
if match:
subdir = match.group(1)
base_filename = match.group(2)
patterns.add((subdir, base_filename))
else:
print(f"Warning: Could not parse deny list entry: {line}")
return patterns
def find_matching_files(processed_dir: Path, deny_patterns: Set[Tuple[str, str]]) -> list[Path]:
"""Find all files that match the deny patterns."""
matching_files = []
# First, glob all .pdf and .md files in the processed directory
all_pdf_files = list(processed_dir.glob("**/*.pdf"))
all_md_files = list(processed_dir.glob("**/*.md"))
all_files = all_pdf_files + all_md_files
print(f"Found {len(all_files)} total files ({len(all_pdf_files)} PDFs, {len(all_md_files)} MDs)")
# Now check each file against the deny patterns
for file_path in tqdm(all_files, desc="Checking files against deny list"):
# Extract the parent directory name and base filename
# Expected pattern: processed_dir/subdir/filename-pagenum.ext
try:
relative_path = file_path.relative_to(processed_dir)
parts = relative_path.parts
if len(parts) >= 2:
subdir = parts[0]
filename = parts[-1]
# Extract base filename without page number and extension
# Pattern: base_filename-pagenum.ext
match = re.match(r"^(.+?)-\d+\.(pdf|md)$", filename)
if match:
base_filename = match.group(1)
# Check if this (subdir, base_filename) pair is in our deny set
if (subdir, base_filename) in deny_patterns:
matching_files.append(file_path)
except Exception as e:
print(f"Warning: Could not process file {file_path}: {e}")
return matching_files
def move_files_to_rejected(files_to_move: list[Path], processed_dir: Path, rejected_dir: Path):
"""Move files to the rejected folder, maintaining directory structure."""
moved_count = 0
for file_path in files_to_move:
# Calculate relative path from processed_dir
relative_path = file_path.relative_to(processed_dir)
# Create target path in rejected folder
target_path = rejected_dir / relative_path
# Create target directory if it doesn't exist
target_path.parent.mkdir(parents=True, exist_ok=True)
try:
# Move the file
shutil.move(str(file_path), str(target_path))
print(f"Moved: {file_path} -> {target_path}")
moved_count += 1
except Exception as e:
print(f"Error moving {file_path}: {e}")
return moved_count
def main():
parser = argparse.ArgumentParser(description="Remove documents from olmocr-mix processed data based on a deny list")
parser.add_argument("processed_dir", type=Path, help="Path to the processed documents directory (e.g., processed_00_documents_eval_s2pdf)")
parser.add_argument("deny_list", type=Path, help="Path to the deny list text file containing S3 paths to reject")
parser.add_argument("--rejected-dir", type=Path, default=None, help="Path to the rejected files directory (default: processed_dir_rejected)")
parser.add_argument("--dry-run", action="store_true", help="Show what would be moved without actually moving files")
args = parser.parse_args()
# Validate inputs
if not args.processed_dir.exists():
print(f"Error: Processed directory does not exist: {args.processed_dir}")
return 1
if not args.deny_list.exists():
print(f"Error: Deny list file does not exist: {args.deny_list}")
return 1
# Set rejected directory
if args.rejected_dir is None:
args.rejected_dir = args.processed_dir.parent / f"{args.processed_dir.name}_rejected"
print(f"Processing directory: {args.processed_dir}")
print(f"Deny list file: {args.deny_list}")
print(f"Rejected directory: {args.rejected_dir}")
if args.dry_run:
print("\n** DRY RUN MODE - No files will be moved **\n")
# Parse deny list
deny_patterns = parse_deny_list(args.deny_list)
print(f"\nFound {len(deny_patterns)} unique deny patterns")
# Find matching files
matching_files = find_matching_files(args.processed_dir, deny_patterns)
print(f"Found {len(matching_files)} files to remove")
if not matching_files:
print("No files to remove.")
return 0
# Show summary
print("\nFiles to be moved:")
for f in matching_files[:10]: # Show first 10
print(f" - {f}")
if len(matching_files) > 10:
print(f" ... and {len(matching_files) - 10} more files")
# Move files (or simulate in dry-run mode)
if not args.dry_run:
# Create rejected directory
args.rejected_dir.mkdir(parents=True, exist_ok=True)
# Move the files
moved_count = move_files_to_rejected(matching_files, args.processed_dir, args.rejected_dir)
print(f"\nSuccessfully moved {moved_count} files to {args.rejected_dir}")
else:
print(f"\nDry run complete. Would move {len(matching_files)} files.")
return 0
if __name__ == "__main__":
exit(main())