mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-11 16:22:29 +00:00
176 lines
6.5 KiB
Python
176 lines
6.5 KiB
Python
# This code takes in a path to an olmocr-mix processed data folder
|
|
# and removes any documents that come from a premade deny list
|
|
#
|
|
# Example deny list, txt file:
|
|
# s3://ai2-s2-pdfs/4444/111111111111111111111111111111111111.pdf
|
|
# s3://ai2-s2-pdfs/5555/111111111111111111111111111111111112.pdf
|
|
# s3://ai2-s2-pdfs/6666/111111111111111111111111111111111113.pdf
|
|
#
|
|
# Should match paths like
|
|
#
|
|
# processed_00_documents_eval_s2pdf/4444/111111111111111111111111111111111111-1.md
|
|
# processed_00_documents_eval_s2pdf/4444/111111111111111111111111111111111111-1.pdf
|
|
# Where the path to processed_00_documents_eval_s2pdf is provided as an argument
|
|
|
|
# What it should do is move the bad files to a rejected folder, match both .md and .pdf
|
|
|
|
import argparse
|
|
import re
|
|
import shutil
|
|
from pathlib import Path
|
|
from typing import Set, Tuple
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
def parse_deny_list(deny_list_file: Path) -> Set[Tuple[str, str]]:
|
|
"""Parse deny list file and extract subdirectory and base filename patterns."""
|
|
patterns = set()
|
|
|
|
with open(deny_list_file, "r") as f:
|
|
for line in tqdm(f):
|
|
line = line.strip()
|
|
if not line or line.startswith("#"):
|
|
continue
|
|
|
|
# Extract the subdirectory (e.g., "4444") and base filename from S3 path
|
|
# Pattern: s3://bucket-name/subdirectory/filename.pdf
|
|
match = re.match(r"s3://[^/]+/(\S+)/([^/]+)\.pdf$", line)
|
|
if match:
|
|
subdir = match.group(1)
|
|
base_filename = match.group(2)
|
|
patterns.add((subdir, base_filename))
|
|
else:
|
|
print(f"Warning: Could not parse deny list entry: {line}")
|
|
|
|
return patterns
|
|
|
|
|
|
def find_matching_files(processed_dir: Path, deny_patterns: Set[Tuple[str, str]]) -> list[Path]:
|
|
"""Find all files that match the deny patterns."""
|
|
matching_files = []
|
|
|
|
# First, glob all .pdf and .md files in the processed directory
|
|
all_pdf_files = list(processed_dir.glob("**/*.pdf"))
|
|
all_md_files = list(processed_dir.glob("**/*.md"))
|
|
all_files = all_pdf_files + all_md_files
|
|
|
|
print(f"Found {len(all_files)} total files ({len(all_pdf_files)} PDFs, {len(all_md_files)} MDs)")
|
|
|
|
# Now check each file against the deny patterns
|
|
for file_path in tqdm(all_files, desc="Checking files against deny list"):
|
|
# Extract the parent directory name and base filename
|
|
# Expected pattern: processed_dir/subdir/filename-pagenum.ext
|
|
try:
|
|
relative_path = file_path.relative_to(processed_dir)
|
|
parts = relative_path.parts
|
|
|
|
if len(parts) >= 2:
|
|
subdir = parts[0]
|
|
filename = parts[-1]
|
|
|
|
# Extract base filename without page number and extension
|
|
# Pattern: base_filename-pagenum.ext
|
|
match = re.match(r"^(.+?)-\d+\.(pdf|md)$", filename)
|
|
if match:
|
|
base_filename = match.group(1)
|
|
|
|
# Check if this (subdir, base_filename) pair is in our deny set
|
|
if (subdir, base_filename) in deny_patterns:
|
|
matching_files.append(file_path)
|
|
except Exception as e:
|
|
print(f"Warning: Could not process file {file_path}: {e}")
|
|
|
|
return matching_files
|
|
|
|
|
|
def move_files_to_rejected(files_to_move: list[Path], processed_dir: Path, rejected_dir: Path):
|
|
"""Move files to the rejected folder, maintaining directory structure."""
|
|
moved_count = 0
|
|
|
|
for file_path in files_to_move:
|
|
# Calculate relative path from processed_dir
|
|
relative_path = file_path.relative_to(processed_dir)
|
|
|
|
# Create target path in rejected folder
|
|
target_path = rejected_dir / relative_path
|
|
|
|
# Create target directory if it doesn't exist
|
|
target_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
try:
|
|
# Move the file
|
|
shutil.move(str(file_path), str(target_path))
|
|
print(f"Moved: {file_path} -> {target_path}")
|
|
moved_count += 1
|
|
except Exception as e:
|
|
print(f"Error moving {file_path}: {e}")
|
|
|
|
return moved_count
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description="Remove documents from olmocr-mix processed data based on a deny list")
|
|
parser.add_argument("processed_dir", type=Path, help="Path to the processed documents directory (e.g., processed_00_documents_eval_s2pdf)")
|
|
parser.add_argument("deny_list", type=Path, help="Path to the deny list text file containing S3 paths to reject")
|
|
parser.add_argument("--rejected-dir", type=Path, default=None, help="Path to the rejected files directory (default: processed_dir_rejected)")
|
|
parser.add_argument("--dry-run", action="store_true", help="Show what would be moved without actually moving files")
|
|
|
|
args = parser.parse_args()
|
|
|
|
# Validate inputs
|
|
if not args.processed_dir.exists():
|
|
print(f"Error: Processed directory does not exist: {args.processed_dir}")
|
|
return 1
|
|
|
|
if not args.deny_list.exists():
|
|
print(f"Error: Deny list file does not exist: {args.deny_list}")
|
|
return 1
|
|
|
|
# Set rejected directory
|
|
if args.rejected_dir is None:
|
|
args.rejected_dir = args.processed_dir.parent / f"{args.processed_dir.name}_rejected"
|
|
|
|
print(f"Processing directory: {args.processed_dir}")
|
|
print(f"Deny list file: {args.deny_list}")
|
|
print(f"Rejected directory: {args.rejected_dir}")
|
|
|
|
if args.dry_run:
|
|
print("\n** DRY RUN MODE - No files will be moved **\n")
|
|
|
|
# Parse deny list
|
|
deny_patterns = parse_deny_list(args.deny_list)
|
|
print(f"\nFound {len(deny_patterns)} unique deny patterns")
|
|
|
|
# Find matching files
|
|
matching_files = find_matching_files(args.processed_dir, deny_patterns)
|
|
print(f"Found {len(matching_files)} files to remove")
|
|
|
|
if not matching_files:
|
|
print("No files to remove.")
|
|
return 0
|
|
|
|
# Show summary
|
|
print("\nFiles to be moved:")
|
|
for f in matching_files[:10]: # Show first 10
|
|
print(f" - {f}")
|
|
if len(matching_files) > 10:
|
|
print(f" ... and {len(matching_files) - 10} more files")
|
|
|
|
# Move files (or simulate in dry-run mode)
|
|
if not args.dry_run:
|
|
# Create rejected directory
|
|
args.rejected_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Move the files
|
|
moved_count = move_files_to_rejected(matching_files, args.processed_dir, args.rejected_dir)
|
|
print(f"\nSuccessfully moved {moved_count} files to {args.rejected_dir}")
|
|
else:
|
|
print(f"\nDry run complete. Would move {len(matching_files)} files.")
|
|
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
exit(main())
|