mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-10-31 01:55:06 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			176 lines
		
	
	
		
			6.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			176 lines
		
	
	
		
			6.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # This code takes in a path to an olmocr-mix processed data folder
 | |
| # and removes any documents that come from a premade deny list
 | |
| #
 | |
| # Example deny list, txt file:
 | |
| # s3://ai2-s2-pdfs/4444/111111111111111111111111111111111111.pdf
 | |
| # s3://ai2-s2-pdfs/5555/111111111111111111111111111111111112.pdf
 | |
| # s3://ai2-s2-pdfs/6666/111111111111111111111111111111111113.pdf
 | |
| #
 | |
| # Should match paths like
 | |
| #
 | |
| # processed_00_documents_eval_s2pdf/4444/111111111111111111111111111111111111-1.md
 | |
| # processed_00_documents_eval_s2pdf/4444/111111111111111111111111111111111111-1.pdf
 | |
| # Where the path to processed_00_documents_eval_s2pdf is provided as an argument
 | |
| 
 | |
| # What it should do is move the bad files to a rejected folder, match both .md and .pdf
 | |
| 
 | |
| import argparse
 | |
| import re
 | |
| import shutil
 | |
| from pathlib import Path
 | |
| from typing import Set, Tuple
 | |
| 
 | |
| from tqdm import tqdm
 | |
| 
 | |
| 
 | |
def parse_deny_list(deny_list_file: Path) -> Set[Tuple[str, str]]:
    """Parse a deny list file into a set of (subdirectory, base filename) pairs.

    Every line that is non-blank and does not start with "#" is expected to be
    an S3 path of the form ``s3://bucket-name/subdirectory/filename.pdf``.
    Lines that do not match that shape are reported with a printed warning and
    skipped.

    Args:
        deny_list_file: Path to a text file holding one S3 path per line.

    Returns:
        The set of ``(subdirectory, base_filename)`` tuples, where the base
        filename is the final path component with ``.pdf`` stripped.
    """
    # Compiled once instead of being looked up again for every line.
    # NOTE: the subdirectory group is `\S+`, so for keys nested more than one
    # level below the bucket it captures all intermediate segments, slashes
    # included (e.g. "a/b").
    entry_re = re.compile(r"s3://[^/]+/(\S+)/([^/]+)\.pdf$")
    patterns = set()

    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale's default encoding.
    with open(deny_list_file, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            match = entry_re.match(line)
            if match is None:
                print(f"Warning: Could not parse deny list entry: {line}")
                continue

            patterns.add((match.group(1), match.group(2)))

    return patterns
 | |
| 
 | |
| 
 | |
def find_matching_files(processed_dir: Path, deny_patterns: Set[Tuple[str, str]]) -> list[Path]:
    """Collect the .pdf and .md files under processed_dir that are on the deny list.

    A file is considered only when it sits at least one directory below
    ``processed_dir`` and its name looks like ``<base>-<pagenum>.pdf`` or
    ``<base>-<pagenum>.md``. It is selected when the pair (first directory
    component below ``processed_dir``, ``<base>``) is in ``deny_patterns``.

    Args:
        processed_dir: Root directory that is searched recursively.
        deny_patterns: Set of ``(subdirectory, base_filename)`` pairs to reject.

    Returns:
        The matching paths, all PDFs first and then all Markdown files, each
        group in the order produced by the glob.
    """
    page_file_re = re.compile(r"^(.+?)-\d+\.(pdf|md)$")

    # Gather both extensions up front so the totals can be reported.
    pdf_candidates = list(processed_dir.glob("**/*.pdf"))
    md_candidates = list(processed_dir.glob("**/*.md"))
    candidates = pdf_candidates + md_candidates

    print(f"Found {len(candidates)} total files ({len(pdf_candidates)} PDFs, {len(md_candidates)} MDs)")

    denied = []
    for candidate in tqdm(candidates, desc="Checking files against deny list"):
        try:
            parts = candidate.relative_to(processed_dir).parts

            # Files lying directly in processed_dir have no subdirectory to compare.
            if len(parts) < 2:
                continue

            # Strip the "-<pagenum>.<ext>" tail to recover the document's base name.
            parsed = page_file_re.match(parts[-1])
            if parsed and (parts[0], parsed.group(1)) in deny_patterns:
                denied.append(candidate)
        except Exception as e:
            print(f"Warning: Could not process file {candidate}: {e}")

    return denied
 | |
| 
 | |
| 
 | |
def move_files_to_rejected(files_to_move: list[Path], processed_dir: Path, rejected_dir: Path) -> int:
    """Move files into the rejected folder, keeping their layout relative to processed_dir.

    Each file is handled independently: if its destination directory cannot be
    created or the move itself fails, an error line is printed and processing
    continues with the next file.

    Args:
        files_to_move: Files to relocate; each must be located under ``processed_dir``.
        processed_dir: Root that the files' relative paths are computed against.
        rejected_dir: Root of the destination tree.

    Returns:
        The number of files that were moved successfully.

    Raises:
        ValueError: If a file is not located under ``processed_dir``
            (raised by ``Path.relative_to``).
    """
    moved_count = 0

    for file_path in files_to_move:
        # Mirror the file's position below processed_dir inside rejected_dir.
        target_path = rejected_dir / file_path.relative_to(processed_dir)

        try:
            # Creating the destination directory belongs to the guarded region
            # so a failure is reported for this file only instead of aborting
            # the remainder of the batch.
            target_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(file_path, target_path)
        except OSError as e:
            # shutil.Error derives from OSError, so this covers every
            # filesystem-level failure of the two calls above.
            print(f"Error moving {file_path}: {e}")
        else:
            print(f"Moved: {file_path} -> {target_path}")
            moved_count += 1

    return moved_count
 | |
| 
 | |
| 
 | |
def main() -> int:
    """Command-line entry point: move deny-listed documents out of a processed folder.

    Parses the command line, loads the deny list, finds the matching .pdf/.md
    files and either moves them to the rejected directory or, with
    ``--dry-run``, only reports what would be moved.

    Returns:
        Process exit status: 0 on success (including when nothing matched or
        in dry-run mode), 1 when an input path does not exist or when at least
        one matching file could not be moved.
    """
    parser = argparse.ArgumentParser(description="Remove documents from olmocr-mix processed data based on a deny list")
    parser.add_argument("processed_dir", type=Path, help="Path to the processed documents directory (e.g., processed_00_documents_eval_s2pdf)")
    parser.add_argument("deny_list", type=Path, help="Path to the deny list text file containing S3 paths to reject")
    parser.add_argument("--rejected-dir", type=Path, default=None, help="Path to the rejected files directory (default: processed_dir_rejected)")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be moved without actually moving files")

    args = parser.parse_args()

    # Validate inputs
    if not args.processed_dir.exists():
        print(f"Error: Processed directory does not exist: {args.processed_dir}")
        return 1

    if not args.deny_list.exists():
        print(f"Error: Deny list file does not exist: {args.deny_list}")
        return 1

    # Default the rejected directory to a sibling named "<processed_dir>_rejected"
    if args.rejected_dir is None:
        args.rejected_dir = args.processed_dir.parent / f"{args.processed_dir.name}_rejected"

    print(f"Processing directory: {args.processed_dir}")
    print(f"Deny list file: {args.deny_list}")
    print(f"Rejected directory: {args.rejected_dir}")

    if args.dry_run:
        print("\n** DRY RUN MODE - No files will be moved **\n")

    # Parse deny list
    deny_patterns = parse_deny_list(args.deny_list)
    print(f"\nFound {len(deny_patterns)} unique deny patterns")

    # Find matching files
    matching_files = find_matching_files(args.processed_dir, deny_patterns)
    print(f"Found {len(matching_files)} files to remove")

    if not matching_files:
        print("No files to remove.")
        return 0

    # Show a short preview rather than flooding the terminal
    print("\nFiles to be moved:")
    for f in matching_files[:10]:
        print(f"  - {f}")
    if len(matching_files) > 10:
        print(f"  ... and {len(matching_files) - 10} more files")

    if args.dry_run:
        print(f"\nDry run complete. Would move {len(matching_files)} files.")
        return 0

    # Create rejected directory
    args.rejected_dir.mkdir(parents=True, exist_ok=True)

    # Move the files
    moved_count = move_files_to_rejected(matching_files, args.processed_dir, args.rejected_dir)
    print(f"\nSuccessfully moved {moved_count} files to {args.rejected_dir}")

    # Individual move failures are only printed by the mover; surface them in
    # the exit status so a partially failed run is not reported as success.
    failed_count = len(matching_files) - moved_count
    if failed_count:
        print(f"Warning: {failed_count} files could not be moved")
        return 1

    return 0
 | |
| 
 | |
| 
 | |
if __name__ == "__main__":
    # Raise SystemExit directly: the bare `exit` name is injected by the
    # `site` module for interactive use and is not available when the
    # interpreter runs without it (e.g. `python -S`).
    raise SystemExit(main())
 | 
