Add code to the dataloader so you can see what is getting filtered out of your dataset

Jake Poznanski 2025-08-19 18:45:01 +00:00
parent 84a0c432e7
commit a3d23d7de1

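With this change, filtered .md/.pdf pairs are copied into the directory passed via the new --save-filtered flag, grouped into subdirectories named after each markdown file's parent directory (see the diff below). A small sketch for browsing that output afterward; "filtered_out" is a hypothetical directory name, not something the commit fixes:

# Sketch: browse what --save-filtered copied out. "filtered_out" is a
# hypothetical directory name chosen for this example.
from pathlib import Path

save_dir = Path("filtered_out")
for sub in sorted(p for p in save_dir.iterdir() if p.is_dir()):
    files = sorted(f.name for f in sub.iterdir())
    print(f"{sub.name}: {files}")
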

@@ -974,6 +974,11 @@ if __name__ == "__main__":
        type=str,
        help="Save the processed image to the specified file path (e.g., output.png)",
    )
    parser.add_argument(
        "--save-filtered",
        type=str,
        help="Directory to save .md and .pdf files of filtered samples (samples that return None from pipeline)",
    )
    args = parser.parse_args()
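
Because the flag name uses a hyphen, argparse exposes it as args.save_filtered (hyphen mapped to underscore), which is how the second hunk reads it. A minimal standalone sketch, with a made-up invocation:

# Standalone sketch of the new flag; the parse_args list is a made-up example.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--save-filtered",
    type=str,
    help="Directory to save .md and .pdf files of filtered samples",
)
args = parser.parse_args(["--save-filtered", "filtered_out"])
assert args.save_filtered == "filtered_out"  # argparse maps --save-filtered to args.save_filtered
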
@@ -1022,6 +1027,107 @@ if __name__ == "__main__":
    print(f"Dataset length: {len(dataset)}")

    # Handle --save-filtered option
    if args.save_filtered:
        import shutil
        from pathlib import Path

        save_dir = Path(args.save_filtered)

        # Clear and create directory
        if save_dir.exists():
            shutil.rmtree(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)

        print(f"\n=== Checking for filtered samples ===")
        print(f"Will save filtered samples to: {save_dir}")

        # Function to process and copy a single sample
        def process_and_copy_sample(idx, dataset_samples, save_dir_str):
            """Process a sample and return info if it's filtered.

            Note: This function needs to be picklable for ProcessPoolExecutor,
            so it takes simple arguments rather than complex objects.
            """
            import shutil
            from pathlib import Path

            # Recreate dataset with same parameters
            # This is needed because dataset objects can't be pickled
            temp_dataset = BaseMarkdownPDFDataset.__new__(BaseMarkdownPDFDataset)
            temp_dataset.samples = dataset_samples
            temp_dataset.pipeline_steps = pipeline_steps

            try:
                sample = temp_dataset[idx]
                if sample is None:
                    # This sample was filtered out - get the original paths
                    original_sample = dataset_samples[idx]
                    md_path = original_sample['markdown_path']
                    pdf_path = original_sample['pdf_path']

                    save_dir = Path(save_dir_str)

                    # Create subdirectory to preserve some structure
                    # Use the parent directory name and file name
                    rel_path = md_path.parent.name
                    target_subdir = save_dir / rel_path
                    target_subdir.mkdir(parents=True, exist_ok=True)

                    # Copy markdown file
                    target_md = target_subdir / md_path.name
                    shutil.copy2(md_path, target_md)

                    # Copy PDF file
                    target_pdf = target_subdir / pdf_path.name
                    shutil.copy2(pdf_path, target_pdf)

                    return {
                        'index': idx,
                        'markdown_path': str(md_path),
                        'pdf_path': str(pdf_path)
                    }
                return None
            except Exception as e:
                print(f"Error processing sample {idx}: {e}")
                return None

        # Process all samples in parallel
        filtered_samples = []
        print(f"Processing {len(dataset)} samples to find and copy filtered ones...")

        with ProcessPoolExecutor(max_workers=8) as executor:
            # Submit all tasks
            futures = {
                executor.submit(process_and_copy_sample, idx, dataset.samples, str(save_dir)): idx
                for idx in range(len(dataset))
            }

            # Process results with progress bar
            with tqdm(total=len(dataset), desc="Processing samples") as pbar:
                for future in as_completed(futures):
                    result = future.result()
                    if result is not None:
                        filtered_samples.append(result)
                    pbar.update(1)

        # Sort filtered samples by index for consistent output
        filtered_samples.sort(key=lambda x: x['index'])

        print(f"\nFound and copied {len(filtered_samples)} filtered samples to: {save_dir}")

        if filtered_samples:
            print(f"First 10 filtered samples:")
            for i, sample_info in enumerate(filtered_samples[:10]):
                md_name = Path(sample_info['markdown_path']).name
                print(f"  Sample {sample_info['index']}: {md_name}")
            if len(filtered_samples) > 10:
                print(f"  ... and {len(filtered_samples) - 10} more")

        # Exit early if --save-filtered is used (don't continue with other analyses)
        print("\nCompleted saving filtered samples. Exiting.")
        exit(0)

    if len(dataset) > 0:
        # Show first few samples
        print("\nFirst 5 samples:")
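
The worker function above rebuilds the dataset with BaseMarkdownPDFDataset.__new__ instead of calling the constructor, since a fully initialized dataset object can't be pickled across process boundaries. A minimal sketch of that trick in isolation; Demo is a hypothetical stand-in, not the real class:

# Sketch of the __new__ trick: allocate an instance without running __init__,
# then set only the attributes the code path touches. Demo is hypothetical.
class Demo:
    def __init__(self, root):
        # Imagine slow or unpicklable setup here (scanning directories, etc.)
        self.samples = self._scan(root)

    def _scan(self, root):
        raise RuntimeError("expensive setup we want to skip")

    def __getitem__(self, idx):
        return self.samples[idx]

d = Demo.__new__(Demo)       # no __init__ call, so _scan never runs
d.samples = ["a", "b", "c"]  # inject the already-built sample list
print(d[1])                  # prints "b"

The trade-off is that such an instance only works for code paths that touch the attributes set by hand, which is all the worker needs here (samples and pipeline_steps).
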