Ok, looks like we have a nice extractor script for the dataset

This commit is contained in:
Jake Poznanski 2025-06-11 17:28:00 +00:00
parent f19f7c1271
commit 0e17b50583


@ -72,6 +72,11 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
    extracted_dir = pdf_tarballs_dir / "extracted"
    extracted_dir.mkdir(exist_ok=True)

    # Check if PDFs are already extracted
    existing_pdfs = list(extracted_dir.glob("*.pdf"))
    if existing_pdfs:
        print(f"Found {len(existing_pdfs)} already extracted PDFs in {extracted_dir}, skipping extraction step")
    else:
        # Find all tarball files
        tarball_files = list(pdf_tarballs_dir.glob("*.tar*")) + list(pdf_tarballs_dir.glob("*.tgz"))
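For context: the rest of the else branch is not shown in this hunk, but it presumably unpacks each discovered tarball into extracted_dir. A minimal sketch of what that step could look like, assuming standard tarfile usage and that the PDFs sit at the top level of each archive (names and structure here are assumptions, not the commit's actual code):

    import tarfile

    for tarball in tarball_files:
        # "r:*" lets tarfile auto-detect the compression (gzip, bz2, xz, or none)
        with tarfile.open(tarball, "r:*") as tf:
            tf.extractall(path=extracted_dir)
    print(f"Extracted {len(tarball_files)} tarballs into {extracted_dir}")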
@ -155,6 +160,19 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
                # Write natural text
                f.write(natural_text)

            # Look for the matching PDF in the extracted directory and create a symlink
            extracted_pdfs_dir = dest_path / "hugging_face" / "pdf_tarballs" / "extracted"

            # Find the PDF matching this document's ID
            matched_pdf_path = extracted_pdfs_dir / f"{doc_id}.pdf"
            assert matched_pdf_path.exists(), f"Matching PDF not found: {matched_pdf_path}"

            # Name the link without the 4-character ID prefix
            symlink_path = output_dir / f"{doc_id[4:]}.pdf"

            # Create a symlink to the PDF if one does not already exist
            if not symlink_path.exists():
                symlink_path.symlink_to(matched_pdf_path)

            total_processed += 1
            if total_processed % 1000 == 0:
                print(f"Processed {total_processed} examples...")