Ok, looks like we have a nice extractor script for the dataset

2025-11-14 09:29:32 +00:00 · 2025-06-11 17:28:00 +00:00 · 2025-06-11 17:28:00 +00:00 · 0e17b50583
commit 0e17b50583
parent f19f7c1271
1 changed files with 44 additions and 26 deletions
--- a/olmocr/train/prepare_olmocrmix.py
+++ b/olmocr/train/prepare_olmocrmix.py
@ -72,34 +72,39 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
        extracted_dir = pdf_tarballs_dir / "extracted"
        extracted_dir.mkdir(exist_ok=True)
-        # Find all tarball files
+        # Check if PDFs are already extracted
-        tarball_files = list(pdf_tarballs_dir.glob("*.tar*")) + list(pdf_tarballs_dir.glob("*.tgz"))
+        existing_pdfs = list(extracted_dir.glob("*.pdf"))
        if existing_pdfs:
            print(f"Found {len(existing_pdfs)} already extracted PDFs in {extracted_dir}, skipping extraction step")
        else:
            # Find all tarball files
            tarball_files = list(pdf_tarballs_dir.glob("*.tar*")) + list(pdf_tarballs_dir.glob("*.tgz"))
-        if tarball_files:
+            if tarball_files:
-            print(f"\nFound {len(tarball_files)} PDF tarballs to extract...")
+                print(f"\nFound {len(tarball_files)} PDF tarballs to extract...")
-            # Use ProcessPoolExecutor for parallel extraction
+                # Use ProcessPoolExecutor for parallel extraction
-            with ProcessPoolExecutor() as executor:
+                with ProcessPoolExecutor() as executor:
-                # Submit all tasks
+                    # Submit all tasks
-                future_to_tarball = {}
+                    future_to_tarball = {}
-                for tarball in tarball_files:
+                    for tarball in tarball_files:
-                    future = executor.submit(extract_tarball, tarball, extracted_dir)
+                        future = executor.submit(extract_tarball, tarball, extracted_dir)
-                    future_to_tarball[future] = tarball
+                        future_to_tarball[future] = tarball
-                # Process results as they complete with progress bar
+                    # Process results as they complete with progress bar
-                total_files_extracted = 0
+                    total_files_extracted = 0
-                with tqdm(total=len(tarball_files), desc="Extracting tarballs") as pbar:
+                    with tqdm(total=len(tarball_files), desc="Extracting tarballs") as pbar:
-                    for future in as_completed(future_to_tarball):
+                        for future in as_completed(future_to_tarball):
-                        tarball = future_to_tarball[future]
+                            tarball = future_to_tarball[future]
-                        try:
+                            try:
-                            files_extracted = future.result()
+                                files_extracted = future.result()
-                            total_files_extracted += files_extracted
+                                total_files_extracted += files_extracted
-                            pbar.set_postfix({"files": total_files_extracted})
+                                pbar.set_postfix({"files": total_files_extracted})
-                        except Exception as e:
+                            except Exception as e:
-                            print(f"\nError with {tarball.name}: {e}")
+                                print(f"\nError with {tarball.name}: {e}")
-                        pbar.update(1)
+                            pbar.update(1)
-            print(f"Extracted {total_files_extracted} files from tarballs to {extracted_dir}")
+                print(f"Extracted {total_files_extracted} files from tarballs to {extracted_dir}")
    else:
        print(f"No PDF tarballs directory found at {pdf_tarballs_dir}")
@ -155,6 +160,19 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
                    # Write natural text
                    f.write(natural_text)
                # Look for matching PDF in extracted directory and create symlinks
                extracted_pdfs_dir = dest_path / "hugging_face" / "pdf_tarballs" / "extracted"
                # Find PDFs that match the ID pattern
                matched_pdf_path = extracted_pdfs_dir / f"{doc_id}.pdf"
                assert matched_pdf_path.exists(), "Matching PDF not found"
                symlink_path = output_dir / f"{doc_id[4:]}.pdf"
                # Create relative symlink to the PDF
                if not symlink_path.exists():
                    symlink_path.symlink_to(matched_pdf_path)
                total_processed += 1
                if total_processed % 1000 == 0:
                    print(f"Processed {total_processed} examples...")