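Packaging working better now: olmocrmix prepare/repackage scripts preserve the doc_id directory structure for custom datasets, plus import-order and formatting cleanups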

2025-11-04 03:56:16 +00:00 · 2025-10-09 22:12:02 +00:00 · 2025-10-09 22:12:02 +00:00 · 702c42f8e7
commit 702c42f8e7
parent 557bb9a5e9
6 changed files with 84 additions and 66 deletions
--- a/olmocr/bench/miners/mine_multilingual_gpt.py
+++ b/olmocr/bench/miners/mine_multilingual_gpt.py
@@ -20,13 +20,13 @@ from typing import Optional

 import boto3
 import pypdf
+from lingua import Language
 from openai import OpenAI
 from pydantic import BaseModel
 from tqdm import tqdm

 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.filter import PdfFilter
-from lingua import Language

 TARGET_IMAGE_DIM = 1024

--- a/olmocr/bench/tests.py
+++ b/olmocr/bench/tests.py
@@ -20,6 +20,7 @@ from .katex.render import compare_rendered_equations, render_equation
 # Tell pytest these are not tests
 __test__ = False

+
 @dataclass
 class TableData:
    """Class to hold table data and metadata about headers."""
--- a/olmocr/data/prepare_olmocrmix.py
+++ b/olmocr/data/prepare_olmocrmix.py
@@ -1,7 +1,7 @@
 import argparse
 import json
-import tarfile
 import shutil
+import tarfile
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from os import PathLike
 from pathlib import Path
@@ -231,14 +231,32 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:

                response_data = response

-                # Create folder structure using first 4 digits of id
-                # Make a folder structure, to prevent a huge amount of files in one folder, using the first 4 digits of the id, ex. id[:4]/id[4:].md
+                # Create folder structure
+                # For allenai/olmOCR-mix-0225: use first 4 characters as folder
+                # For other datasets: preserve the existing structure
+
+                if dataset_path == "allenai/olmOCR-mix-0225":
+                    # Standard format: use first 4 characters as folder
                    folder_name = doc_id[:4]
                    file_name = f"{doc_id[4:]}.md"

                    # Create directory
                    output_dir = processed_dir / folder_name
                    output_dir.mkdir(exist_ok=True)
+                else:
+                    # Custom format: preserve directory structure from doc_id
+                    # The doc_id already contains the full path structure
+                    if "/" in doc_id:
+                        # doc_id contains path separators
+                        path_parts = doc_id.rsplit("/", 1)
+                        folder_path = Path(path_parts[0])
+                        file_name = f"{path_parts[1]}.md"
+                        output_dir = processed_dir / folder_path
+                        output_dir.mkdir(parents=True, exist_ok=True)
+                    else:
+                        # No path separator, put at root
+                        file_name = f"{doc_id}.md"
+                        output_dir = processed_dir

                # Write markdown file with front matter and natural text
                output_file = output_dir / file_name
@@ -268,7 +286,12 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
                matched_pdf_path = extracted_pdfs_dir / f"{doc_id}.pdf"
                assert matched_pdf_path.exists(), "Matching PDF not found"

+                # Create symlink path based on dataset type
+                if dataset_path == "allenai/olmOCR-mix-0225":
                    symlink_path = output_dir / f"{doc_id[4:]}.pdf"
+                else:
+                    # For custom datasets, use the same filename as the markdown
+                    symlink_path = output_file.with_suffix(".pdf")

                # Create relative symlink to the PDF
                if not symlink_path.exists():
--- a/olmocr/data/repackage_olmocrmix.py
+++ b/olmocr/data/repackage_olmocrmix.py
@@ -17,13 +17,14 @@ import tarfile
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, Iterator, List, Optional, Tuple
-from olmocr.prompts import PageResponse
-from olmocr.train.dataloader import FrontMatterParser

 import pandas as pd
 import yaml
 from tqdm import tqdm

+from olmocr.prompts import PageResponse
+from olmocr.train.dataloader import FrontMatterParser
+
 DEFAULT_MAX_TAR_BYTES = 1_073_741_824  # 1 GiB


@@ -46,19 +47,13 @@ class DocumentRecord:
    pdf_relpath: Optional[str] = None


-
-
-
 def infer_doc_id(md_path: Path, processed_root: Path) -> str:
    """Reconstruct the doc_id used in parquet/index space."""
    rel = md_path.relative_to(processed_root)
-    if len(rel.parts) < 2:
-        stem = rel.stem
-        prefix = rel.stem
-    else:
-        prefix = rel.parts[0]
-        stem = Path(rel.parts[-1]).stem
-    return f"{prefix}{stem}"
+
+    # Simply preserve the directory structure as the doc_id
+    # Convert path to doc_id by removing extension
+    return str(rel.with_suffix(""))


 def infer_pdf_path(md_path: Path, doc_id: str, pdf_root: Optional[Path]) -> Path:
@@ -249,9 +244,7 @@ def write_pdf_tarballs(
                rec.chunk_name = tar_name
                inner_ref = f"{tar_name}:{rec.doc_id}.pdf"
                rec.pdf_relpath = f"{normalized_dir}/{inner_ref}" if normalized_dir else inner_ref
-                manifest_rows.append(
-                    {"doc_id": rec.doc_id, "chunk": tar_name, "arcname": f"{rec.doc_id}.pdf", "pdf_relpath": rec.pdf_relpath}
-                )
+                manifest_rows.append({"doc_id": rec.doc_id, "chunk": tar_name, "arcname": f"{rec.doc_id}.pdf", "pdf_relpath": rec.pdf_relpath})

        actual_size = tar_path.stat().st_size
        if actual_size > max_bytes:
--- a/scripts/hf_local_test.py
+++ b/scripts/hf_local_test.py
@@ -1,8 +1,8 @@
-import torch
 import base64
 import urllib.request
-
 from io import BytesIO
+
+import torch
 from PIL import Image
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

@@ -22,7 +22,6 @@ if __name__ == "__main__":
    # Render page 1 to an image
    image_base64 = render_pdf_to_base64png("./paper.pdf", 1, target_longest_image_dim=1288)

-
    # Build the full prompt
    messages = [
        {
@@ -46,7 +45,6 @@ if __name__ == "__main__":
    )
    inputs = {key: value.to(device) for (key, value) in inputs.items()}

-
    # Generate the output
    output = model.generate(
        **inputs,
@@ -59,8 +57,6 @@ if __name__ == "__main__":
    # Decode the output
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_length:]
-    text_output = processor.tokenizer.batch_decode(
-        new_tokens, skip_special_tokens=True
-    )
+    text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)

    print(text_output)
--- a/tests/test_olmocrmix.py
+++ b/tests/test_olmocrmix.py
@@ -20,14 +20,19 @@ def test_repackage_and_prepare_olmocrmix():

        repackage_result = subprocess.run(
            [
-                "python", "olmocr/data/repackage_olmocrmix.py",
-                "--processed-dir", str(sample_dataset),
-                "--subset", "test_subset",
-                "--split", "test_split",
-                "--output-dir", str(packaged_dir)
+                "python",
+                "olmocr/data/repackage_olmocrmix.py",
+                "--processed-dir",
+                str(sample_dataset),
+                "--subset",
+                "test_subset",
+                "--split",
+                "test_split",
+                "--output-dir",
+                str(packaged_dir),
            ],
            capture_output=True,
-            text=True
+            text=True,
        )

        assert repackage_result.returncode == 0, f"Repackage script failed with stderr: {repackage_result.stderr}\nstdout: {repackage_result.stdout}"
@@ -36,20 +41,24 @@ def test_repackage_and_prepare_olmocrmix():
        parquet_file = packaged_dir / "test_subset_test_split.parquet"
        assert parquet_file.exists(), f"Expected parquet file not found: {parquet_file}"

-        
        # Step 2: Repackage the sample dataset into parquet + tarballs
        unpackaged_dir = temp_path / "unpackaged"

        prepare_result = subprocess.run(
            [
-                "python", "olmocr/data/prepare_olmocrmix.py",
-                "--dataset-path", str(packaged_dir),
-                "--subset", "test_subset",
-                "--split", "test_split",
-                "--destination", str(unpackaged_dir)
+                "python",
+                "olmocr/data/prepare_olmocrmix.py",
+                "--dataset-path",
+                str(packaged_dir),
+                "--subset",
+                "test_subset",
+                "--split",
+                "test_split",
+                "--destination",
+                str(unpackaged_dir),
            ],
            capture_output=True,
-            text=True
+            text=True,
        )

        assert prepare_result.returncode == 0
@@ -62,11 +71,7 @@ def test_repackage_and_prepare_olmocrmix():
        assert unpacked_processed.exists(), f"Unpacked processed dir missing: {unpacked_processed}"

        def relative_files(root: Path):
-            return sorted(
-                path.relative_to(root)
-                for path in root.rglob("*")
-                if path.is_file()
-            )
+            return sorted(path.relative_to(root) for path in root.rglob("*") if path.is_file())

        sample_files = relative_files(sample_dataset)
        unpacked_files = relative_files(unpacked_processed)