Packaging working better now

Jake Poznanski 2025-10-09 22:12:02 +00:00
parent 557bb9a5e9
commit 702c42f8e7
6 changed files with 84 additions and 66 deletions

View File

@@ -20,13 +20,13 @@ from typing import Optional

 import boto3
 import pypdf
+from lingua import Language
 from openai import OpenAI
 from pydantic import BaseModel
 from tqdm import tqdm

 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.filter import PdfFilter
-from lingua import Language

 TARGET_IMAGE_DIM = 1024

View File

@@ -20,6 +20,7 @@ from .katex.render import compare_rendered_equations, render_equation

 # Tell pytest these are not tests
 __test__ = False

+
 @dataclass
 class TableData:
     """Class to hold table data and metadata about headers."""

View File

@ -1,7 +1,7 @@
import argparse import argparse
import json import json
import tarfile
import shutil import shutil
import tarfile
from concurrent.futures import ProcessPoolExecutor, as_completed from concurrent.futures import ProcessPoolExecutor, as_completed
from os import PathLike from os import PathLike
from pathlib import Path from pathlib import Path
@@ -231,14 +231,32 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
             response_data = response

-        # Create folder structure using first 4 digits of id
-        # Make a folder structure, to prevent a huge amount of files in one folder, using the first 4 digits of the id, ex. id[:4]/id[4:].md
-        folder_name = doc_id[:4]
-        file_name = f"{doc_id[4:]}.md"
-
-        # Create directory
-        output_dir = processed_dir / folder_name
-        output_dir.mkdir(exist_ok=True)
+        # Create folder structure
+        # For allenai/olmOCR-mix-0225: use first 4 characters as folder
+        # For other datasets: preserve the existing structure
+        if dataset_path == "allenai/olmOCR-mix-0225":
+            # Standard format: use first 4 characters as folder
+            folder_name = doc_id[:4]
+            file_name = f"{doc_id[4:]}.md"
+
+            # Create directory
+            output_dir = processed_dir / folder_name
+            output_dir.mkdir(exist_ok=True)
+        else:
+            # Custom format: preserve directory structure from doc_id
+            # The doc_id already contains the full path structure
+            if "/" in doc_id:
+                # doc_id contains path separators
+                path_parts = doc_id.rsplit("/", 1)
+                folder_path = Path(path_parts[0])
+                file_name = f"{path_parts[1]}.md"
+                output_dir = processed_dir / folder_path
+                output_dir.mkdir(parents=True, exist_ok=True)
+            else:
+                # No path separator, put at root
+                file_name = f"{doc_id}.md"
+                output_dir = processed_dir

         # Write markdown file with front matter and natural text
         output_file = output_dir / file_name
@@ -268,7 +286,12 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
             matched_pdf_path = extracted_pdfs_dir / f"{doc_id}.pdf"
             assert matched_pdf_path.exists(), "Matching PDF not found"

-            symlink_path = output_dir / f"{doc_id[4:]}.pdf"
+            # Create symlink path based on dataset type
+            if dataset_path == "allenai/olmOCR-mix-0225":
+                symlink_path = output_dir / f"{doc_id[4:]}.pdf"
+            else:
+                # For custom datasets, use the same filename as the markdown
+                symlink_path = output_file.with_suffix(".pdf")

             # Create relative symlink to the PDF
             if not symlink_path.exists():
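
Taken together, the two hunks above implement a single path-mapping rule. A minimal standalone sketch of that rule, for illustration only (the helper name resolve_output_paths is hypothetical and not part of this commit):

    from pathlib import Path

    def resolve_output_paths(dataset_path: str, doc_id: str, processed_dir: Path):
        # Mirrors the branching above: map a doc_id to (output_dir, file_name)
        if dataset_path == "allenai/olmOCR-mix-0225":
            # Standard mix: shard by the first 4 characters of the id
            return processed_dir / doc_id[:4], f"{doc_id[4:]}.md"
        if "/" in doc_id:
            # Custom dataset: the doc_id already encodes its directory structure
            folder, name = doc_id.rsplit("/", 1)
            return processed_dir / folder, f"{name}.md"
        # No separator: write at the root of the processed directory
        return processed_dir, f"{doc_id}.md"

    # ("allenai/olmOCR-mix-0225", "0123abcd") -> processed_dir / "0123", "abcd.md"
    # ("my/custom-set", "shard1/doc7")        -> processed_dir / "shard1", "doc7.md"

The PDF symlink then reuses the same stem in both branches, so each markdown file and its PDF land side by side.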

View File

@@ -17,13 +17,14 @@ import tarfile
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, Iterator, List, Optional, Tuple

-from olmocr.prompts import PageResponse
-from olmocr.train.dataloader import FrontMatterParser
 import pandas as pd
 import yaml
 from tqdm import tqdm

+from olmocr.prompts import PageResponse
+from olmocr.train.dataloader import FrontMatterParser
+
 DEFAULT_MAX_TAR_BYTES = 1_073_741_824  # 1 GiB
@@ -46,19 +47,13 @@ class DocumentRecord:
     pdf_relpath: Optional[str] = None


 def infer_doc_id(md_path: Path, processed_root: Path) -> str:
     """Reconstruct the doc_id used in parquet/index space."""
     rel = md_path.relative_to(processed_root)
-    if len(rel.parts) < 2:
-        stem = rel.stem
-        prefix = rel.stem
-    else:
-        prefix = rel.parts[0]
-        stem = Path(rel.parts[-1]).stem
-    return f"{prefix}{stem}"
+    # Simply preserve the directory structure as the doc_id
+    # Convert path to doc_id by removing extension
+    return str(rel.with_suffix(""))


 def infer_pdf_path(md_path: Path, doc_id: str, pdf_root: Optional[Path]) -> Path:
@@ -249,9 +244,7 @@ def write_pdf_tarballs(
             rec.chunk_name = tar_name
             inner_ref = f"{tar_name}:{rec.doc_id}.pdf"
             rec.pdf_relpath = f"{normalized_dir}/{inner_ref}" if normalized_dir else inner_ref
-            manifest_rows.append(
-                {"doc_id": rec.doc_id, "chunk": tar_name, "arcname": f"{rec.doc_id}.pdf", "pdf_relpath": rec.pdf_relpath}
-            )
+            manifest_rows.append({"doc_id": rec.doc_id, "chunk": tar_name, "arcname": f"{rec.doc_id}.pdf", "pdf_relpath": rec.pdf_relpath})

             actual_size = tar_path.stat().st_size
             if actual_size > max_bytes:
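
With the simplification above, infer_doc_id becomes the exact inverse of the layout that the prepare script now writes. A small illustrative check (the paths are made up for the example):

    from pathlib import Path

    def infer_doc_id(md_path: Path, processed_root: Path) -> str:
        # As in the hunk above: the relative path minus ".md" is the doc_id
        return str(md_path.relative_to(processed_root).with_suffix(""))

    root = Path("/tmp/processed")  # illustrative root
    assert infer_doc_id(root / "0024" / "abc.md", root) == "0024/abc"
    assert infer_doc_id(root / "shard1" / "sub" / "doc7.md", root) == "shard1/sub/doc7"
    assert infer_doc_id(root / "rootdoc.md", root) == "rootdoc"

Under the removed code, a root-level file produced a doubled id (prefix and stem were both rel.stem), and nested custom paths collapsed to the first part plus the stem, losing any intermediate directories.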

View File

@ -1,8 +1,8 @@
import torch
import base64 import base64
import urllib.request import urllib.request
from io import BytesIO from io import BytesIO
import torch
from PIL import Image from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
@@ -22,7 +22,6 @@ if __name__ == "__main__":
     # Render page 1 to an image
     image_base64 = render_pdf_to_base64png("./paper.pdf", 1, target_longest_image_dim=1288)

-
     # Build the full prompt
     messages = [
         {
@ -46,7 +45,6 @@ if __name__ == "__main__":
) )
inputs = {key: value.to(device) for (key, value) in inputs.items()} inputs = {key: value.to(device) for (key, value) in inputs.items()}
# Generate the output # Generate the output
output = model.generate( output = model.generate(
**inputs, **inputs,
@@ -59,8 +57,6 @@ if __name__ == "__main__":
     # Decode the output
     prompt_length = inputs["input_ids"].shape[1]
     new_tokens = output[:, prompt_length:]
-    text_output = processor.tokenizer.batch_decode(
-        new_tokens, skip_special_tokens=True
-    )
+    text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)

     print(text_output)

View File

@@ -20,14 +20,19 @@ def test_repackage_and_prepare_olmocrmix():
     repackage_result = subprocess.run(
         [
-            "python", "olmocr/data/repackage_olmocrmix.py",
-            "--processed-dir", str(sample_dataset),
-            "--subset", "test_subset",
-            "--split", "test_split",
-            "--output-dir", str(packaged_dir)
+            "python",
+            "olmocr/data/repackage_olmocrmix.py",
+            "--processed-dir",
+            str(sample_dataset),
+            "--subset",
+            "test_subset",
+            "--split",
+            "test_split",
+            "--output-dir",
+            str(packaged_dir),
         ],
         capture_output=True,
-        text=True
+        text=True,
     )

     assert repackage_result.returncode == 0, f"Repackage script failed with stderr: {repackage_result.stderr}\nstdout: {repackage_result.stdout}"
@@ -36,20 +41,24 @@ def test_repackage_and_prepare_olmocrmix():
     parquet_file = packaged_dir / "test_subset_test_split.parquet"
     assert parquet_file.exists(), f"Expected parquet file not found: {parquet_file}"

     # Step 2: Repackage the sample dataset into parquet + tarballs
     unpackaged_dir = temp_path / "unpackaged"
     prepare_result = subprocess.run(
         [
-            "python", "olmocr/data/prepare_olmocrmix.py",
-            "--dataset-path", str(packaged_dir),
-            "--subset", "test_subset",
-            "--split", "test_split",
-            "--destination", str(unpackaged_dir)
+            "python",
+            "olmocr/data/prepare_olmocrmix.py",
+            "--dataset-path",
+            str(packaged_dir),
+            "--subset",
+            "test_subset",
+            "--split",
+            "test_split",
+            "--destination",
+            str(unpackaged_dir),
         ],
         capture_output=True,
-        text=True
+        text=True,
     )

     assert prepare_result.returncode == 0
@@ -62,11 +71,7 @@ def test_repackage_and_prepare_olmocrmix():
     assert unpacked_processed.exists(), f"Unpacked processed dir missing: {unpacked_processed}"

     def relative_files(root: Path):
-        return sorted(
-            path.relative_to(root)
-            for path in root.rglob("*")
-            if path.is_file()
-        )
+        return sorted(path.relative_to(root) for path in root.rglob("*") if path.is_file())

     sample_files = relative_files(sample_dataset)
     unpacked_files = relative_files(unpacked_processed)
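
The two listings computed here set up the round-trip property this test exists to check: repackaging into parquet plus tarballs and then preparing back out should reproduce the original file tree. A sketch of how that comparison would typically conclude, under the assumption that the listings are compared directly (the test's remaining assertions fall outside this diff):

    # Hypothetical continuation; not part of this commit's hunks.
    assert sample_files == unpacked_files, (
        f"Round-trip mismatch; only in sample: {set(sample_files) - set(unpacked_files)}; "
        f"only in unpacked: {set(unpacked_files) - set(sample_files)}"
    )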