mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-11-04 03:56:16 +00:00 
			
		
		
		
	Packaging working better now
This commit is contained in:
		
							parent
							
								
									557bb9a5e9
								
							
						
					
					
						commit
						702c42f8e7
					
				@ -20,13 +20,13 @@ from typing import Optional
 | 
			
		||||
 | 
			
		||||
import boto3
 | 
			
		||||
import pypdf
 | 
			
		||||
from lingua import Language
 | 
			
		||||
from openai import OpenAI
 | 
			
		||||
from pydantic import BaseModel
 | 
			
		||||
from tqdm import tqdm
 | 
			
		||||
 | 
			
		||||
from olmocr.data.renderpdf import render_pdf_to_base64png
 | 
			
		||||
from olmocr.filter import PdfFilter
 | 
			
		||||
from lingua import Language
 | 
			
		||||
 | 
			
		||||
TARGET_IMAGE_DIM = 1024
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -20,6 +20,7 @@ from .katex.render import compare_rendered_equations, render_equation
 | 
			
		||||
# Tell pytest these are not tests
 | 
			
		||||
__test__ = False
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
 | 
			
		||||
class TableData:
 | 
			
		||||
    """Class to hold table data and metadata about headers."""
 | 
			
		||||
 | 
			
		||||
@ -1,7 +1,7 @@
 | 
			
		||||
import argparse
 | 
			
		||||
import json
 | 
			
		||||
import tarfile
 | 
			
		||||
import shutil
 | 
			
		||||
import tarfile
 | 
			
		||||
from concurrent.futures import ProcessPoolExecutor, as_completed
 | 
			
		||||
from os import PathLike
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
@ -231,14 +231,32 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
 | 
			
		||||
 | 
			
		||||
                response_data = response
 | 
			
		||||
 | 
			
		||||
                # Create folder structure using first 4 digits of id
 | 
			
		||||
                # Make a folder structure, to prevent a huge amount of files in one folder, using the first 4 digits of the id, ex. id[:4]/id[4:].md
 | 
			
		||||
                # Create folder structure
 | 
			
		||||
                # For allenai/olmOCR-mix-0225: use first 4 characters as folder
 | 
			
		||||
                # For other datasets: preserve the existing structure
 | 
			
		||||
 | 
			
		||||
                if dataset_path == "allenai/olmOCR-mix-0225":
 | 
			
		||||
                    # Standard format: use first 4 characters as folder
 | 
			
		||||
                    folder_name = doc_id[:4]
 | 
			
		||||
                    file_name = f"{doc_id[4:]}.md"
 | 
			
		||||
 | 
			
		||||
                    # Create directory
 | 
			
		||||
                    output_dir = processed_dir / folder_name
 | 
			
		||||
                    output_dir.mkdir(exist_ok=True)
 | 
			
		||||
                else:
 | 
			
		||||
                    # Custom format: preserve directory structure from doc_id
 | 
			
		||||
                    # The doc_id already contains the full path structure
 | 
			
		||||
                    if "/" in doc_id:
 | 
			
		||||
                        # doc_id contains path separators
 | 
			
		||||
                        path_parts = doc_id.rsplit("/", 1)
 | 
			
		||||
                        folder_path = Path(path_parts[0])
 | 
			
		||||
                        file_name = f"{path_parts[1]}.md"
 | 
			
		||||
                        output_dir = processed_dir / folder_path
 | 
			
		||||
                        output_dir.mkdir(parents=True, exist_ok=True)
 | 
			
		||||
                    else:
 | 
			
		||||
                        # No path separator, put at root
 | 
			
		||||
                        file_name = f"{doc_id}.md"
 | 
			
		||||
                        output_dir = processed_dir
 | 
			
		||||
 | 
			
		||||
                # Write markdown file with front matter and natural text
 | 
			
		||||
                output_file = output_dir / file_name
 | 
			
		||||
@ -268,7 +286,12 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
 | 
			
		||||
                matched_pdf_path = extracted_pdfs_dir / f"{doc_id}.pdf"
 | 
			
		||||
                assert matched_pdf_path.exists(), "Matching PDF not found"
 | 
			
		||||
 | 
			
		||||
                # Create symlink path based on dataset type
 | 
			
		||||
                if dataset_path == "allenai/olmOCR-mix-0225":
 | 
			
		||||
                    symlink_path = output_dir / f"{doc_id[4:]}.pdf"
 | 
			
		||||
                else:
 | 
			
		||||
                    # For custom datasets, use the same filename as the markdown
 | 
			
		||||
                    symlink_path = output_file.with_suffix(".pdf")
 | 
			
		||||
 | 
			
		||||
                # Create relative symlink to the PDF
 | 
			
		||||
                if not symlink_path.exists():
 | 
			
		||||
 | 
			
		||||
@ -17,13 +17,14 @@ import tarfile
 | 
			
		||||
from dataclasses import dataclass
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from typing import Dict, Iterator, List, Optional, Tuple
 | 
			
		||||
from olmocr.prompts import PageResponse
 | 
			
		||||
from olmocr.train.dataloader import FrontMatterParser
 | 
			
		||||
 | 
			
		||||
import pandas as pd
 | 
			
		||||
import yaml
 | 
			
		||||
from tqdm import tqdm
 | 
			
		||||
 | 
			
		||||
from olmocr.prompts import PageResponse
 | 
			
		||||
from olmocr.train.dataloader import FrontMatterParser
 | 
			
		||||
 | 
			
		||||
DEFAULT_MAX_TAR_BYTES = 1_073_741_824  # 1 GiB
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -46,19 +47,13 @@ class DocumentRecord:
 | 
			
		||||
    pdf_relpath: Optional[str] = None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def infer_doc_id(md_path: Path, processed_root: Path) -> str:
 | 
			
		||||
    """Reconstruct the doc_id used in parquet/index space."""
 | 
			
		||||
    rel = md_path.relative_to(processed_root)
 | 
			
		||||
    if len(rel.parts) < 2:
 | 
			
		||||
        stem = rel.stem
 | 
			
		||||
        prefix = rel.stem
 | 
			
		||||
    else:
 | 
			
		||||
        prefix = rel.parts[0]
 | 
			
		||||
        stem = Path(rel.parts[-1]).stem
 | 
			
		||||
    return f"{prefix}{stem}"
 | 
			
		||||
 | 
			
		||||
    # Simply preserve the directory structure as the doc_id
 | 
			
		||||
    # Convert path to doc_id by removing extension
 | 
			
		||||
    return str(rel.with_suffix(""))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def infer_pdf_path(md_path: Path, doc_id: str, pdf_root: Optional[Path]) -> Path:
 | 
			
		||||
@ -249,9 +244,7 @@ def write_pdf_tarballs(
 | 
			
		||||
                rec.chunk_name = tar_name
 | 
			
		||||
                inner_ref = f"{tar_name}:{rec.doc_id}.pdf"
 | 
			
		||||
                rec.pdf_relpath = f"{normalized_dir}/{inner_ref}" if normalized_dir else inner_ref
 | 
			
		||||
                manifest_rows.append(
 | 
			
		||||
                    {"doc_id": rec.doc_id, "chunk": tar_name, "arcname": f"{rec.doc_id}.pdf", "pdf_relpath": rec.pdf_relpath}
 | 
			
		||||
                )
 | 
			
		||||
                manifest_rows.append({"doc_id": rec.doc_id, "chunk": tar_name, "arcname": f"{rec.doc_id}.pdf", "pdf_relpath": rec.pdf_relpath})
 | 
			
		||||
 | 
			
		||||
        actual_size = tar_path.stat().st_size
 | 
			
		||||
        if actual_size > max_bytes:
 | 
			
		||||
 | 
			
		||||
@ -1,8 +1,8 @@
 | 
			
		||||
import torch
 | 
			
		||||
import base64
 | 
			
		||||
import urllib.request
 | 
			
		||||
 | 
			
		||||
from io import BytesIO
 | 
			
		||||
 | 
			
		||||
import torch
 | 
			
		||||
from PIL import Image
 | 
			
		||||
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
 | 
			
		||||
 | 
			
		||||
@ -22,7 +22,6 @@ if __name__ == "__main__":
 | 
			
		||||
    # Render page 1 to an image
 | 
			
		||||
    image_base64 = render_pdf_to_base64png("./paper.pdf", 1, target_longest_image_dim=1288)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    # Build the full prompt
 | 
			
		||||
    messages = [
 | 
			
		||||
        {
 | 
			
		||||
@ -46,7 +45,6 @@ if __name__ == "__main__":
 | 
			
		||||
    )
 | 
			
		||||
    inputs = {key: value.to(device) for (key, value) in inputs.items()}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    # Generate the output
 | 
			
		||||
    output = model.generate(
 | 
			
		||||
        **inputs,
 | 
			
		||||
@ -59,8 +57,6 @@ if __name__ == "__main__":
 | 
			
		||||
    # Decode the output
 | 
			
		||||
    prompt_length = inputs["input_ids"].shape[1]
 | 
			
		||||
    new_tokens = output[:, prompt_length:]
 | 
			
		||||
    text_output = processor.tokenizer.batch_decode(
 | 
			
		||||
        new_tokens, skip_special_tokens=True
 | 
			
		||||
    )
 | 
			
		||||
    text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
 | 
			
		||||
 | 
			
		||||
    print(text_output)
 | 
			
		||||
@ -20,14 +20,19 @@ def test_repackage_and_prepare_olmocrmix():
 | 
			
		||||
 | 
			
		||||
        repackage_result = subprocess.run(
 | 
			
		||||
            [
 | 
			
		||||
                "python", "olmocr/data/repackage_olmocrmix.py",
 | 
			
		||||
                "--processed-dir", str(sample_dataset),
 | 
			
		||||
                "--subset", "test_subset",
 | 
			
		||||
                "--split", "test_split",
 | 
			
		||||
                "--output-dir", str(packaged_dir)
 | 
			
		||||
                "python",
 | 
			
		||||
                "olmocr/data/repackage_olmocrmix.py",
 | 
			
		||||
                "--processed-dir",
 | 
			
		||||
                str(sample_dataset),
 | 
			
		||||
                "--subset",
 | 
			
		||||
                "test_subset",
 | 
			
		||||
                "--split",
 | 
			
		||||
                "test_split",
 | 
			
		||||
                "--output-dir",
 | 
			
		||||
                str(packaged_dir),
 | 
			
		||||
            ],
 | 
			
		||||
            capture_output=True,
 | 
			
		||||
            text=True
 | 
			
		||||
            text=True,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        assert repackage_result.returncode == 0, f"Repackage script failed with stderr: {repackage_result.stderr}\nstdout: {repackage_result.stdout}"
 | 
			
		||||
@ -36,20 +41,24 @@ def test_repackage_and_prepare_olmocrmix():
 | 
			
		||||
        parquet_file = packaged_dir / "test_subset_test_split.parquet"
 | 
			
		||||
        assert parquet_file.exists(), f"Expected parquet file not found: {parquet_file}"
 | 
			
		||||
 | 
			
		||||
        
 | 
			
		||||
        # Step 2: Repackage the sample dataset into parquet + tarballs
 | 
			
		||||
        unpackaged_dir = temp_path / "unpackaged"
 | 
			
		||||
 | 
			
		||||
        prepare_result = subprocess.run(
 | 
			
		||||
            [
 | 
			
		||||
                "python", "olmocr/data/prepare_olmocrmix.py",
 | 
			
		||||
                "--dataset-path", str(packaged_dir),
 | 
			
		||||
                "--subset", "test_subset",
 | 
			
		||||
                "--split", "test_split",
 | 
			
		||||
                "--destination", str(unpackaged_dir)
 | 
			
		||||
                "python",
 | 
			
		||||
                "olmocr/data/prepare_olmocrmix.py",
 | 
			
		||||
                "--dataset-path",
 | 
			
		||||
                str(packaged_dir),
 | 
			
		||||
                "--subset",
 | 
			
		||||
                "test_subset",
 | 
			
		||||
                "--split",
 | 
			
		||||
                "test_split",
 | 
			
		||||
                "--destination",
 | 
			
		||||
                str(unpackaged_dir),
 | 
			
		||||
            ],
 | 
			
		||||
            capture_output=True,
 | 
			
		||||
            text=True
 | 
			
		||||
            text=True,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        assert prepare_result.returncode == 0
 | 
			
		||||
@ -62,11 +71,7 @@ def test_repackage_and_prepare_olmocrmix():
 | 
			
		||||
        assert unpacked_processed.exists(), f"Unpacked processed dir missing: {unpacked_processed}"
 | 
			
		||||
 | 
			
		||||
        def relative_files(root: Path):
 | 
			
		||||
            return sorted(
 | 
			
		||||
                path.relative_to(root)
 | 
			
		||||
                for path in root.rglob("*")
 | 
			
		||||
                if path.is_file()
 | 
			
		||||
            )
 | 
			
		||||
            return sorted(path.relative_to(root) for path in root.rglob("*") if path.is_file())
 | 
			
		||||
 | 
			
		||||
        sample_files = relative_files(sample_dataset)
 | 
			
		||||
        unpacked_files = relative_files(unpacked_processed)
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user