Packaging now works better

This commit is contained in:
Jake Poznanski 2025-10-09 22:12:02 +00:00
parent 557bb9a5e9
commit 702c42f8e7
6 changed files with 84 additions and 66 deletions

View File

@@ -20,13 +20,13 @@ from typing import Optional
import boto3
import pypdf
from lingua import Language
from openai import OpenAI
from pydantic import BaseModel
from tqdm import tqdm
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.filter import PdfFilter
from lingua import Language
TARGET_IMAGE_DIM = 1024
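For context, TARGET_IMAGE_DIM is the longest-side pixel budget handed to the renderer imported above. A minimal sketch of the typical call, reusing the signature that appears in the inference example later in this commit (the PDF path is a placeholder):

from olmocr.data.renderpdf import render_pdf_to_base64png

TARGET_IMAGE_DIM = 1024

# Render page 1 of a PDF to a base64-encoded PNG whose longest side
# is TARGET_IMAGE_DIM pixels.
image_base64 = render_pdf_to_base64png("./example.pdf", 1, target_longest_image_dim=TARGET_IMAGE_DIM)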

View File

@@ -20,6 +20,7 @@ from .katex.render import compare_rendered_equations, render_equation
# Tell pytest these are not tests
__test__ = False
@dataclass
class TableData:
"""Class to hold table data and metadata about headers."""

View File

@@ -1,7 +1,7 @@
import argparse
import json
import tarfile
import shutil
import tarfile
from concurrent.futures import ProcessPoolExecutor, as_completed
from os import PathLike
from pathlib import Path
@@ -231,14 +231,32 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
response_data = response
# Create folder structure using first 4 digits of id
# Make a folder structure to prevent a huge number of files in one folder, using the first 4 digits of the id, e.g. id[:4]/id[4:].md
folder_name = doc_id[:4]
file_name = f"{doc_id[4:]}.md"
# Create folder structure
# For allenai/olmOCR-mix-0225: use first 4 characters as folder
# For other datasets: preserve the existing structure
# Create directory
output_dir = processed_dir / folder_name
output_dir.mkdir(exist_ok=True)
if dataset_path == "allenai/olmOCR-mix-0225":
    # Standard format: use first 4 characters as folder
    folder_name = doc_id[:4]
    file_name = f"{doc_id[4:]}.md"
    # Create directory
    output_dir = processed_dir / folder_name
    output_dir.mkdir(exist_ok=True)
else:
    # Custom format: preserve directory structure from doc_id
    # The doc_id already contains the full path structure
    if "/" in doc_id:
        # doc_id contains path separators
        path_parts = doc_id.rsplit("/", 1)
        folder_path = Path(path_parts[0])
        file_name = f"{path_parts[1]}.md"
        output_dir = processed_dir / folder_path
        output_dir.mkdir(parents=True, exist_ok=True)
    else:
        # No path separator, put at root
        file_name = f"{doc_id}.md"
        output_dir = processed_dir
# Write markdown file with front matter and natural text
output_file = output_dir / file_name
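To make the branching concrete, a sketch of the mapping it implements, with illustrative doc_id values (the helper name is ours, not the script's):

from pathlib import Path

def target_md_path(processed_dir: Path, doc_id: str, standard_mix: bool) -> Path:
    # Mirrors the if/else above.
    if standard_mix:
        return processed_dir / doc_id[:4] / f"{doc_id[4:]}.md"
    if "/" in doc_id:
        head, tail = doc_id.rsplit("/", 1)
        return processed_dir / head / f"{tail}.md"
    return processed_dir / f"{doc_id}.md"

# target_md_path(Path("processed"), "01234567", True)       -> processed/0123/4567.md
# target_md_path(Path("processed"), "shard_a/doc42", False) -> processed/shard_a/doc42.md
# target_md_path(Path("processed"), "doc42", False)         -> processed/doc42.md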
@@ -268,7 +286,12 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
matched_pdf_path = extracted_pdfs_dir / f"{doc_id}.pdf"
assert matched_pdf_path.exists(), "Matching PDF not found"
symlink_path = output_dir / f"{doc_id[4:]}.pdf"
# Create symlink path based on dataset type
if dataset_path == "allenai/olmOCR-mix-0225":
    symlink_path = output_dir / f"{doc_id[4:]}.pdf"
else:
    # For custom datasets, use the same filename as the markdown
    symlink_path = output_file.with_suffix(".pdf")
# Create relative symlink to the PDF
if not symlink_path.exists():
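The hunk truncates just before the link is made; a minimal sketch of creating a relative symlink from the chosen path to the extracted PDF (an assumed approach, the file's actual code may differ):

import os

if not symlink_path.exists():
    # Compute a target relative to the symlink's own directory so the
    # link survives moving the whole tree.
    rel_target = os.path.relpath(matched_pdf_path, start=symlink_path.parent)
    symlink_path.symlink_to(rel_target)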

View File

@@ -17,13 +17,14 @@ import tarfile
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Tuple
from olmocr.prompts import PageResponse
from olmocr.train.dataloader import FrontMatterParser
import pandas as pd
import yaml
from tqdm import tqdm
from olmocr.prompts import PageResponse
from olmocr.train.dataloader import FrontMatterParser
DEFAULT_MAX_TAR_BYTES = 1_073_741_824 # 1 GiB
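DEFAULT_MAX_TAR_BYTES caps each PDF tarball at 1 GiB (1024**3 bytes). A minimal sketch of the size-based chunking this implies, with our own helper name and greedy logic (the module's actual packing code is not shown here):

def chunk_pdfs(pdf_paths, max_bytes=DEFAULT_MAX_TAR_BYTES):
    # Greedily group PDFs so each tarball stays under the size cap.
    # A single oversized file still gets its own chunk.
    chunks, current, current_size = [], [], 0
    for path in pdf_paths:
        size = path.stat().st_size
        if current and current_size + size > max_bytes:
            chunks.append(current)
            current, current_size = [], 0
        current.append(path)
        current_size += size
    if current:
        chunks.append(current)
    return chunks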
@@ -46,19 +47,13 @@ class DocumentRecord:
pdf_relpath: Optional[str] = None
def infer_doc_id(md_path: Path, processed_root: Path) -> str:
"""Reconstruct the doc_id used in parquet/index space."""
rel = md_path.relative_to(processed_root)
if len(rel.parts) < 2:
stem = rel.stem
prefix = rel.stem
else:
prefix = rel.parts[0]
stem = Path(rel.parts[-1]).stem
return f"{prefix}{stem}"
# Simply preserve the directory structure as the doc_id
# Convert path to doc_id by removing extension
return str(rel.with_suffix(""))
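A worked example of the behavioral change, with an illustrative path relative to the processed root:

from pathlib import Path

rel = Path("0123/4567.md")  # md_path.relative_to(processed_root)
# Old behavior: prefix + stem, dropping the separator -> "01234567"
old_doc_id = f"{rel.parts[0]}{Path(rel.parts[-1]).stem}"
# New behavior: keep the directory structure -> "0123/4567"
new_doc_id = str(rel.with_suffix(""))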
def infer_pdf_path(md_path: Path, doc_id: str, pdf_root: Optional[Path]) -> Path:
@@ -249,9 +244,7 @@ def write_pdf_tarballs(
rec.chunk_name = tar_name
inner_ref = f"{tar_name}:{rec.doc_id}.pdf"
rec.pdf_relpath = f"{normalized_dir}/{inner_ref}" if normalized_dir else inner_ref
manifest_rows.append(
    {"doc_id": rec.doc_id, "chunk": tar_name, "arcname": f"{rec.doc_id}.pdf", "pdf_relpath": rec.pdf_relpath}
)
manifest_rows.append({"doc_id": rec.doc_id, "chunk": tar_name, "arcname": f"{rec.doc_id}.pdf", "pdf_relpath": rec.pdf_relpath})
actual_size = tar_path.stat().st_size
if actual_size > max_bytes:
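The manifest's pdf_relpath packs three pieces into one string: an optional directory, the tarball name, and the member name, as <dir>/<tar_name>:<doc_id>.pdf. A minimal sketch of splitting such a reference back apart (the helper name and example value are ours):

def split_pdf_relpath(pdf_relpath: str) -> tuple[str, str]:
    # "pdfs/chunk_0000.tar:0123/4567.pdf" -> ("pdfs/chunk_0000.tar", "0123/4567.pdf")
    tar_ref, arcname = pdf_relpath.split(":", 1)
    return tar_ref, arcname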

View File

@@ -1,8 +1,8 @@
import torch
import base64
import urllib.request
from io import BytesIO
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
@@ -22,17 +22,16 @@ if __name__ == "__main__":
# Render page 1 to an image
image_base64 = render_pdf_to_base64png("./paper.pdf", 1, target_longest_image_dim=1288)
# Build the full prompt
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
        ],
    }
]
# Apply the chat template and processor
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
@@ -46,21 +45,18 @@
)
inputs = {key: value.to(device) for (key, value) in inputs.items()}
# Generate the output
output = model.generate(
    **inputs,
    temperature=0.1,
    max_new_tokens=50,
    num_return_sequences=1,
    do_sample=True,
)
# Decode the output
prompt_length = inputs["input_ids"].shape[1]
new_tokens = output[:, prompt_length:]
text_output = processor.tokenizer.batch_decode(
    new_tokens, skip_special_tokens=True
)
text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
print(text_output)
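The hunks above pick up after the processor and model already exist; a minimal sketch of that loading step, assuming a Qwen2.5-VL-compatible checkpoint (the model id is a placeholder, not necessarily the one this repo ships):

import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

model_id = "Qwen/Qwen2.5-VL-7B-Instruct"  # placeholder checkpoint id
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained(model_id)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)
model.eval()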

View File

@@ -20,14 +20,19 @@ def test_repackage_and_prepare_olmocrmix():
repackage_result = subprocess.run(
    [
        "python", "olmocr/data/repackage_olmocrmix.py",
        "--processed-dir", str(sample_dataset),
        "--subset", "test_subset",
        "--split", "test_split",
        "--output-dir", str(packaged_dir)
        "python",
        "olmocr/data/repackage_olmocrmix.py",
        "--processed-dir",
        str(sample_dataset),
        "--subset",
        "test_subset",
        "--split",
        "test_split",
        "--output-dir",
        str(packaged_dir),
    ],
    capture_output=True,
    text=True
    text=True,
)
assert repackage_result.returncode == 0, f"Repackage script failed with stderr: {repackage_result.stderr}\nstdout: {repackage_result.stdout}"
@@ -36,20 +41,24 @@ def test_repackage_and_prepare_olmocrmix():
parquet_file = packaged_dir / "test_subset_test_split.parquet"
assert parquet_file.exists(), f"Expected parquet file not found: {parquet_file}"
# Step 2: Prepare (unpack) the packaged parquet + tarballs back into a processed directory
unpackaged_dir = temp_path / "unpackaged"
prepare_result = subprocess.run(
    [
        "python", "olmocr/data/prepare_olmocrmix.py",
        "--dataset-path", str(packaged_dir),
        "--subset", "test_subset",
        "--split", "test_split",
        "--destination", str(unpackaged_dir)
        "python",
        "olmocr/data/prepare_olmocrmix.py",
        "--dataset-path",
        str(packaged_dir),
        "--subset",
        "test_subset",
        "--split",
        "test_split",
        "--destination",
        str(unpackaged_dir),
    ],
    capture_output=True,
    text=True
    text=True,
)
assert prepare_result.returncode == 0
@@ -62,11 +71,7 @@ def test_repackage_and_prepare_olmocrmix():
assert unpacked_processed.exists(), f"Unpacked processed dir missing: {unpacked_processed}"
def relative_files(root: Path):
    return sorted(
        path.relative_to(root)
        for path in root.rglob("*")
        if path.is_file()
    )
    return sorted(path.relative_to(root) for path in root.rglob("*") if path.is_file())
sample_files = relative_files(sample_dataset)
unpacked_files = relative_files(unpacked_processed)
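The listing comparison checks names only; a natural extension (not part of the shown test) would also hash file bytes to confirm the round trip preserved content:

import hashlib

def file_digest(path):
    # Hash contents so the check covers bytes, not just filenames.
    return hashlib.sha256(path.read_bytes()).hexdigest()

for rel in sample_files:
    assert file_digest(sample_dataset / rel) == file_digest(unpacked_processed / rel), f"Content mismatch: {rel}"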