Packaging working better now

Jake Poznanski 2025-10-09 22:12:02 +00:00
parent 557bb9a5e9
commit 702c42f8e7
6 changed files with 84 additions and 66 deletions

View File

@@ -20,13 +20,13 @@ from typing import Optional

 import boto3
 import pypdf
+from lingua import Language
 from openai import OpenAI
 from pydantic import BaseModel
 from tqdm import tqdm

 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.filter import PdfFilter
-from lingua import Language

 TARGET_IMAGE_DIM = 1024

View File

@@ -20,6 +20,7 @@ from .katex.render import compare_rendered_equations, render_equation

 # Tell pytest these are not tests
 __test__ = False

+
 @dataclass
 class TableData:
     """Class to hold table data and metadata about headers."""

View File

@ -1,7 +1,7 @@
import argparse import argparse
import json import json
import tarfile
import shutil import shutil
import tarfile
from concurrent.futures import ProcessPoolExecutor, as_completed from concurrent.futures import ProcessPoolExecutor, as_completed
from os import PathLike from os import PathLike
from pathlib import Path from pathlib import Path
@@ -231,14 +231,32 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
             response_data = response

-        # Create folder structure using first 4 digits of id
-        # Make a folder structure, to prevent a huge amount of files in one folder, using the first 4 digits of the id, ex. id[:4]/id[4:].md
-        folder_name = doc_id[:4]
-        file_name = f"{doc_id[4:]}.md"
-
-        # Create directory
-        output_dir = processed_dir / folder_name
-        output_dir.mkdir(exist_ok=True)
+        # Create folder structure
+        # For allenai/olmOCR-mix-0225: use first 4 characters as folder
+        # For other datasets: preserve the existing structure
+        if dataset_path == "allenai/olmOCR-mix-0225":
+            # Standard format: use first 4 characters as folder
+            folder_name = doc_id[:4]
+            file_name = f"{doc_id[4:]}.md"
+
+            # Create directory
+            output_dir = processed_dir / folder_name
+            output_dir.mkdir(exist_ok=True)
+        else:
+            # Custom format: preserve directory structure from doc_id
+            # The doc_id already contains the full path structure
+            if "/" in doc_id:
+                # doc_id contains path separators
+                path_parts = doc_id.rsplit("/", 1)
+                folder_path = Path(path_parts[0])
+                file_name = f"{path_parts[1]}.md"
+                output_dir = processed_dir / folder_path
+                output_dir.mkdir(parents=True, exist_ok=True)
+            else:
+                # No path separator, put at root
+                file_name = f"{doc_id}.md"
+                output_dir = processed_dir

         # Write markdown file with front matter and natural text
         output_file = output_dir / file_name
@@ -268,7 +286,12 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
             matched_pdf_path = extracted_pdfs_dir / f"{doc_id}.pdf"
             assert matched_pdf_path.exists(), "Matching PDF not found"

-            symlink_path = output_dir / f"{doc_id[4:]}.pdf"
+            # Create symlink path based on dataset type
+            if dataset_path == "allenai/olmOCR-mix-0225":
+                symlink_path = output_dir / f"{doc_id[4:]}.pdf"
+            else:
+                # For custom datasets, use the same filename as the markdown
+                symlink_path = output_file.with_suffix(".pdf")

             # Create relative symlink to the PDF
             if not symlink_path.exists():
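
Taken together, the two hunks above implement a single path-mapping rule. A minimal standalone sketch of that rule, for illustration only (the helper name resolve_output_paths is hypothetical and not part of this commit):

    from pathlib import Path

    def resolve_output_paths(dataset_path: str, doc_id: str, processed_dir: Path):
        # Mirrors the branching above: map a doc_id to (output_dir, file_name)
        if dataset_path == "allenai/olmOCR-mix-0225":
            # Standard mix: shard by the first 4 characters of the id
            return processed_dir / doc_id[:4], f"{doc_id[4:]}.md"
        if "/" in doc_id:
            # Custom dataset: the doc_id already encodes its directory structure
            folder, name = doc_id.rsplit("/", 1)
            return processed_dir / folder, f"{name}.md"
        # No separator: write at the root of the processed directory
        return processed_dir, f"{doc_id}.md"

    # ("allenai/olmOCR-mix-0225", "0123abcd") -> processed_dir / "0123", "abcd.md"
    # ("my/custom-set", "shard1/doc7")        -> processed_dir / "shard1", "doc7.md"

The PDF symlink then reuses the same stem in both branches, so each markdown file and its PDF land side by side.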

View File

@@ -17,13 +17,14 @@ import tarfile
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, Iterator, List, Optional, Tuple

-from olmocr.prompts import PageResponse
-from olmocr.train.dataloader import FrontMatterParser
 import pandas as pd
 import yaml
 from tqdm import tqdm

+from olmocr.prompts import PageResponse
+from olmocr.train.dataloader import FrontMatterParser
+
 DEFAULT_MAX_TAR_BYTES = 1_073_741_824  # 1 GiB
@@ -46,19 +47,13 @@ class DocumentRecord:
     pdf_relpath: Optional[str] = None


 def infer_doc_id(md_path: Path, processed_root: Path) -> str:
     """Reconstruct the doc_id used in parquet/index space."""
     rel = md_path.relative_to(processed_root)
-    if len(rel.parts) < 2:
-        stem = rel.stem
-        prefix = rel.stem
-    else:
-        prefix = rel.parts[0]
-        stem = Path(rel.parts[-1]).stem
-    return f"{prefix}{stem}"
+    # Simply preserve the directory structure as the doc_id
+    # Convert path to doc_id by removing extension
+    return str(rel.with_suffix(""))


 def infer_pdf_path(md_path: Path, doc_id: str, pdf_root: Optional[Path]) -> Path:
@@ -249,9 +244,7 @@ def write_pdf_tarballs(
             rec.chunk_name = tar_name
             inner_ref = f"{tar_name}:{rec.doc_id}.pdf"
             rec.pdf_relpath = f"{normalized_dir}/{inner_ref}" if normalized_dir else inner_ref
-            manifest_rows.append(
-                {"doc_id": rec.doc_id, "chunk": tar_name, "arcname": f"{rec.doc_id}.pdf", "pdf_relpath": rec.pdf_relpath}
-            )
+            manifest_rows.append({"doc_id": rec.doc_id, "chunk": tar_name, "arcname": f"{rec.doc_id}.pdf", "pdf_relpath": rec.pdf_relpath})

             actual_size = tar_path.stat().st_size
             if actual_size > max_bytes:
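
With the simplification above, infer_doc_id becomes the exact inverse of the layout that the prepare script now writes. A small illustrative check (the paths are made up for the example):

    from pathlib import Path

    def infer_doc_id(md_path: Path, processed_root: Path) -> str:
        # As in the hunk above: the relative path minus ".md" is the doc_id
        return str(md_path.relative_to(processed_root).with_suffix(""))

    root = Path("/tmp/processed")  # illustrative root
    assert infer_doc_id(root / "0024" / "abc.md", root) == "0024/abc"
    assert infer_doc_id(root / "shard1" / "sub" / "doc7.md", root) == "shard1/sub/doc7"
    assert infer_doc_id(root / "rootdoc.md", root) == "rootdoc"

Under the removed code, a root-level file produced a doubled id (prefix and stem were both rel.stem), and nested custom paths collapsed to the first part plus the stem, losing any intermediate directories.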

View File

@ -1,8 +1,8 @@
import torch
import base64 import base64
import urllib.request import urllib.request
from io import BytesIO from io import BytesIO
import torch
from PIL import Image from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
@@ -22,7 +22,6 @@ if __name__ == "__main__":
     # Render page 1 to an image
     image_base64 = render_pdf_to_base64png("./paper.pdf", 1, target_longest_image_dim=1288)

-
     # Build the full prompt
     messages = [
         {
@ -46,7 +45,6 @@ if __name__ == "__main__":
) )
inputs = {key: value.to(device) for (key, value) in inputs.items()} inputs = {key: value.to(device) for (key, value) in inputs.items()}
# Generate the output # Generate the output
output = model.generate( output = model.generate(
**inputs, **inputs,
@@ -59,8 +57,6 @@ if __name__ == "__main__":
     # Decode the output
     prompt_length = inputs["input_ids"].shape[1]
     new_tokens = output[:, prompt_length:]
-    text_output = processor.tokenizer.batch_decode(
-        new_tokens, skip_special_tokens=True
-    )
+    text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)

     print(text_output)

View File

@@ -20,14 +20,19 @@ def test_repackage_and_prepare_olmocrmix():
     repackage_result = subprocess.run(
         [
-            "python", "olmocr/data/repackage_olmocrmix.py",
-            "--processed-dir", str(sample_dataset),
-            "--subset", "test_subset",
-            "--split", "test_split",
-            "--output-dir", str(packaged_dir)
+            "python",
+            "olmocr/data/repackage_olmocrmix.py",
+            "--processed-dir",
+            str(sample_dataset),
+            "--subset",
+            "test_subset",
+            "--split",
+            "test_split",
+            "--output-dir",
+            str(packaged_dir),
         ],
         capture_output=True,
-        text=True
+        text=True,
     )

     assert repackage_result.returncode == 0, f"Repackage script failed with stderr: {repackage_result.stderr}\nstdout: {repackage_result.stdout}"
@@ -36,20 +41,24 @@ def test_repackage_and_prepare_olmocrmix():
     parquet_file = packaged_dir / "test_subset_test_split.parquet"
     assert parquet_file.exists(), f"Expected parquet file not found: {parquet_file}"

     # Step 2: Repackage the sample dataset into parquet + tarballs
     unpackaged_dir = temp_path / "unpackaged"
     prepare_result = subprocess.run(
         [
-            "python", "olmocr/data/prepare_olmocrmix.py",
-            "--dataset-path", str(packaged_dir),
-            "--subset", "test_subset",
-            "--split", "test_split",
-            "--destination", str(unpackaged_dir)
+            "python",
+            "olmocr/data/prepare_olmocrmix.py",
+            "--dataset-path",
+            str(packaged_dir),
+            "--subset",
+            "test_subset",
+            "--split",
+            "test_split",
+            "--destination",
+            str(unpackaged_dir),
         ],
         capture_output=True,
-        text=True
+        text=True,
     )

     assert prepare_result.returncode == 0
@@ -62,11 +71,7 @@ def test_repackage_and_prepare_olmocrmix():
     assert unpacked_processed.exists(), f"Unpacked processed dir missing: {unpacked_processed}"

     def relative_files(root: Path):
-        return sorted(
-            path.relative_to(root)
-            for path in root.rglob("*")
-            if path.is_file()
-        )
+        return sorted(path.relative_to(root) for path in root.rglob("*") if path.is_file())

     sample_files = relative_files(sample_dataset)
     unpacked_files = relative_files(unpacked_processed)
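
The two listings computed here set up the round-trip property this test exists to check: repackaging into parquet plus tarballs and then preparing back out should reproduce the original file tree. A sketch of how that comparison would typically conclude, under the assumption that the listings are compared directly (the test's remaining assertions fall outside this diff):

    # Hypothetical continuation; not part of this commit's hunks.
    assert sample_files == unpacked_files, (
        f"Round-trip mismatch; only in sample: {set(sample_files) - set(unpacked_files)}; "
        f"only in unpacked: {set(unpacked_files) - set(sample_files)}"
    )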