Mirror of https://github.com/allenai/olmocr.git, synced 2025-11-02 19:13:53 +00:00
Packaging working better now
This commit is contained in:
parent
557bb9a5e9
commit
702c42f8e7
@@ -20,13 +20,13 @@ from typing import Optional
 import boto3
 import pypdf
+from lingua import Language
 from openai import OpenAI
 from pydantic import BaseModel
 from tqdm import tqdm
 
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.filter import PdfFilter
-from lingua import Language
 
 TARGET_IMAGE_DIM = 1024
 
 
@@ -20,6 +20,7 @@ from .katex.render import compare_rendered_equations, render_equation
+# Tell pytest these are not tests
 __test__ = False
 
 
 @dataclass
 class TableData:
     """Class to hold table data and metadata about headers."""
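A note on the `__test__ = False` idiom in this hunk: pytest skips collecting anything whose `__test__` attribute is false, so a module of helpers whose names happen to match the `Test*`/`test_*` discovery patterns stays out of the test run. A minimal sketch (the module contents below are hypothetical, not from the repo):

# helpers.py -- hypothetical module whose names collide with pytest discovery
__test__ = False  # pytest will not collect anything defined in this module


def test_render_fixture():  # despite the test_* name, this is just a helper
    return "<table></table>"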
@@ -1,7 +1,7 @@
 import argparse
 import json
-import tarfile
 import shutil
+import tarfile
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from os import PathLike
 from pathlib import Path
@@ -231,14 +231,32 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
         response_data = response
 
-        # Create folder structure using first 4 digits of id
-        # Make a folder structure, to prevent a huge amount of files in one folder, using the first 4 digits of the id, ex. id[:4]/id[4:].md
-        folder_name = doc_id[:4]
-        file_name = f"{doc_id[4:]}.md"
-
-        # Create directory
-        output_dir = processed_dir / folder_name
-        output_dir.mkdir(exist_ok=True)
+        # Create folder structure
+        # For allenai/olmOCR-mix-0225: use first 4 characters as folder
+        # For other datasets: preserve the existing structure
+        if dataset_path == "allenai/olmOCR-mix-0225":
+            # Standard format: use first 4 characters as folder
+            folder_name = doc_id[:4]
+            file_name = f"{doc_id[4:]}.md"
+
+            # Create directory
+            output_dir = processed_dir / folder_name
+            output_dir.mkdir(exist_ok=True)
+        else:
+            # Custom format: preserve directory structure from doc_id
+            # The doc_id already contains the full path structure
+            if "/" in doc_id:
+                # doc_id contains path separators
+                path_parts = doc_id.rsplit("/", 1)
+                folder_path = Path(path_parts[0])
+                file_name = f"{path_parts[1]}.md"
+                output_dir = processed_dir / folder_path
+                output_dir.mkdir(parents=True, exist_ok=True)
+            else:
+                # No path separator, put at root
+                file_name = f"{doc_id}.md"
+                output_dir = processed_dir
 
         # Write markdown file with front matter and natural text
         output_file = output_dir / file_name
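To make the two layouts concrete, here is a standalone sketch of the id-to-path mapping the branch above implements (the helper name is mine, not from the repo):

from pathlib import Path


def doc_id_to_md_path(processed_dir: Path, dataset_path: str, doc_id: str) -> Path:
    """Mirror of the branching above: shard olmOCR-mix-0225 ids by their
    first 4 characters, otherwise keep the doc_id's own directory layout."""
    if dataset_path == "allenai/olmOCR-mix-0225":
        return processed_dir / doc_id[:4] / f"{doc_id[4:]}.md"
    if "/" in doc_id:
        folder, name = doc_id.rsplit("/", 1)
        return processed_dir / folder / f"{name}.md"
    return processed_dir / f"{doc_id}.md"


# "0123456789" -> processed/0123/456789.md for the standard mix,
# while "shard_a/doc7" -> processed/shard_a/doc7.md for a custom dataset.
assert doc_id_to_md_path(Path("processed"), "allenai/olmOCR-mix-0225", "0123456789") == Path("processed/0123/456789.md")
assert doc_id_to_md_path(Path("processed"), "my/dataset", "shard_a/doc7") == Path("processed/shard_a/doc7.md")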
@@ -268,7 +286,12 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
         matched_pdf_path = extracted_pdfs_dir / f"{doc_id}.pdf"
         assert matched_pdf_path.exists(), "Matching PDF not found"
 
-        symlink_path = output_dir / f"{doc_id[4:]}.pdf"
+        # Create symlink path based on dataset type
+        if dataset_path == "allenai/olmOCR-mix-0225":
+            symlink_path = output_dir / f"{doc_id[4:]}.pdf"
+        else:
+            # For custom datasets, use the same filename as the markdown
+            symlink_path = output_file.with_suffix(".pdf")
 
         # Create relative symlink to the PDF
         if not symlink_path.exists():
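The hunk cuts off before the symlink call itself; a minimal standard-library sketch of creating a relative symlink, as the comment describes (not code from the repo), is:

import os
from pathlib import Path


def symlink_relative(target: Path, link_path: Path) -> None:
    """Point link_path at target via a path relative to the link's folder,
    so the tree stays valid if the whole destination directory is moved."""
    rel_target = os.path.relpath(target, start=link_path.parent)
    link_path.symlink_to(rel_target)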
@@ -17,13 +17,14 @@ import tarfile
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, Iterator, List, Optional, Tuple
-from olmocr.prompts import PageResponse
-from olmocr.train.dataloader import FrontMatterParser
 
 import pandas as pd
 import yaml
 from tqdm import tqdm
 
+from olmocr.prompts import PageResponse
+from olmocr.train.dataloader import FrontMatterParser
+
 DEFAULT_MAX_TAR_BYTES = 1_073_741_824  # 1 GiB
 
 
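As a sanity check on the constant above:

assert 1_073_741_824 == 1024 ** 3 == 2 ** 30  # DEFAULT_MAX_TAR_BYTES is exactly 1 GiB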
@@ -46,19 +47,13 @@ class DocumentRecord:
     pdf_relpath: Optional[str] = None
 
-
-
 
 def infer_doc_id(md_path: Path, processed_root: Path) -> str:
     """Reconstruct the doc_id used in parquet/index space."""
     rel = md_path.relative_to(processed_root)
-    if len(rel.parts) < 2:
-        stem = rel.stem
-        prefix = rel.stem
-    else:
-        prefix = rel.parts[0]
-        stem = Path(rel.parts[-1]).stem
-    return f"{prefix}{stem}"
+    # Simply preserve the directory structure as the doc_id
+    # Convert path to doc_id by removing extension
+    return str(rel.with_suffix(""))
 
 
 def infer_pdf_path(md_path: Path, doc_id: str, pdf_root: Optional[Path]) -> Path:
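Note the simplification: the doc_id is now just the markdown file's path relative to the processed root, minus the extension, so it works for both the sharded and the custom layouts written above. A quick sketch (paths are illustrative):

from pathlib import Path

processed_root = Path("processed")
for md in [Path("processed/0123/456789.md"), Path("processed/shard_a/doc7.md")]:
    rel = md.relative_to(processed_root)
    doc_id = str(rel.with_suffix(""))
    print(doc_id)  # "0123/456789", then "shard_a/doc7" (POSIX separators)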
@@ -249,9 +244,7 @@ def write_pdf_tarballs(
             rec.chunk_name = tar_name
             inner_ref = f"{tar_name}:{rec.doc_id}.pdf"
             rec.pdf_relpath = f"{normalized_dir}/{inner_ref}" if normalized_dir else inner_ref
-            manifest_rows.append(
-                {"doc_id": rec.doc_id, "chunk": tar_name, "arcname": f"{rec.doc_id}.pdf", "pdf_relpath": rec.pdf_relpath}
-            )
+            manifest_rows.append({"doc_id": rec.doc_id, "chunk": tar_name, "arcname": f"{rec.doc_id}.pdf", "pdf_relpath": rec.pdf_relpath})
 
         actual_size = tar_path.stat().st_size
         if actual_size > max_bytes:
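For reference, the `pdf_relpath` written into the manifest has the shape `[dir/]tar_name:doc_id.pdf`; a consumer can split it back apart at the first colon. A sketch (the chunk name below is hypothetical, not from the repo):

def split_pdf_relpath(pdf_relpath: str) -> tuple[str, str]:
    """Split '[dir/]tar_name:doc_id.pdf' into the tar path and the member name."""
    tar_path, _, arcname = pdf_relpath.partition(":")
    return tar_path, arcname


assert split_pdf_relpath("pdfs/chunk_0001.tar:0123/456789.pdf") == ("pdfs/chunk_0001.tar", "0123/456789.pdf")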
@@ -1,8 +1,8 @@
-import torch
 import base64
 import urllib.request
 
 from io import BytesIO
 
+import torch
 from PIL import Image
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
@@ -22,17 +22,16 @@ if __name__ == "__main__":
     # Render page 1 to an image
     image_base64 = render_pdf_to_base64png("./paper.pdf", 1, target_longest_image_dim=1288)
 
     # Build the full prompt
     messages = [
         {
             "role": "user",
             "content": [
                 {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
             ],
         }
     ]
 
     # Apply the chat template and processor
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
@@ -46,21 +45,18 @@ if __name__ == "__main__":
     )
     inputs = {key: value.to(device) for (key, value) in inputs.items()}
 
     # Generate the output
     output = model.generate(
         **inputs,
         temperature=0.1,
         max_new_tokens=50,
         num_return_sequences=1,
         do_sample=True,
     )
 
     # Decode the output
     prompt_length = inputs["input_ids"].shape[1]
     new_tokens = output[:, prompt_length:]
-    text_output = processor.tokenizer.batch_decode(
-        new_tokens, skip_special_tokens=True
-    )
+    text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
 
     print(text_output)
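One detail worth calling out in the decode step: generate() returns the prompt tokens followed by the continuation, so slicing at input_ids.shape[1] keeps only the newly generated tokens. A toy illustration with plain tensors (token ids are made up):

import torch

# generate() outputs [prompt tokens | new tokens]; slice off the prompt.
prompt_ids = torch.tensor([[101, 7592, 102]])                # shape (1, prompt_len)
full_output = torch.tensor([[101, 7592, 102, 9999, 8888]])   # shape (1, prompt_len + new)

prompt_length = prompt_ids.shape[1]
new_tokens = full_output[:, prompt_length:]
assert new_tokens.tolist() == [[9999, 8888]]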
@@ -20,14 +20,19 @@ def test_repackage_and_prepare_olmocrmix():
 
     repackage_result = subprocess.run(
         [
-            "python", "olmocr/data/repackage_olmocrmix.py",
-            "--processed-dir", str(sample_dataset),
-            "--subset", "test_subset",
-            "--split", "test_split",
-            "--output-dir", str(packaged_dir)
+            "python",
+            "olmocr/data/repackage_olmocrmix.py",
+            "--processed-dir",
+            str(sample_dataset),
+            "--subset",
+            "test_subset",
+            "--split",
+            "test_split",
+            "--output-dir",
+            str(packaged_dir),
         ],
         capture_output=True,
-        text=True
+        text=True,
     )
 
     assert repackage_result.returncode == 0, f"Repackage script failed with stderr: {repackage_result.stderr}\nstdout: {repackage_result.stdout}"
@@ -36,20 +41,24 @@ def test_repackage_and_prepare_olmocrmix():
     parquet_file = packaged_dir / "test_subset_test_split.parquet"
     assert parquet_file.exists(), f"Expected parquet file not found: {parquet_file}"
 
+    # Step 2: Prepare the packaged dataset back into a processed tree
     unpackaged_dir = temp_path / "unpackaged"
 
     prepare_result = subprocess.run(
         [
-            "python", "olmocr/data/prepare_olmocrmix.py",
-            "--dataset-path", str(packaged_dir),
-            "--subset", "test_subset",
-            "--split", "test_split",
-            "--destination", str(unpackaged_dir)
+            "python",
+            "olmocr/data/prepare_olmocrmix.py",
+            "--dataset-path",
+            str(packaged_dir),
+            "--subset",
+            "test_subset",
+            "--split",
+            "test_split",
+            "--destination",
+            str(unpackaged_dir),
         ],
         capture_output=True,
-        text=True
+        text=True,
     )
 
     assert prepare_result.returncode == 0
@@ -62,11 +71,7 @@ def test_repackage_and_prepare_olmocrmix():
     assert unpacked_processed.exists(), f"Unpacked processed dir missing: {unpacked_processed}"
 
     def relative_files(root: Path):
-        return sorted(
-            path.relative_to(root)
-            for path in root.rglob("*")
-            if path.is_file()
-        )
+        return sorted(path.relative_to(root) for path in root.rglob("*") if path.is_file())
 
     sample_files = relative_files(sample_dataset)
     unpacked_files = relative_files(unpacked_processed)