Mirror of https://github.com/allenai/olmocr.git (synced 2025-11-09 15:09:40 +00:00)

Commit 702c42f8e7 ("Packaging working better now")
Parent: 557bb9a5e9
@@ -20,13 +20,13 @@ from typing import Optional
 
 import boto3
 import pypdf
+from lingua import Language
 from openai import OpenAI
 from pydantic import BaseModel
 from tqdm import tqdm
 
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.filter import PdfFilter
-from lingua import Language
 
 TARGET_IMAGE_DIM = 1024
 
@@ -20,6 +20,7 @@ from .katex.render import compare_rendered_equations, render_equation
 # Tell pytest these are not tests
 __test__ = False
 
+
 @dataclass
 class TableData:
     """Class to hold table data and metadata about headers."""
@@ -1,7 +1,7 @@
 import argparse
 import json
-import tarfile
 import shutil
+import tarfile
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from os import PathLike
 from pathlib import Path
@@ -231,14 +231,32 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
 
         response_data = response
 
-        # Create folder structure using first 4 digits of id
-        # Make a folder structure, to prevent a huge amount of files in one folder, using the first 4 digits of the id, ex. id[:4]/id[4:].md
+        # Create folder structure
+        # For allenai/olmOCR-mix-0225: use first 4 characters as folder
+        # For other datasets: preserve the existing structure
+
+        if dataset_path == "allenai/olmOCR-mix-0225":
+            # Standard format: use first 4 characters as folder
             folder_name = doc_id[:4]
             file_name = f"{doc_id[4:]}.md"
 
             # Create directory
             output_dir = processed_dir / folder_name
             output_dir.mkdir(exist_ok=True)
+        else:
+            # Custom format: preserve directory structure from doc_id
+            # The doc_id already contains the full path structure
+            if "/" in doc_id:
+                # doc_id contains path separators
+                path_parts = doc_id.rsplit("/", 1)
+                folder_path = Path(path_parts[0])
+                file_name = f"{path_parts[1]}.md"
+                output_dir = processed_dir / folder_path
+                output_dir.mkdir(parents=True, exist_ok=True)
+            else:
+                # No path separator, put at root
+                file_name = f"{doc_id}.md"
+                output_dir = processed_dir
 
         # Write markdown file with front matter and natural text
         output_file = output_dir / file_name
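For context, a minimal sketch (not part of the commit; the ids and the destination root are invented) of the two naming schemes the new branch distinguishes:

    from pathlib import Path

    processed_dir = Path("processed_test_split")  # hypothetical destination root

    # allenai/olmOCR-mix-0225 ids are flat: the first 4 characters become the folder
    doc_id = "01abcdef"
    print(processed_dir / doc_id[:4] / f"{doc_id[4:]}.md")  # processed_test_split/01ab/cdef.md

    # other datasets: the doc_id keeps whatever directory structure it already has
    doc_id = "shard_0/page_7"
    folder, name = doc_id.rsplit("/", 1)
    print(processed_dir / folder / f"{name}.md")  # processed_test_split/shard_0/page_7.md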
@@ -268,7 +286,12 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
         matched_pdf_path = extracted_pdfs_dir / f"{doc_id}.pdf"
         assert matched_pdf_path.exists(), "Matching PDF not found"
 
+        # Create symlink path based on dataset type
+        if dataset_path == "allenai/olmOCR-mix-0225":
             symlink_path = output_dir / f"{doc_id[4:]}.pdf"
+        else:
+            # For custom datasets, use the same filename as the markdown
+            symlink_path = output_file.with_suffix(".pdf")
 
         # Create relative symlink to the PDF
         if not symlink_path.exists():
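The custom-dataset branch leans on Path.with_suffix so the PDF symlink always lands next to its markdown file. A minimal illustration (path invented):

    from pathlib import Path

    output_file = Path("processed/shard_0/page_7.md")  # hypothetical
    print(output_file.with_suffix(".pdf"))             # processed/shard_0/page_7.pdf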
@@ -17,13 +17,14 @@ import tarfile
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, Iterator, List, Optional, Tuple
-from olmocr.prompts import PageResponse
-from olmocr.train.dataloader import FrontMatterParser
 
 import pandas as pd
 import yaml
 from tqdm import tqdm
 
+from olmocr.prompts import PageResponse
+from olmocr.train.dataloader import FrontMatterParser
+
 DEFAULT_MAX_TAR_BYTES = 1_073_741_824  # 1 GiB
 
 
@@ -46,19 +47,13 @@ class DocumentRecord:
     pdf_relpath: Optional[str] = None
 
 
-
-
 
 def infer_doc_id(md_path: Path, processed_root: Path) -> str:
     """Reconstruct the doc_id used in parquet/index space."""
     rel = md_path.relative_to(processed_root)
-    if len(rel.parts) < 2:
-        stem = rel.stem
-        prefix = rel.stem
-    else:
-        prefix = rel.parts[0]
-        stem = Path(rel.parts[-1]).stem
-    return f"{prefix}{stem}"
+    # Simply preserve the directory structure as the doc_id
+    # Convert path to doc_id by removing extension
+    return str(rel.with_suffix(""))
 
 
 def infer_pdf_path(md_path: Path, doc_id: str, pdf_root: Optional[Path]) -> Path:
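One behavioral consequence worth noting (my reading of the diff, not stated in it): for nested files the old code concatenated the top-level folder and the stem, while the rewrite keeps the path separator. With an invented layout:

    from pathlib import Path

    rel = Path("0123/4abcd.md")      # md path relative to the processed root
    # old: f"{rel.parts[0]}{Path(rel.parts[-1]).stem}" -> "01234abcd"
    # new: directory structure is preserved
    print(str(rel.with_suffix("")))  # "0123/4abcd"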
@@ -249,9 +244,7 @@ def write_pdf_tarballs(
             rec.chunk_name = tar_name
             inner_ref = f"{tar_name}:{rec.doc_id}.pdf"
             rec.pdf_relpath = f"{normalized_dir}/{inner_ref}" if normalized_dir else inner_ref
-            manifest_rows.append(
-                {"doc_id": rec.doc_id, "chunk": tar_name, "arcname": f"{rec.doc_id}.pdf", "pdf_relpath": rec.pdf_relpath}
-            )
+            manifest_rows.append({"doc_id": rec.doc_id, "chunk": tar_name, "arcname": f"{rec.doc_id}.pdf", "pdf_relpath": rec.pdf_relpath})
 
         actual_size = tar_path.stat().st_size
         if actual_size > max_bytes:
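For reference, the single-line append produces manifest rows shaped like the one below. The values are invented; only the tar_name:doc_id.pdf reference format comes from the code above:

    {"doc_id": "0123/4abcd", "chunk": "pdfs_0000.tar", "arcname": "0123/4abcd.pdf", "pdf_relpath": "tarballs/pdfs_0000.tar:0123/4abcd.pdf"}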
@@ -1,8 +1,8 @@
-import torch
 import base64
 import urllib.request
 
 from io import BytesIO
 
+import torch
 from PIL import Image
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
@@ -22,7 +22,6 @@ if __name__ == "__main__":
     # Render page 1 to an image
     image_base64 = render_pdf_to_base64png("./paper.pdf", 1, target_longest_image_dim=1288)
 
-
     # Build the full prompt
     messages = [
         {
@@ -46,7 +45,6 @@ if __name__ == "__main__":
     )
     inputs = {key: value.to(device) for (key, value) in inputs.items()}
 
-
     # Generate the output
     output = model.generate(
         **inputs,
@@ -59,8 +57,6 @@ if __name__ == "__main__":
     # Decode the output
     prompt_length = inputs["input_ids"].shape[1]
     new_tokens = output[:, prompt_length:]
-    text_output = processor.tokenizer.batch_decode(
-        new_tokens, skip_special_tokens=True
-    )
+    text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
 
     print(text_output)
@@ -20,14 +20,19 @@ def test_repackage_and_prepare_olmocrmix():
 
     repackage_result = subprocess.run(
         [
-            "python", "olmocr/data/repackage_olmocrmix.py",
-            "--processed-dir", str(sample_dataset),
-            "--subset", "test_subset",
-            "--split", "test_split",
-            "--output-dir", str(packaged_dir)
+            "python",
+            "olmocr/data/repackage_olmocrmix.py",
+            "--processed-dir",
+            str(sample_dataset),
+            "--subset",
+            "test_subset",
+            "--split",
+            "test_split",
+            "--output-dir",
+            str(packaged_dir),
         ],
         capture_output=True,
-        text=True
+        text=True,
     )
 
     assert repackage_result.returncode == 0, f"Repackage script failed with stderr: {repackage_result.stderr}\nstdout: {repackage_result.stdout}"
@@ -36,20 +41,24 @@ def test_repackage_and_prepare_olmocrmix():
     parquet_file = packaged_dir / "test_subset_test_split.parquet"
     assert parquet_file.exists(), f"Expected parquet file not found: {parquet_file}"
 
-
     # Step 2: Repackage the sample dataset into parquet + tarballs
     unpackaged_dir = temp_path / "unpackaged"
 
     prepare_result = subprocess.run(
         [
-            "python", "olmocr/data/prepare_olmocrmix.py",
-            "--dataset-path", str(packaged_dir),
-            "--subset", "test_subset",
-            "--split", "test_split",
-            "--destination", str(unpackaged_dir)
+            "python",
+            "olmocr/data/prepare_olmocrmix.py",
+            "--dataset-path",
+            str(packaged_dir),
+            "--subset",
+            "test_subset",
+            "--split",
+            "test_split",
+            "--destination",
+            str(unpackaged_dir),
         ],
         capture_output=True,
-        text=True
+        text=True,
     )
 
     assert prepare_result.returncode == 0
@@ -62,11 +71,7 @@ def test_repackage_and_prepare_olmocrmix():
     assert unpacked_processed.exists(), f"Unpacked processed dir missing: {unpacked_processed}"
 
     def relative_files(root: Path):
-        return sorted(
-            path.relative_to(root)
-            for path in root.rglob("*")
-            if path.is_file()
-        )
+        return sorted(path.relative_to(root) for path in root.rglob("*") if path.is_file())
 
     sample_files = relative_files(sample_dataset)
     unpacked_files = relative_files(unpacked_processed)