Merge branch 'main' into jakep/new_trainer

This commit is contained in:
Jake Poznanski 2025-06-27 16:21:58 +00:00
commit b96454b786
21 changed files with 517 additions and 622 deletions


@ -271,25 +271,26 @@ jobs:
outputs: type=registry
no-cache: true
- name: Setup Beaker CLI
uses: allenai/setup-beaker@v2
with:
token: ${{ secrets.BEAKER_TOKEN }}
version: latest
# jakep: push to beaker can't work because of limited disk space on these runners
# jakep: (you can try by setting load: true above, but you'll need a larger runner)
# - name: Setup Beaker CLI
# uses: allenai/setup-beaker@v2
# with:
# token: ${{ secrets.BEAKER_TOKEN }}
# version: latest
# - name: Debug Docker images
# run: docker images
- name: Push to Beaker
env:
BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}
run: |
# Get the version without 'v' prefix
VERSION=${GITHUB_REF#refs/tags/v}
# Push the Docker image to Beaker
beaker image create \
--name "olmocr-inference-$VERSION" \
--workspace ai2/olmocr \
"docker://${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:$VERSION"
# - name: Push to Beaker
# env:
# BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}
# run: |
# VERSION=${{ steps.meta.outputs.version }}
# beaker image create \
# --name "olmocr-inference-$VERSION" \
# --workspace ai2/olmocr \
# alleninstituteforai/olmocr:$VERSION
- name: Clean up after build
if: always()
run: |

.gitignore vendored

@ -21,6 +21,7 @@ olmOCR-bench/*
table_data*/
/synth*/
dolma_samples/*
old_train/
/*.html
scoreelo.csv
debug.log


@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## Unreleased
## [v0.1.76](https://github.com/allenai/olmocr/releases/tag/v0.1.76) - 2025-06-23
## [v0.1.75](https://github.com/allenai/olmocr/releases/tag/v0.1.75) - 2025-06-17
## [v0.1.74](https://github.com/allenai/olmocr/releases/tag/v0.1.74) - 2025-06-17
## [v0.1.73](https://github.com/allenai/olmocr/releases/tag/v0.1.73) - 2025-06-17
## [v0.1.72](https://github.com/allenai/olmocr/releases/tag/v0.1.72) - 2025-06-17
## [v0.1.71](https://github.com/allenai/olmocr/releases/tag/v0.1.71) - 2025-05-30
## [v0.1.70](https://github.com/allenai/olmocr/releases/tag/v0.1.70) - 2025-05-23


@ -47,19 +47,19 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends \
unzip
ENV PYTHONUNBUFFERED=1
WORKDIR /root
COPY pyproject.toml pyproject.toml
COPY olmocr/version.py olmocr/version.py
# keep the build context clean
WORKDIR /build
COPY . /build
# Needed to resolve setuptools dependencies
ENV UV_INDEX_STRATEGY="unsafe-best-match"
RUN uv pip install --system --no-cache -e ".[gpu]" --extra-index-url https://download.pytorch.org/whl/cu128
RUN uv pip install --system --no-cache ".[gpu]" --extra-index-url https://download.pytorch.org/whl/cu128
RUN uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl
RUN uv pip install --system --no-cache ".[bench]"
RUN playwright install-deps
RUN playwright install chromium
COPY olmocr olmocr
COPY scripts scripts
RUN python3 -m olmocr.pipeline --help


@ -35,6 +35,7 @@ Features:
- (Based on a 7B parameter VLM, so it requires a GPU)
### News
- June 17, 2025 - v0.1.75 - Switched from sglang to a vllm-based inference pipeline; updated the Docker image to CUDA 12.8.
- May 23, 2025 - v0.1.70 - Official docker support and images are now available! [See Docker usage](#using-docker)
- May 19, 2025 - v0.1.68 - [olmOCR-Bench](https://github.com/allenai/olmocr/tree/main/olmocr/bench) launch, scoring 77.4. Launch includes 2 point performance boost in olmOCR pipeline due to bug fixes with prompts.
- Mar 17, 2025 - v0.1.60 - Performance improvements due to better temperature selection in sampling.
@ -49,29 +50,29 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro
<thead>
<tr>
<th align="left"><strong>Model</strong></th>
<th align="center">AR</th>
<th align="center">OSM</th>
<th align="center">TA</th>
<th align="center">OS</th>
<th align="center">HF</th>
<th align="center">MC</th>
<th align="center">LTT</th>
<th align="center">ArXiv</th>
<th align="center">Old Scans Math</th>
<th align="center">Tables</th>
<th align="center">Old Scans</th>
<th align="center">Headers and Footers</th>
<th align="center">Multi column</th>
<th align="center">Long tiny text</th>
<th align="center">Base</th>
<th align="center">Overall Score</th>
<th align="center">Overall</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">Marker v1.6.2</td>
<td align="center">24.3</td>
<td align="center">22.1</td>
<td align="center">69.8</td>
<td align="center">24.3</td>
<td align="center">87.1</td>
<td align="center">71.0</td>
<td align="center">76.9</td>
<td align="center"><strong>99.5</strong></td>
<td align="center">59.4 ± 1.1</td>
<td align="left">Marker v1.7.5 (base)</td>
<td align="center">76.0</td>
<td align="center">57.9</td>
<td align="center">57.6</td>
<td align="center">27.8</td>
<td align="center">84.9</td>
<td align="center">72.9</td>
<td align="center">84.6</td>
<td align="center">99.1</td>
<td align="center">70.1 ± 1.1</td>
</tr>
<tr>
<td align="left">MinerU v1.3.10</td>
@ -94,24 +95,25 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro
<td align="center">93.6</td>
<td align="center">71.3</td>
<td align="center">77.1</td>
<td align="center">99.4</td>
<td align="center"><strong>99.4</strong></td>
<td align="center">72.0 ± 1.1</td>
</tr>
<tr>
<td align="left">olmOCR v0.1.68 (pipeline.py)</td>
<td align="center">75.6</td>
<td align="center">75.1</td>
<td align="center">70.2</td>
<td align="center"><strong>44.5</strong></td>
<td align="center">93.4</td>
<td align="center"><strong>79.4</strong></td>
<td align="center">81.7</td>
<td align="center">99.0</td>
<td align="center"><strong>77.4 ± 1.0</strong></td>
<td align="left">olmOCR v0.1.75 (Anchored)</td>
<td align="center">74.9</td>
<td align="center">71.2</td>
<td align="center">71.0</td>
<td align="center">42.2</td>
<td align="center">94.5</td>
<td align="center"><strong>78.3</strong></td>
<td align="center">73.3</td>
<td align="center">98.3</td>
<td align="center"><strong>75.5 ± 1.0</strong></td>
</tr>
</tbody>
</table>
### Installation
Requirements:
@ -136,7 +138,10 @@ conda activate olmocr
pip install olmocr[bench]
# For actually converting the files with your own GPU
pip install olmocr[gpu] --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
pip install olmocr[gpu] --extra-index-url https://download.pytorch.org/whl/cu128
# Recommended: Install flashinfer for faster inference on GPU
pip install https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl
```
### Local Usage Example


@ -14,6 +14,9 @@ olmOCR-bench operates on single page PDFs directly. We make this choice because
We have run the benchmark against some contemporary OCR pipelines, but it is straightforward
to run it against your own OCR tools: your tool just needs to produce Markdown or plain text output, as in the sketch below.
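A minimal runner sketch, following the signature used by the existing runners in `olmocr/bench/runners/` (the `run_mytool` name and the pypdf-based extraction are placeholders standing in for your actual OCR tool):

```python
# Hypothetical runner: pdf_path plus 1-indexed page_num in, Markdown/plain text out.
from pypdf import PdfReader


def run_mytool(pdf_path: str, page_num: int = 1) -> str:
    reader = PdfReader(pdf_path)
    page = reader.pages[page_num - 1]
    return page.extract_text() or ""
```

Registering the function in the `available_methods` table of `olmocr/bench/convert.py` (shown further down in this diff) makes it selectable from the converter.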
<div align="center">
<img src="https://github.com/allenai/olmocr/blob/main/scripts/pareto/ocr_pareto.png?raw=true" width=800/>
</div>
## Results
@ -37,7 +40,7 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
<td align="left">GOT OCR</td>
<td align="center">52.7</td>
<td align="center">52.0</td>
<td align="center">0.2</td>
<td align="center">0.20</td>
<td align="center">22.1</td>
<td align="center">93.6</td>
<td align="center">42.0</td>
@ -46,16 +49,16 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
<td align="center">48.3 ± 1.1</td>
</tr>
<tr>
<td align="left">Marker v1.6.2</td>
<td align="center">24.3</td>
<td align="center">22.1</td>
<td align="center">69.8</td>
<td align="center">24.3</td>
<td align="center">87.1</td>
<td align="center">71.0</td>
<td align="center">76.9</td>
<td align="center"><strong>99.5</strong></td>
<td align="center">59.4 ± 1.1</td>
<td align="left">Marker v1.7.5 (base, force_ocr)</td>
<td align="center">76.0</td>
<td align="center">57.9</td>
<td align="center">57.6</td>
<td align="center">27.8</td>
<td align="center">84.9</td>
<td align="center">72.9</td>
<td align="center">84.6</td>
<td align="center">99.1</td>
<td align="center">70.1 ± 1.1</td>
</tr>
<tr>
<td align="left">MinerU v1.3.10</td>
@ -78,9 +81,21 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
<td align="center">93.6</td>
<td align="center">71.3</td>
<td align="center">77.1</td>
<td align="center">99.4</td>
<td align="center"><strong>99.4</strong></td>
<td align="center">72.0 ± 1.1</td>
</tr>
<tr>
<td align="left">Nanonets OCR</td>
<td align="center">67.0</td>
<td align="center">68.6</td>
<td align="center"><strong>77.7</strong></td>
<td align="center">39.5</td>
<td align="center">40.7</td>
<td align="center">69.9</td>
<td align="center">53.4</td>
<td align="center">99.3</td>
<td align="center">64.5 ± 1.1</td>
</tr>
<tr>
<td align="left">GPT-4o (No Anchor)</td>
<td align="center">51.5</td>
@ -154,33 +169,39 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
<td align="center">65.5 ± 1.2</td>
</tr>
<tr>
<td align="left">olmOCR v0.1.68 (No Anchor)</td>
<td align="center">72.1</td>
<td align="center">74.7</td>
<td align="left">olmOCR v0.1.75 (No Anchor)</td>
<td align="center">71.5</td>
<td align="center">43.7</td>
<td align="center">91.6</td>
<td align="center">78.5</td>
<td align="center">80.5</td>
<td align="center">98.1</td>
<td align="center">76.3 ± 1.1</td>
<td align="center">71.4</td>
<td align="center">71.4</td>
<td align="center"><strong>42.8</strong></td>
<td align="center">94.1</td>
<td align="center">77.7</td>
<td align="center">71.0</td>
<td align="center">97.8</td>
<td align="center">74.7 ± 1.1</td>
</tr>
<tr>
<td align="left">olmOCR v0.1.68 (Anchored)</td>
<td align="center">75.6</td>
<td align="center">75.1</td>
<td align="center">70.2</td>
<td align="center"><strong>44.5</strong></td>
<td align="center">93.4</td>
<td align="center"><strong>79.4</strong></td>
<td align="center">81.7</td>
<td align="center">99.0</td>
<td align="center"><strong>77.4 ± 1.0</strong></td>
<td align="left">olmOCR v0.1.75 (Anchored)</td>
<td align="center">74.9</td>
<td align="center">71.2</td>
<td align="center">71.0</td>
<td align="center">42.2</td>
<td align="center">94.5</td>
<td align="center"><strong>78.3</strong></td>
<td align="center">73.3</td>
<td align="center">98.3</td>
<td align="center"><strong>75.5 ± 1.0</strong></td>
</tr>
</tbody>
</table>
<sup><sub>There was a small drop in scores from olmOCR v0.1.68 (77.4), due to two factors. First, we adjusted our benchmark code so that it no longer applies any "fallback" mechanism when measuring benchmark scores (the fallback still exists when you run olmocr.pipeline). Second, scores dropped slightly when we updated from sglang 0.4.2 to vllm 0.9.1. On net, we think the upgrade to vllm is the right choice: sglang 0.4.6 scored about one point lower still, and vllm brings a small performance boost and great support for quantization.
</sub></sup>
## Sourcing Documents and Tests
We define 7 distinct document types that olmOCR (or its earlier iterations) often struggled to process, and we designed a custom acquisition strategy for each (described below). We removed documents that both contained PII and were not meant for public dissemination. We also decontaminate against documents that appear in olmOCR-Mix via URL-level deduplication. To scale creation of test cases over these documents, we combined manual design and review with prompting GPT-4o.
@ -288,6 +309,3 @@ We have an internal data annotation tool that can be used to review the question
```bash
python -m olmocr.bench.review_app --port 5000 --debug ./olmOCR-bench/bench_data/multi_column.jsonl --force
```


@ -223,6 +223,7 @@ if __name__ == "__main__":
available_methods = {
"olmocr_pipeline": ("olmocr.bench.runners.run_olmocr_pipeline", "run_olmocr_pipeline"),
"gotocr": ("olmocr.bench.runners.run_gotocr", "run_gotocr"),
"nanonetsocr": ("olmocr.bench.runners.run_nanonetsocr", "run_nanonetsocr"),
"marker": ("olmocr.bench.runners.run_marker", "run_marker"),
"mineru": ("olmocr.bench.runners.run_mineru", "run_mineru"),
"chatgpt": ("olmocr.bench.runners.run_chatgpt", "run_chatgpt"),


@ -1,6 +1,7 @@
import os
import tempfile
from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
@ -15,10 +16,22 @@ def run_marker(pdf_path: str, page_num: int = 1) -> str:
if _marker_converter is None:
# Create a configuration dictionary with the necessary settings
config = {
"texify_inline_spans": True, # This enables conversion of inline math to LaTeX
"force_ocr": True, # Force OCR on every page instead of relying on any embedded text
"use_llm": False, # We would prefer to run just plain marker for reporting bench results, not hybrid mode
"disable_tqdm": True, # Disable tqdm for cleaner output
"recognition_batch_size": 256,
"layout_batch_size": 48,
"detection_batch_size": 48,
"equation_batch_size": 64,
"table_rec_batch_size": 48,
"ocr_error_batch_size": 64,
}
config_parser = ConfigParser(config)
_marker_converter = PdfConverter(artifact_dict=create_model_dict(), config=config)
_marker_converter = PdfConverter(
artifact_dict=create_model_dict(),
config=config_parser.generate_config_dict(),
)
# Extract the specific page from the PDF
pdf_to_process = pdf_path
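For reference, a minimal sketch of calling this runner directly (the PDF path is a placeholder; marker-pdf and its model weights must already be installed):

```python
# Hypothetical one-off invocation of the benchmark runner defined above.
from olmocr.bench.runners.run_marker import run_marker

markdown = run_marker("sample.pdf", page_num=1)  # "sample.pdf" is a placeholder path
print(markdown[:500])
```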


@ -0,0 +1,89 @@
import base64
import os
import re
import tempfile
import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor, AutoTokenizer
from olmocr.data.renderpdf import render_pdf_to_base64png
_model = None
_tokenizer = None
_processor = None
_device = None
def load_model(model_path: str = "nanonets/Nanonets-OCR-s"):
global _model, _tokenizer, _processor, _device
if _model is None:
_device = "cuda" if torch.cuda.is_available() else "cpu"
_model = AutoModelForImageTextToText.from_pretrained(
model_path,
torch_dtype="auto",
device_map="auto",
# attn_implementation="flash_attention_2"
)
_model.eval()
_tokenizer = AutoTokenizer.from_pretrained(model_path)
_processor = AutoProcessor.from_pretrained(model_path)
return _model, _tokenizer, _processor
async def run_nanonetsocr(pdf_path: str, page_num: int = 1, model_path: str = "nanonets/Nanonets-OCR-s", max_new_tokens: int = 4096, **kwargs) -> str:
"""
Convert a page of a PDF file to markdown using Nanonets-OCR.
This function renders the requested page of the PDF to an image, runs OCR on that image,
and returns the OCR result as a markdown-formatted string.
Args:
pdf_path (str): The local path to the PDF file.
page_num (int): The 1-indexed page number to convert.
model_path (str): Hugging Face model id or local path of the Nanonets OCR model.
max_new_tokens (int): Maximum number of new tokens to generate.
Returns:
str: The OCR result in markdown format.
"""
model, tokenizer, processor = load_model(model_path)
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=1024)
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
image_data = base64.b64decode(image_base64)
temp_file.write(image_data)
temp_image_path = temp_file.name
try:
image = Image.open(temp_image_path)
prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": [
{"type": "image", "image": f"file://{temp_image_path}"},
{"type": "text", "text": prompt},
],
},
]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt", use_fast=True)
inputs = inputs.to(model.device)
with torch.no_grad():
output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
cleaned_text = re.sub(r"<page_number>\d+</page_number>", "", output_text[0])
return cleaned_text
finally:
try:
os.unlink(temp_image_path)
except Exception as e:
print(f"Warning: Failed to remove temporary file {temp_image_path}: {e}")
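A minimal sketch of driving this async runner standalone (the PDF path is a placeholder; the Nanonets-OCR-s weights are fetched from Hugging Face on first use):

```python
# Hypothetical driver for run_nanonetsocr defined above.
import asyncio

from olmocr.bench.runners.run_nanonetsocr import run_nanonetsocr

markdown = asyncio.run(run_nanonetsocr("sample.pdf", page_num=1))  # placeholder path
print(markdown)
```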


@ -123,6 +123,8 @@ def normalize_text(md_content: str) -> str:
# Remove markdown bold formatting (** or __ for bold)
md_content = re.sub(r"\*\*(.*?)\*\*", r"\1", md_content)
md_content = re.sub(r"__(.*?)__", r"\1", md_content)
md_content = re.sub(r"</?b>", "", md_content) # Remove <b> tags if they exist
md_content = re.sub(r"</?i>", "", md_content) # Remove <i> tags if they exist
# Remove markdown italics formatting (* or _ for italics)
md_content = re.sub(r"\*(.*?)\*", r"\1", md_content)
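For illustration, the effect of the two added substitutions on a made-up string:

```python
import re

md = "A <b>bold</b> word and an <i>italic</i> one."
md = re.sub(r"</?b>", "", md)  # strips <b> and </b>
md = re.sub(r"</?i>", "", md)  # strips <i> and </i>
print(md)  # -> A bold word and an italic one.
```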


@ -1,86 +0,0 @@
import json
from concurrent.futures import ProcessPoolExecutor, as_completed
import boto3
from tqdm import tqdm
# Configuration
BUCKET = "ai2-llm"
PREFIX = "pretraining-data/sources/soldni-open-access-books/v0/pipeline/results"
OUTPUT_FILENAME = "all_completed_files.txt"
def process_file(key: str):
"""
Process a single S3 file given by its key.
Reads a jsonl file from S3, decodes each line,
extracts the 'Source-File' from the 'metadata' field,
and returns a list of these source file strings.
"""
# Create a new S3 client in the worker thread (thread-safe)
s3 = boto3.client("s3")
extracted_lines = []
try:
response = s3.get_object(Bucket=BUCKET, Key=key)
for raw_line in response["Body"].iter_lines():
try:
# Decode the line from bytes to text
line_str = raw_line.decode("utf-8")
except UnicodeDecodeError as e:
print(f"Skipping a line in {key} due to decode error: {e}")
continue
try:
data = json.loads(line_str)
except json.JSONDecodeError as e:
print(f"Skipping a malformed json line in {key}: {e}")
continue
# Extract 'Source-File' from metadata if present
metadata = data.get("metadata", {})
source_file = metadata.get("Source-File")
if source_file:
extracted_lines.append(source_file)
except Exception as e:
print(f"Error processing file {key}: {e}")
return extracted_lines
def main():
s3 = boto3.client("s3")
paginator = s3.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(Bucket=BUCKET, Prefix=PREFIX)
# Gather all S3 object keys under the specified prefix
keys = []
for page in page_iterator:
if "Contents" not in page:
continue
for obj in page["Contents"]:
keys.append(obj["Key"])
print(f"Found {len(keys)} files to process.")
# Open the output file for writing
with open(OUTPUT_FILENAME, "w", encoding="utf-8") as output_file:
# Create a thread pool to process files concurrently.
# Adjust max_workers based on your environment and workload.
with ProcessPoolExecutor() as executor:
# Submit all processing jobs and map each future to its key
future_to_key = {executor.submit(process_file, key): key for key in keys}
# Use tqdm to wrap the as_completed iterator for progress display
for future in tqdm(as_completed(future_to_key), total=len(future_to_key), desc="Processing files"):
try:
source_files = future.result()
# Write each extracted line to the output file as soon as the future completes
for source in source_files:
output_file.write(source + "\n")
# Optionally flush after each completed task
output_file.flush()
except Exception as e:
key = future_to_key[future]
print(f"Exception occurred for file {key}: {e}")
print(f"Finished writing the source file names to {OUTPUT_FILENAME}")
if __name__ == "__main__":
main()


@ -32,13 +32,12 @@ from tqdm import tqdm
from olmocr.check import (
check_poppler_version,
check_sglang_version,
check_torch_gpu_available,
)
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.filter.filter import Language, PdfFilter
from olmocr.image_utils import convert_image_to_pdf_bytes, is_jpeg, is_png
from olmocr.metrics import MetricsKeeper, WorkerTracker, cpu_vs_wall
from olmocr.metrics import MetricsKeeper, WorkerTracker
from olmocr.prompts import PageResponse, build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
from olmocr.s3_utils import (
@ -331,7 +330,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
async def process_pdf(args, worker_id: int, pdf_orig_path: str):
with tempfile.NamedTemporaryFile("wb+", suffix=".pdf") as tf:
with tempfile.NamedTemporaryFile("wb+", suffix=".pdf", delete=False) as tf:
try:
data = await asyncio.to_thread(lambda: get_s3_bytes_with_backoff(pdf_s3, pdf_orig_path))
tf.write(data)
@ -349,6 +348,7 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str):
tf.write(convert_image_to_pdf_bytes(tf.name))
tf.flush()
try:
try:
reader = PdfReader(tf.name)
num_pages = reader.get_num_pages()
@ -400,6 +400,9 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str):
# You can't build a dolma doc with even 1 failed page, so just get out of here
# However, you don't want to propagate an exception higher up and cancel the entire work_group
return None
finally:
if os.path.exists(tf.name):
os.unlink(tf.name)
def build_dolma_document(pdf_orig_path, page_results):
@ -705,19 +708,31 @@ async def vllm_server_ready():
raise Exception("vllm server did not become ready after waiting.")
async def download_model(model_name_or_path: str):
if model_name_or_path.startswith("s3://") or model_name_or_path.startswith("gs://") or model_name_or_path.startswith("weka://"):
logger.info(f"Downloading model directory from '{model_name_or_path}'")
model_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "olmocr", "model")
download_directory([model_name_or_path], model_cache_dir)
return model_cache_dir
elif os.path.isabs(model_name_or_path) and os.path.isdir(model_name_or_path):
logger.info(f"Using local model path at '{model_name_or_path}'")
return model_name_or_path
else:
logger.info(f"Downloading model with hugging face '{model_name_or_path}'")
snapshot_download(repo_id=model_name_or_path)
return model_name_or_path
async def download_model(model_name_or_path: str, max_retries: int = 5):
for retry in range(max_retries):
try:
if model_name_or_path.startswith("s3://") or model_name_or_path.startswith("gs://") or model_name_or_path.startswith("weka://"):
logger.info(f"Downloading model directory from '{model_name_or_path}'")
model_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "olmocr", "model")
# Delete existing model cache directory if it exists
if os.path.exists(model_cache_dir):
shutil.rmtree(model_cache_dir)
download_directory([model_name_or_path], model_cache_dir)
return model_cache_dir
elif os.path.isabs(model_name_or_path) and os.path.isdir(model_name_or_path):
logger.info(f"Using local model path at '{model_name_or_path}'")
return model_name_or_path
else:
logger.info(f"Downloading model with hugging face '{model_name_or_path}'")
snapshot_download(repo_id=model_name_or_path)
return model_name_or_path
except Exception:
if retry == max_retries - 1:
raise # Raise on final attempt and fail the job
sleep_time = random.randrange(2, 20) * 2**retry
logger.exception(f"Could not download model, sleeping for {sleep_time} seconds to retry ({retry + 1}/{max_retries})")
await asyncio.sleep(sleep_time)
async def metrics_reporter(work_queue):
@ -906,6 +921,7 @@ def print_stats(args, root_work_queue):
logger.warning(f"Error processing {s3_path}: {e}")
return 0, 0, 0, 0, 0, set(), 0, 0
print(f"\nCompleted work items {completed_items:,} out of {total_items:,}: {completed_items/total_items*100:.2f}%")
print("\nProcessing output files...")
docs_total = 0
input_tokens_total = 0
@ -1033,8 +1049,8 @@ async def main():
# Wait a little bit so that not all beaker jobs in a task start at the same time and download the model at the same time
replica_count = int(os.environ.get("BEAKER_REPLICA_COUNT", "1"))
interval = 10 if (replica_count - 1) * 10 <= 240 else 240 / max(1, replica_count - 1)
sleep_time = int(int(os.environ.get("BEAKER_REPLICA_RANK", "0")) * interval)
interval = 10 if (replica_count - 1) * 10 <= 30 else 30 / max(1, replica_count - 1)
sleep_time = int(os.environ.get("BEAKER_REPLICA_RANK", "0")) * interval
logger.info(f"Beaker job sleeping for {sleep_time} seconds to stagger model downloads")
await asyncio.sleep(sleep_time)
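For reference, an illustrative sketch of what the tighter cap does to the stagger schedule (replica counts here are made up):

```python
# Reproduce the stagger computation above for a few hypothetical replica counts.
for replica_count in (2, 4, 16):
    interval = 10 if (replica_count - 1) * 10 <= 30 else 30 / max(1, replica_count - 1)
    print(replica_count, [round(rank * interval, 1) for rank in range(replica_count)])
# With 16 replicas the last rank now waits ~30 s instead of 150 s under the previous 240 s cap.
```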
@ -1155,7 +1171,6 @@ async def main():
await vllm_server_ready()
metrics_task = asyncio.create_task(metrics_reporter(work_queue))
cpu_monitor_task = asyncio.create_task(cpu_vs_wall(10))
# Create worker tasks to process the queue concurrently.
worker_tasks = []
@ -1171,7 +1186,6 @@ async def main():
vllm_server.cancel()
metrics_task.cancel()
cpu_monitor_task.cancel()
# Output final metrics summary
metrics_summary = metrics.get_metrics_summary()
@ -1201,14 +1215,14 @@ async def main():
logger.info(f"Server Input tokens/sec rate: {rates['server_input_tokens_per_sec']:.2f}")
if "server_output_tokens_per_sec" in rates:
logger.info(f"Server Output tokens/sec rate: {rates['server_output_tokens_per_sec']:.2f}")
if "finished_input_tokens" in rates:
logger.info(f"Finished Input tokens/sec rate: {rates['finished_input_tokens']:.2f}")
if "finished_output_tokens" in rates:
logger.info(f"Finished Output tokens/sec rate: {rates['finished_output_tokens']:.2f}")
if "finished_input_tokens_per_sec" in rates:
logger.info(f"Finished Input tokens/sec rate: {rates['finished_input_tokens_per_sec']:.2f}")
if "finished_output_tokens_per_sec" in rates:
logger.info(f"Finished Output tokens/sec rate: {rates['finished_output_tokens_per_sec']:.2f}")
logger.info("=" * 80)
logger.info("Work done")
if __name__ == "__main__":
asyncio.run(main())
asyncio.run(main())


@ -0,0 +1,28 @@
# pip install llmcompressor
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from transformers import AutoTokenizer, Qwen2VLForConditionalGeneration
MODEL_ID = "/home/ubuntu/olmocr/olmOCR-7B-0225-preview"
model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Configure the simple PTQ quantization
# recipe = QuantizationModifier(
# targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
# Configure pre-defined qwen2vl recipe
recipe = QuantizationModifier(
targets="Linear",
scheme="FP8_DYNAMIC",
ignore=["re:.*lm_head", "re:visual.*"],
)
# Apply the quantization algorithm.
oneshot(model=model, recipe=recipe)
# Save the model.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic-Recipe"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
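A minimal smoke test for the saved checkpoint, assuming a recent vLLM build that can load compressed-tensors FP8 checkpoints (text-only prompt, only to confirm the weights load and generate):

```python
# Hypothetical check, separate from the script above.
from vllm import LLM, SamplingParams

llm = LLM(model="olmOCR-7B-0225-preview-FP8-Dynamic-Recipe")  # the SAVE_DIR produced above
out = llm.generate(["Hello"], SamplingParams(max_tokens=16))
print(out[0].outputs[0].text)
```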


@ -2,7 +2,7 @@ _MAJOR = "0"
_MINOR = "1"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "71"
_PATCH = "76"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""


@ -1,12 +0,0 @@
#!/bin/bash
set -e
VERSION=$(python -c 'import olmocr.version; print(olmocr.version.VERSION)')
echo "$VERSION"
docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-inference -t olmocr-inference-$VERSION .
beaker image create --workspace ai2/oe-data-pdf --name olmocr-inference-$VERSION olmocr-inference-$VERSION
docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-tagging -t olmocr-tagging-$VERSION .
beaker image create --workspace ai2/oe-data-pdf --name olmocr-tagging-$VERSION olmocr-tagging-$VERSION

Binary file not shown.

Binary file not shown.



@ -64,12 +64,12 @@ data = {
"MinerU",
"Gemini Flash 2",
"Gemini Flash 2 (Batch)",
"Marker v1.6.2",
"Marker v1.7.5",
"Ours",
"Qwen 2 VL",
"Qwen 2.5 VL",
],
COST_COLUMN_NAME: [12480, 6240, 1000, 596, 499, 249, 235, 178, 178, 178], # Same cost as Ours
COST_COLUMN_NAME: [12480, 6240, 1000, 596, 499, 249, 1492, 178, 178, 178], # Same cost as Ours
PERF_COLUMN_NAME: [
69.9, # GPT-4o (Anchored)
69.9, # Same performance for batch
@ -77,8 +77,8 @@ data = {
61.5, # MinerU
63.8, # Gemini Flash 2 (Anchored)
63.8, # Same performance for batch
59.4, # marker v1.6.2
77.4, # Ours (performance is the same across hardware)
70.1, # marker v1.7.5 base
75.5, # Ours (performance is the same across hardware)
31.5, # Qwen2VL
65.5, # Qwen2.5VL
],
@ -94,7 +94,7 @@ model_categories = {
"MinerU": "Open Source Tool",
"Gemini Flash 2": "Commercial VLM",
"Gemini Flash 2 (Batch)": "Commercial VLM",
"Marker v1.6.2": "Open Source Tool",
"Marker v1.7.5": "Open Source Tool",
"Ours": "Ours",
"Qwen 2 VL": "Open VLM",
"Qwen 2.5 VL": "Open VLM",
@ -131,8 +131,8 @@ model_label_offsets = {
"Mistral OCR": [-20, 10],
"MinerU": [-15, -20],
"Gemini Flash 2": [-10, 10],
"Gemini Flash 2 (Batch)": [-50, -15],
"Marker v1.6.2": [-35, -20],
"Gemini Flash 2 (Batch)": [-50, -20],
"Marker v1.7.5": [-25, -20],
"Ours": [-20, 10],
"Qwen 2 VL": [-35, 10],
"Qwen 2.5 VL": [-35, 10],

scripts/run_marker_benchmark.sh Executable file

@ -0,0 +1,202 @@
#!/bin/bash
# Runs marker benchmark, measuring both olmOCR-bench performance and per document processing performance
# ./scripts/run_marker_benchmark.sh
# ./scripts/run_marker_benchmark.sh 1.7.5
set -e
# Parse command line arguments
MARKER_VERSION="${1:-1.7.5}"
echo "Using marker version: $MARKER_VERSION"
# Check for uncommitted changes
if ! git diff-index --quiet HEAD --; then
echo "Error: There are uncommitted changes in the repository."
echo "Please commit or stash your changes before running the benchmark."
echo ""
echo "Uncommitted changes:"
git status --short
exit 1
fi
# Use conda environment Python if available, otherwise use system Python
if [ -n "$CONDA_PREFIX" ]; then
PYTHON="$CONDA_PREFIX/bin/python"
echo "Using conda Python from: $CONDA_PREFIX"
else
PYTHON="python"
echo "Warning: No conda environment detected, using system Python"
fi
# Get version from version.py
VERSION=$($PYTHON -c 'import olmocr.version; print(olmocr.version.VERSION)')
echo "OlmOCR version: $VERSION"
# Get first 10 characters of git hash
GIT_HASH=$(git rev-parse HEAD | cut -c1-10)
echo "Git hash: $GIT_HASH"
# Get current git branch name
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
echo "Git branch: $GIT_BRANCH"
# Create full image tag
IMAGE_TAG="olmocr-benchmark-${VERSION}-${GIT_HASH}"
echo "Building Docker image with tag: $IMAGE_TAG"
# Build the Docker image
echo "Building Docker image..."
docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG .
# Get Beaker username
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
echo "Beaker user: $BEAKER_USER"
# Push image to beaker
echo "Trying to push image to Beaker..."
if ! beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG 2>/dev/null; then
echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image."
fi
# Create Python script to run beaker experiment
cat << 'EOF' > /tmp/run_benchmark_experiment.py
import sys
from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar
# Get image tag, beaker user, git branch, git hash, and marker version from command line
image_tag = sys.argv[1]
beaker_user = sys.argv[2]
git_branch = sys.argv[3]
git_hash = sys.argv[4]
marker_version = sys.argv[5]
# Initialize Beaker client
b = Beaker.from_env(default_workspace="ai2/olmocr")
# Check if AWS credentials secret exists
aws_creds_secret = f"{beaker_user}-AWS_CREDENTIALS_FILE"
try:
# Try to get the secret to see if it exists
b.secret.get(aws_creds_secret, workspace="ai2/olmocr")
has_aws_creds = True
print(f"Found AWS credentials secret: {aws_creds_secret}")
except Exception:
has_aws_creds = False
print(f"AWS credentials secret not found: {aws_creds_secret}")
# First experiment: Original benchmark job
commands = []
if has_aws_creds:
commands.extend([
"mkdir -p ~/.aws",
'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
])
commands.extend([
"git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
"cd olmOCR-bench && git lfs pull && cd ..",
f"pip install marker-pdf=={marker_version}",
"pip install --upgrade torchvision",
"python -m olmocr.bench.convert marker --dir ./olmOCR-bench/bench_data",
"python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data"
])
# Build task spec with optional env vars
task_spec_args = {
"name": "marker-benchmark",
"image": ImageSource(beaker=f"{beaker_user}/{image_tag}"),
"command": [
"bash", "-c",
" && ".join(commands)
],
"context": TaskContext(
priority=Priority.normal,
preemptible=True,
),
"resources": TaskResources(gpu_count=1),
"constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
"result": ResultSpec(path="/noop-results"),
}
# Add env vars if AWS credentials exist
if has_aws_creds:
task_spec_args["env_vars"] = [
EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret)
]
# Create first experiment spec
experiment_spec = ExperimentSpec(
description=f"Marker {marker_version} Benchmark Run - Branch: {git_branch}, Commit: {git_hash}",
budget="ai2/oe-data",
tasks=[TaskSpec(**task_spec_args)],
)
# Create the first experiment
experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr")
print(f"Created benchmark experiment: {experiment.id}")
print(f"View at: https://beaker.org/ex/{experiment.id}")
print("-------")
print("")
perf_commands = []
if has_aws_creds:
perf_commands.extend([
"mkdir -p ~/.aws",
'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
])
perf_commands.extend([
f"pip install marker-pdf=={marker_version}",
"pip install --upgrade torchvision",
"pip install awscli",
"aws s3 cp --recursive s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/ /root/olmOCR-mix-0225_benchmark_set/",
# Tried with workers 8, but it was taking a really huge amount of time
#"time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8"
"time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker"
])
# Build performance task spec
perf_task_spec_args = {
"name": "marker-performance",
"image": ImageSource(beaker=f"{beaker_user}/{image_tag}"),
"command": [
"bash", "-c",
" && ".join(perf_commands)
],
"context": TaskContext(
priority=Priority.normal,
preemptible=True,
),
"resources": TaskResources(gpu_count=1),
"constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
"result": ResultSpec(path="/noop-results"),
}
# Add env vars if AWS credentials exist
if has_aws_creds:
perf_task_spec_args["env_vars"] = [
EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret)
]
# Create performance experiment spec
perf_experiment_spec = ExperimentSpec(
description=f"Marker {marker_version} Performance Test - Branch: {git_branch}, Commit: {git_hash}",
budget="ai2/oe-data",
tasks=[TaskSpec(**perf_task_spec_args)],
)
# Create the performance experiment
perf_experiment = b.experiment.create(spec=perf_experiment_spec, workspace="ai2/olmocr")
print(f"Created performance experiment: {perf_experiment.id}")
print(f"View at: https://beaker.org/ex/{perf_experiment.id}")
EOF
# Run the Python script to create the experiments
echo "Creating Beaker experiments..."
$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH $MARKER_VERSION
# Clean up temporary file
rm /tmp/run_benchmark_experiment.py
echo "Benchmark experiments submitted successfully!"

scripts/sync_beaker_image.sh Executable file

@ -0,0 +1,9 @@
#!/bin/bash
set -e
VERSION=$(python -c 'import olmocr.version; print(olmocr.version.VERSION)')
echo "$VERSION"
docker pull alleninstituteforai/olmocr:v$VERSION
beaker image create --workspace ai2/oe-data-pdf --name olmocr-inference-$VERSION alleninstituteforai/olmocr:v$VERSION


@ -1,400 +0,0 @@
# The idea is that you have a Qwen2-VL-7B model located here:s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/"
# You need to load it in both hugging face transformers, and send page 1 of edgar.pdf to it from tests/gnarly_pdfs
# Compare that the temperature 0 sampled result is the same
import asyncio
import base64
import json
import math
import os
import unittest
from io import BytesIO
from pathlib import Path
from unittest.mock import AsyncMock, patch
import numpy as np
import pytest
import torch
import torch.nn.functional as F
from httpx import AsyncClient
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration
from olmocr.pipeline import (
SGLANG_SERVER_PORT,
build_page_query,
get_anchor_text,
render_pdf_to_base64png,
sglang_server_ready,
sglang_server_task,
)
from olmocr.prompts import PageResponse
MODEL_FINETUNED_PATH = (
"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/"
)
@pytest.mark.nonci
class TestSglangServer(unittest.IsolatedAsyncioTestCase):
async def asyncSetUp(self):
# Mock arguments
self.args = AsyncMock()
self.args.workspace = "/tmp/test_workspace"
self.args.model = [MODEL_FINETUNED_PATH]
self.args.model_chat_template = "qwen2-vl"
self.args.target_longest_image_dim = 1024
self.args.target_anchor_text_len = 6000
self.args.model_max_context = 8192
# Create a temporary workspace directory
os.makedirs(self.args.workspace, exist_ok=True)
# Set up a semaphore for server tasks
self.semaphore = asyncio.Semaphore(1)
self.maxDiff = None
# # Start the sglang server
# self.my_server_task = asyncio.create_task(sglang_server_task(self.args, self.semaphore))
# # Wait for the server to become ready
# await sglang_server_ready()
async def test_sglang_server_initialization_and_request(self):
# Mock data paths
self.test_pdf_path = Path(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ambiguous.pdf"))
# Send a single request to the sglang server for page 1
async with AsyncClient(timeout=600) as session:
query = await build_page_query(
str(self.test_pdf_path),
page=1,
target_longest_image_dim=self.args.target_longest_image_dim,
target_anchor_text_len=self.args.target_anchor_text_len,
)
COMPLETION_URL = f"http://localhost:{30000}/v1/chat/completions"
query["temperature"] = 0.0
query["logprobs"] = True
query["top_logprobs"] = 5
response = await session.post(COMPLETION_URL, json=query)
print(response.text)
# Check the server response
self.assertEqual(response.status_code, 200)
response_data = response.json()
self.assertIn("choices", response_data)
self.assertGreater(len(response_data["choices"]), 0)
model_response_json = json.loads(response_data["choices"][0]["message"]["content"])
page_response = PageResponse(**model_response_json)
print(page_response)
self.assertEqual(page_response.natural_text, EDGAR_TEXT)
async def asyncTearDown(self):
pass
# # Shut down the server
# self.my_server_task.cancel()
# with self.assertRaises(asyncio.CancelledError):
# await self.my_server_task
# # Cleanup temporary workspace
# if os.path.exists(self.args.workspace):
# for root, _, files in os.walk(self.args.workspace):
# for file in files:
# os.unlink(os.path.join(root, file))
# os.rmdir(self.args.workspace)
@pytest.mark.nonci
class TestHuggingFaceModel(unittest.IsolatedAsyncioTestCase):
async def asyncSetUp(self):
# Set up the Hugging Face model and tokenizer
model_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "olmocr", "model")
download_directory([MODEL_FINETUNED_PATH], model_cache_dir)
# Check the rope config and make sure it's got the proper key
with open(os.path.join(model_cache_dir, "config.json"), "r") as cfin:
config_data = json.load(cfin)
if "rope_type" in config_data["rope_scaling"]:
del config_data["rope_scaling"]["rope_type"]
config_data["rope_scaling"]["type"] = "mrope"
with open(os.path.join(model_cache_dir, "config.json"), "w") as cfout:
json.dump(config_data, cfout)
self.tokenizer = AutoTokenizer.from_pretrained(model_cache_dir, trust_remote_code=True)
self.image_token_id = self.tokenizer.encode("<|image_pad|>")[0]
self.model = Qwen2VLForConditionalGeneration.from_pretrained(model_cache_dir, torch_dtype=torch.bfloat16, trust_remote_code=True).eval()
self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.model.to(self.device)
# Path to the test PDF
self.test_pdf_path = Path(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ambiguous.pdf"))
self.maxDiff = None
async def test_hugging_face_generation(self):
query = await build_page_query(
str(self.test_pdf_path),
page=1,
target_longest_image_dim=1024,
target_anchor_text_len=6000,
)
messages = query["messages"]
# Apply chat template to get the text
text = self.processor.apply_chat_template(query["messages"], tokenize=False, add_generation_prompt=True)
image_url = query["messages"][0]["content"][1]["image_url"]["url"]
# Remove the "data:image/png;base64," prefix
base64_image = image_url.split(",")[1]
# Decode the base64 string into bytes
image_data = base64.b64decode(base64_image)
# Create a BytesIO object and load it into a PIL image
main_image = Image.open(BytesIO(image_data))
# Process inputs using processor
inputs = self.processor(
text=[text],
images=[main_image],
padding=True,
return_tensors="pt",
)
image_indices = [idx for idx, token in enumerate(inputs["input_ids"][0]) if token.item() == self.image_token_id]
print("IMAGE INDICES", image_indices)
print(f"image_grid_thw - {inputs['image_grid_thw'].shape} {inputs['image_grid_thw']}")
print(f"pixel_values - {inputs['pixel_values'].shape} {inputs['pixel_values'].detach().cpu().numpy()}")
np.save("/root/pixel_values.npy", inputs["pixel_values"].detach().cpu().numpy())
inputs = {key: value.to(self.device) for (key, value) in inputs.items()}
generated_tokens = []
max_steps = 50
top_logprobs_hf = []
for step in range(max_steps):
# Generate the output with temperature=0
generation_output = self.model.generate(
**inputs,
temperature=0.0,
max_new_tokens=1,
# max_length=8192,
num_return_sequences=1,
do_sample=False,
output_scores=True,
return_dict_in_generate=True,
)
# Extract the generated token's log probabilities
scores = generation_output.scores # Tuple of length 1
logits = scores[0] # Tensor of shape (batch_size, vocab_size)
log_probs = F.log_softmax(logits, dim=-1) # Apply log softmax to get log probabilities
# Get top 5 tokens and their log probabilities
topk_log_probs, topk_indices = torch.topk(log_probs[0], k=5)
topk_tokens = self.tokenizer.convert_ids_to_tokens(topk_indices.tolist())
top_logprobs_hf.append((topk_tokens, topk_log_probs.tolist()))
# Pick the top token
next_token_id = topk_indices[0].unsqueeze(0).unsqueeze(0) # Shape: (1, 1)
next_token_str = self.tokenizer.convert_ids_to_tokens([next_token_id.item()])[0]
generated_tokens.append(next_token_id.item())
# Append the next token to input_ids and update attention_mask
inputs["input_ids"] = torch.cat([inputs["input_ids"], next_token_id], dim=-1)
inputs["attention_mask"] = torch.cat([inputs["attention_mask"], torch.ones((1, 1), dtype=inputs["attention_mask"].dtype).to(self.device)], dim=-1)
print(self.tokenizer.decode(generated_tokens))
# Now take all the input ids and run them through sglang as a comparison
async with AsyncClient(timeout=600) as session:
query["temperature"] = 0.0
query["max_tokens"] = max_steps
query["logprobs"] = True
query["top_logprobs"] = 5
COMPLETION_URL = f"http://localhost:{30000}/v1/chat/completions"
response = await session.post(COMPLETION_URL, json=query)
response_data = response.json()
for step, lptok in enumerate(response_data["choices"][0]["logprobs"]["content"]):
print("\nTop 5 tokens and their log probabilities:")
(topk_tokens, topk_log_probs) = top_logprobs_hf[step]
for token, log_prob, lptokcur in zip(topk_tokens, topk_log_probs, lptok["top_logprobs"]):
print(
f"HF Token: {token} Log Prob: {log_prob:.2f} Prob {math.exp(log_prob)*100:.2f}% SGLANG Token {lptokcur['token']} Logprob {lptokcur['logprob']:.2f} Prob {math.exp(lptokcur['logprob'])*100:.2f}%"
)
async def asyncTearDown(self):
# Clean up the model and tokenizer
del self.model
del self.tokenizer
torch.cuda.empty_cache()
@pytest.mark.nonci
class RawSGLangTest(unittest.IsolatedAsyncioTestCase):
def setUp(self):
# Set up the Hugging Face model and tokenizer
model_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "olmocr", "model")
download_directory([MODEL_FINETUNED_PATH], model_cache_dir)
# Check the rope config and make sure it's got the proper key
with open(os.path.join(model_cache_dir, "config.json"), "r") as cfin:
config_data = json.load(cfin)
if "rope_type" in config_data["rope_scaling"]:
del config_data["rope_scaling"]["rope_type"]
config_data["rope_scaling"]["type"] = "mrope"
with open(os.path.join(model_cache_dir, "config.json"), "w") as cfout:
json.dump(config_data, cfout)
self.model_cache_dir = model_cache_dir
self.tokenizer = AutoTokenizer.from_pretrained(model_cache_dir, trust_remote_code=True)
self.image_token_id = self.tokenizer.encode("<|image_pad|>")[0]
self.model = Qwen2VLForConditionalGeneration.from_pretrained(model_cache_dir, torch_dtype=torch.bfloat16, trust_remote_code=True).eval()
self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.model.to(self.device)
# Path to the test PDF
self.test_pdf_path = Path(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ambiguous.pdf"))
self.maxDiff = None
async def test_vision_encoder(self):
query = await build_page_query(
str(self.test_pdf_path),
page=1,
target_longest_image_dim=1024,
target_anchor_text_len=6000,
)
messages = query["messages"]
# Apply chat template to get the text
text = self.processor.apply_chat_template(query["messages"], tokenize=False, add_generation_prompt=True)
image_url = query["messages"][0]["content"][1]["image_url"]["url"]
# Remove the "data:image/png;base64," prefix
base64_image = image_url.split(",")[1]
# Decode the base64 string into bytes
image_data = base64.b64decode(base64_image)
# Create a BytesIO object and load it into a PIL image
main_image = Image.open(BytesIO(image_data))
# Process inputs using processor
inputs = self.processor(
text=[text],
images=[main_image],
padding=True,
return_tensors="pt",
)
with torch.no_grad():
hf_output = self.model.visual(inputs["pixel_values"].to(self.device), grid_thw=inputs["image_grid_thw"].to(self.device))
print("HF", hf_output, hf_output.shape)
from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_executor.model_runner import ModelRunner
from sglang.srt.sampling.sampling_params import SamplingParams
from sglang.srt.server_args import PortArgs, ServerArgs
model_config = ModelConfig(self.model_cache_dir, model_override_args="{}")
server_args = ServerArgs(model_path=self.model_cache_dir)
# Initialize model runner
model_runner = ModelRunner(
model_config=model_config,
mem_fraction_static=0.8,
gpu_id=0,
tp_rank=0,
tp_size=1,
nccl_port=12435,
server_args=server_args,
)
print(model_runner)
with torch.no_grad():
sglang_output = model_runner.model.visual(inputs["pixel_values"].to(self.device), grid_thw=inputs["image_grid_thw"].to(self.device))
print("SGLANG", sglang_output, sglang_output.shape)
# Convert to float32 for numerical stability if needed
hf = hf_output.float()
sg = sglang_output.float()
# Basic shape and dtype comparison
print("\n=== Basic Properties ===")
print(f"Shapes match: {hf.shape == sg.shape}")
print(f"HF shape: {hf.shape}, SGLang shape: {sg.shape}")
print(f"HF dtype: {hf.dtype}, SGLang dtype: {sg.dtype}")
# Move tensors to CPU for numpy operations
hf_np = hf.cpu().numpy()
sg_np = sg.cpu().numpy()
# Statistical metrics
print("\n=== Statistical Metrics ===")
print(f"Mean absolute difference: {torch.mean(torch.abs(hf - sg)).item():.6f}")
print(f"Max absolute difference: {torch.max(torch.abs(hf - sg)).item():.6f}")
print(f"Mean squared error: {torch.mean((hf - sg) ** 2).item():.6f}")
print(f"Root mean squared error: {torch.sqrt(torch.mean((hf - sg) ** 2)).item():.6f}")
# Cosine similarity (across feature dimension)
cos_sim = F.cosine_similarity(hf, sg)
print(f"Mean cosine similarity: {torch.mean(cos_sim).item():.6f}")
print(f"Min cosine similarity: {torch.min(cos_sim).item():.6f}")
# Find largest absolute differences
print("\n=== Largest Absolute Differences ===")
diffs = torch.abs(hf - sg)
flat_diffs = diffs.flatten()
# Get indices of top 10 differences
top_k = 10
top_values, top_flat_indices = torch.topk(flat_diffs, top_k)
# Convert flat indices to multidimensional indices
top_indices = np.unravel_index(top_flat_indices.cpu().numpy(), diffs.shape)
print(f"\nTop {top_k} largest absolute differences:")
print("Index".ljust(30) + "Difference".ljust(15) + "HF Value".ljust(15) + "SGLang Value")
print("-" * 75)
for i in range(top_k):
# Get the index tuple for this difference
idx = tuple(dim[i] for dim in top_indices)
diff_val = top_values[i].item()
hf_val = hf[idx].item()
sg_val = sg[idx].item()
# Format the index tuple and values
idx_str = str(idx)
print(f"{idx_str:<30}{diff_val:<15.6f}{hf_val:<15.6f}{sg_val:.6f}")