mirror of https://github.com/allenai/olmocr.git (synced 2025-10-17 11:12:33 +00:00)

Merge branch 'main' into jakep/new_trainer
This commit is contained in: commit b96454b786

.github/workflows/main.yml (vendored): 37 lines changed
@@ -271,25 +271,26 @@ jobs:
          outputs: type=registry
          no-cache: true

      - name: Setup Beaker CLI
        uses: allenai/setup-beaker@v2
        with:
          token: ${{ secrets.BEAKER_TOKEN }}
          version: latest

      # jakep: push to beaker can't work because of limited disk space on these runners
      # jakep: (you can try by setting load: true above, but you'll need a larger runner)
      # - name: Setup Beaker CLI
      #   uses: allenai/setup-beaker@v2
      #   with:
      #     token: ${{ secrets.BEAKER_TOKEN }}
      #     version: latest
      # - name: Debug Docker images
      #   run: docker images

      - name: Push to Beaker
        env:
          BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}
        run: |
          # Get the version without 'v' prefix
          VERSION=${GITHUB_REF#refs/tags/v}

          # Push the Docker image to Beaker
          beaker image create \
            --name "olmocr-inference-$VERSION" \
            --workspace ai2/olmocr \
            "docker://${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:$VERSION"

      # - name: Push to Beaker
      #   env:
      #     BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}
      #   run: |
      #     VERSION=${{ steps.meta.outputs.version }}
      #     beaker image create \
      #       --name "olmocr-inference-$VERSION" \
      #       --workspace ai2/olmocr \
      #       alleninstituteforai/olmocr:$VERSION

      - name: Clean up after build
        if: always()
        run: |
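For local debugging, the same push step can be driven outside of CI. A minimal sketch, assuming the image has already been pushed to a container registry; the registry path below is a placeholder, not the workflow's actual REGISTRY/IMAGE_NAME values:

```python
import subprocess

version = "0.1.76"  # hypothetical tag, use whatever release you actually built
registry_image = f"ghcr.io/<org>/<image>:{version}"  # placeholder registry path, not the workflow's real env values

# Mirrors the `beaker image create` call in the workflow step above.
subprocess.run(
    [
        "beaker", "image", "create",
        "--name", f"olmocr-inference-{version}",
        "--workspace", "ai2/olmocr",
        f"docker://{registry_image}",
    ],
    check=True,
)
```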
.gitignore (vendored): 1 line changed
@@ -21,6 +21,7 @@ olmOCR-bench/*
table_data*/
/synth*/
dolma_samples/*
old_train/
/*.html
scoreelo.csv
debug.log
CHANGELOG.md: 10 lines changed
@@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

## [v0.1.76](https://github.com/allenai/olmocr/releases/tag/v0.1.76) - 2025-06-23

## [v0.1.75](https://github.com/allenai/olmocr/releases/tag/v0.1.75) - 2025-06-17

## [v0.1.74](https://github.com/allenai/olmocr/releases/tag/v0.1.74) - 2025-06-17

## [v0.1.73](https://github.com/allenai/olmocr/releases/tag/v0.1.73) - 2025-06-17

## [v0.1.72](https://github.com/allenai/olmocr/releases/tag/v0.1.72) - 2025-06-17

## [v0.1.71](https://github.com/allenai/olmocr/releases/tag/v0.1.71) - 2025-05-30

## [v0.1.70](https://github.com/allenai/olmocr/releases/tag/v0.1.70) - 2025-05-23
Dockerfile: 14 lines changed
@@ -47,19 +47,19 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends \
    unzip

ENV PYTHONUNBUFFERED=1
WORKDIR /root
COPY pyproject.toml pyproject.toml
COPY olmocr/version.py olmocr/version.py

# keep the build context clean
WORKDIR /build
COPY . /build

# Needed to resolve setuptools dependencies
ENV UV_INDEX_STRATEGY="unsafe-best-match"
RUN uv pip install --system --no-cache -e ".[gpu]" --extra-index-url https://download.pytorch.org/whl/cu128
RUN uv pip install --system --no-cache ".[gpu]" --extra-index-url https://download.pytorch.org/whl/cu128
RUN uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl
RUN uv pip install --system --no-cache ".[bench]"

RUN playwright install-deps
RUN playwright install chromium

COPY olmocr olmocr
COPY scripts scripts

RUN python3 -m olmocr.pipeline --help
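The build ends with a smoke test of the pipeline entry point. As a hedged sketch, the same check can be repeated against a published image; the tag below is an assumption, chosen to match the alleninstituteforai/olmocr:v$VERSION pattern used by the scripts later in this diff:

```python
import subprocess

image = "alleninstituteforai/olmocr:v0.1.76"  # assumed tag, match it to the release you want

# Run the same smoke test the Dockerfile performs, inside a container with GPUs attached.
subprocess.run(
    ["docker", "run", "--rm", "--gpus", "all", image, "python3", "-m", "olmocr.pipeline", "--help"],
    check=True,
)
```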
README.md: 65 lines changed
@@ -35,6 +35,7 @@ Features:
- (Based on a 7B parameter VLM, so it requires a GPU)

### News
- June 17, 2025 - v0.1.75 - Switch from sglang to vllm based inference pipeline, updated docker image to CUDA 12.8.
- May 23, 2025 - v0.1.70 - Official docker support and images are now available! [See Docker usage](#using-docker)
- May 19, 2025 - v0.1.68 - [olmOCR-Bench](https://github.com/allenai/olmocr/tree/main/olmocr/bench) launch, scoring 77.4. Launch includes 2 point performance boost in olmOCR pipeline due to bug fixes with prompts.
- Mar 17, 2025 - v0.1.60 - Performance improvements due to better temperature selection in sampling.
@@ -49,29 +50,29 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro
  <thead>
    <tr>
      <th align="left"><strong>Model</strong></th>
      <th align="center">AR</th>
      <th align="center">OSM</th>
      <th align="center">TA</th>
      <th align="center">OS</th>
      <th align="center">HF</th>
      <th align="center">MC</th>
      <th align="center">LTT</th>
      <th align="center">ArXiv</th>
      <th align="center">Old Scans Math</th>
      <th align="center">Tables</th>
      <th align="center">Old Scans</th>
      <th align="center">Headers and Footers</th>
      <th align="center">Multi column</th>
      <th align="center">Long tiny text</th>
      <th align="center">Base</th>
      <th align="center">Overall Score</th>
      <th align="center">Overall</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td align="left">Marker v1.6.2</td>
      <td align="center">24.3</td>
      <td align="center">22.1</td>
      <td align="center">69.8</td>
      <td align="center">24.3</td>
      <td align="center">87.1</td>
      <td align="center">71.0</td>
      <td align="center">76.9</td>
      <td align="center"><strong>99.5</strong></td>
      <td align="center">59.4 ± 1.1</td>
      <td align="left">Marker v1.7.5 (base)</td>
      <td align="center">76.0</td>
      <td align="center">57.9</td>
      <td align="center">57.6</td>
      <td align="center">27.8</td>
      <td align="center">84.9</td>
      <td align="center">72.9</td>
      <td align="center">84.6</td>
      <td align="center">99.1</td>
      <td align="center">70.1 ± 1.1</td>
    </tr>
    <tr>
      <td align="left">MinerU v1.3.10</td>
@@ -94,24 +95,25 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro
      <td align="center">93.6</td>
      <td align="center">71.3</td>
      <td align="center">77.1</td>
      <td align="center">99.4</td>
      <td align="center"><strong>99.4</strong></td>
      <td align="center">72.0 ± 1.1</td>
    </tr>
    <tr>
      <td align="left">olmOCR v0.1.68 (pipeline.py)</td>
      <td align="center">75.6</td>
      <td align="center">75.1</td>
      <td align="center">70.2</td>
      <td align="center"><strong>44.5</strong></td>
      <td align="center">93.4</td>
      <td align="center"><strong>79.4</strong></td>
      <td align="center">81.7</td>
      <td align="center">99.0</td>
      <td align="center"><strong>77.4 ± 1.0</strong></td>
      <td align="left">olmOCR v0.1.75 (Anchored)</td>
      <td align="center">74.9</td>
      <td align="center">71.2</td>
      <td align="center">71.0</td>
      <td align="center">42.2</td>
      <td align="center">94.5</td>
      <td align="center"><strong>78.3</strong></td>
      <td align="center">73.3</td>
      <td align="center">98.3</td>
      <td align="center"><strong>75.5 ± 1.0</strong></td>
    </tr>
  </tbody>
</table>

### Installation
Requirements:
@@ -136,7 +138,10 @@ conda activate olmocr
pip install olmocr[bench]

# For actually converting the files with your own GPU
pip install olmocr[gpu] --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
pip install olmocr[gpu] --extra-index-url https://download.pytorch.org/whl/cu128

# Recommended: Install flash infer for faster inference on GPU
pip install https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl
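Once the gpu extra is installed, a local conversion run looks roughly like the sketch below. This is a hedged example: the workspace argument and the --pdfs flag follow the pipeline's usual invocation pattern, but verify the exact flags with `python -m olmocr.pipeline --help` for your version.

```python
import subprocess

# Assumed invocation: a local workspace directory plus one or more input PDFs.
subprocess.run(
    [
        "python", "-m", "olmocr.pipeline",
        "./localworkspace",                           # workspace where queue state and results are written
        "--pdfs", "tests/gnarly_pdfs/ambiguous.pdf",  # example input from the repo's test fixtures
    ],
    check=True,
)
```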
### Local Usage Example
@@ -14,6 +14,9 @@ olmOCR-bench operates on single page PDFs directly. We make this choice because
We have run the benchmark against some contemporary OCR pipelines, but it is really easy
to run it against your own OCR tools. Your tool just needs to support Markdown or plain text output.

<div align="center">
  <img src="https://github.com/allenai/olmocr/blob/main/scripts/pareto/ocr_pareto.png?raw=true" width=800/>
</div>

## Results
@@ -37,7 +40,7 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
    <td align="left">GOT OCR</td>
    <td align="center">52.7</td>
    <td align="center">52.0</td>
    <td align="center">0.2</td>
    <td align="center">0.20</td>
    <td align="center">22.1</td>
    <td align="center">93.6</td>
    <td align="center">42.0</td>
@@ -46,16 +49,16 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
    <td align="center">48.3 ± 1.1</td>
  </tr>
  <tr>
    <td align="left">Marker v1.6.2</td>
    <td align="center">24.3</td>
    <td align="center">22.1</td>
    <td align="center">69.8</td>
    <td align="center">24.3</td>
    <td align="center">87.1</td>
    <td align="center">71.0</td>
    <td align="center">76.9</td>
    <td align="center"><strong>99.5</strong></td>
    <td align="center">59.4 ± 1.1</td>
    <td align="left">Marker v1.7.5 (base, force_ocr)</td>
    <td align="center">76.0</td>
    <td align="center">57.9</td>
    <td align="center">57.6</td>
    <td align="center">27.8</td>
    <td align="center">84.9</td>
    <td align="center">72.9</td>
    <td align="center">84.6</td>
    <td align="center">99.1</td>
    <td align="center">70.1 ± 1.1</td>
  </tr>
  <tr>
    <td align="left">MinerU v1.3.10</td>
@@ -78,9 +81,21 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
    <td align="center">93.6</td>
    <td align="center">71.3</td>
    <td align="center">77.1</td>
    <td align="center">99.4</td>
    <td align="center"><strong>99.4</strong></td>
    <td align="center">72.0 ± 1.1</td>
  </tr>
  <tr>
    <td align="left">Nanonets OCR</td>
    <td align="center">67.0</td>
    <td align="center">68.6</td>
    <td align="center"><strong>77.7</strong></td>
    <td align="center">39.5</td>
    <td align="center">40.7</td>
    <td align="center">69.9</td>
    <td align="center">53.4</td>
    <td align="center">99.3</td>
    <td align="center">64.5 ± 1.1</td>
  </tr>
  <tr>
    <td align="left">GPT-4o (No Anchor)</td>
    <td align="center">51.5</td>
@@ -154,33 +169,39 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
    <td align="center">65.5 ± 1.2</td>
  </tr>
  <tr>
    <td align="left">olmOCR v0.1.68 (No Anchor)</td>
    <td align="center">72.1</td>
    <td align="center">74.7</td>
    <td align="left">olmOCR v0.1.75 (No Anchor)</td>
    <td align="center">71.5</td>
    <td align="center">43.7</td>
    <td align="center">91.6</td>
    <td align="center">78.5</td>
    <td align="center">80.5</td>
    <td align="center">98.1</td>
    <td align="center">76.3 ± 1.1</td>
    <td align="center">71.4</td>
    <td align="center">71.4</td>
    <td align="center"><strong>42.8</strong></td>
    <td align="center">94.1</td>
    <td align="center">77.7</td>
    <td align="center">71.0</td>
    <td align="center">97.8</td>
    <td align="center">74.7 ± 1.1</td>
  </tr>
  <tr>
    <td align="left">olmOCR v0.1.68 (Anchored)</td>
    <td align="center">75.6</td>
    <td align="center">75.1</td>
    <td align="center">70.2</td>
    <td align="center"><strong>44.5</strong></td>
    <td align="center">93.4</td>
    <td align="center"><strong>79.4</strong></td>
    <td align="center">81.7</td>
    <td align="center">99.0</td>
    <td align="center"><strong>77.4 ± 1.0</strong></td>
    <td align="left">olmOCR v0.1.75 (Anchored)</td>
    <td align="center">74.9</td>
    <td align="center">71.2</td>
    <td align="center">71.0</td>
    <td align="center">42.2</td>
    <td align="center">94.5</td>
    <td align="center"><strong>78.3</strong></td>
    <td align="center">73.3</td>
    <td align="center">98.3</td>
    <td align="center"><strong>75.5 ± 1.0</strong></td>
  </tr>
  </tbody>
</table>

<sup><sub>There was a small drop in scores from olmOCR v0.1.68 (77.4), which is due to two factors. First, we adjusted our benchmark code so that it no longer includes any "fallback" mechanism when measuring benchmark scores (the fallback still exists when you run olmocr.pipeline). Second, there is a small drop in scores from updating from sglang 0.4.2 to vllm 0.9.1. On net, we think the upgrade to vllm is the right choice: sglang 0.4.6 scored even lower by one point, and vllm brings a small performance boost and strong support for quantization.</sub></sup>

## Sourcing Documents and Tests

We define 7 distinct document types that we found olmOCR (or its earlier iterations) often struggled to process, and we designed a custom acquisition strategy for each (described below). We removed documents that both contained PII and were not meant for public dissemination. We also decontaminate against documents that appear in olmOCR-Mix via URL-level deduplication. To scale the creation of test cases over these documents, we combined manual design and review with prompting GPT-4o.
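The decontamination step above is URL-level deduplication against olmOCR-Mix. A minimal sketch of the idea, assuming each document record carries a source URL; the normalization rules here are illustrative, not the benchmark's actual code:

```python
from urllib.parse import urlsplit

def normalize_url(url: str) -> str:
    # Illustrative normalization: lowercase the host, drop query/fragment and any trailing slash.
    parts = urlsplit(url.strip())
    return f"{parts.scheme}://{parts.netloc.lower()}{parts.path.rstrip('/')}"

def decontaminate(bench_docs: list[dict], mix_urls: set[str]) -> list[dict]:
    # Keep only benchmark documents whose source URL does not appear in the training mix.
    seen = {normalize_url(u) for u in mix_urls}
    return [doc for doc in bench_docs if normalize_url(doc["url"]) not in seen]
```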
@@ -288,6 +309,3 @@ We have an internal data annotation tool that can be used to review the question
```bash
python -m olmocr.bench.review_app --port 5000 --debug ./olmOCR-bench/bench_data/multi_column.jsonl --force
```
@@ -223,6 +223,7 @@ if __name__ == "__main__":
    available_methods = {
        "olmocr_pipeline": ("olmocr.bench.runners.run_olmocr_pipeline", "run_olmocr_pipeline"),
        "gotocr": ("olmocr.bench.runners.run_gotocr", "run_gotocr"),
        "nanonetsocr": ("olmocr.bench.runners.run_nanonetsocr", "run_nanonetsocr"),
        "marker": ("olmocr.bench.runners.run_marker", "run_marker"),
        "mineru": ("olmocr.bench.runners.run_mineru", "run_mineru"),
        "chatgpt": ("olmocr.bench.runners.run_chatgpt", "run_chatgpt"),
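Each entry in `available_methods` maps a method name to a (module path, function name) pair, which the converter presumably resolves with a dynamic import. A sketch of that lookup pattern (not the converter's exact code):

```python
import importlib

def resolve_runner(available_methods: dict[str, tuple[str, str]], name: str):
    # Look up the (module, function) pair registered under this method name.
    module_path, func_name = available_methods[name]
    module = importlib.import_module(module_path)  # e.g. olmocr.bench.runners.run_marker
    return getattr(module, func_name)              # e.g. the run_marker callable

# Hypothetical usage: fetch the marker runner and convert one page of a PDF.
# run = resolve_runner(available_methods, "marker")
# markdown = run("document.pdf", page_num=1)
```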
@@ -1,6 +1,7 @@
import os
import tempfile

from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
@@ -15,10 +16,22 @@ def run_marker(pdf_path: str, page_num: int = 1) -> str:
    if _marker_converter is None:
        # Create a configuration dictionary with the necessary settings
        config = {
            "texify_inline_spans": True,  # This enables conversion of inline math to LaTeX
            "force_ocr": True,  # Always run OCR rather than relying on embedded text
            "use_llm": False,  # We would prefer to run just plain marker for reporting bench results, not hybrid mode
            "disable_tqdm": True,  # Disable tqdm for cleaner output
            "recognition_batch_size": 256,
            "layout_batch_size": 48,
            "detection_batch_size": 48,
            "equation_batch_size": 64,
            "table_rec_batch_size": 48,
            "ocr_error_batch_size": 64,
        }
        config_parser = ConfigParser(config)

        _marker_converter = PdfConverter(artifact_dict=create_model_dict(), config=config)
        _marker_converter = PdfConverter(
            artifact_dict=create_model_dict(),
            config=config_parser.generate_config_dict(),
        )

    # Extract the specific page from the PDF
    pdf_to_process = pdf_path
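Given the signature `run_marker(pdf_path: str, page_num: int = 1) -> str` shown in the hunk above, a hedged usage sketch (assuming marker-pdf and its model weights are installed locally):

```python
from olmocr.bench.runners.run_marker import run_marker

# Convert page 1 of a local PDF to Markdown using the lazily-initialized marker converter.
markdown = run_marker("tests/gnarly_pdfs/ambiguous.pdf", page_num=1)
print(markdown[:500])
```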
olmocr/bench/runners/run_nanonetsocr.py (new file, 89 lines)
@@ -0,0 +1,89 @@
import base64
import os
import re
import tempfile

import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor, AutoTokenizer

from olmocr.data.renderpdf import render_pdf_to_base64png

_model = None
_tokenizer = None
_processor = None
_device = None


def load_model(model_path: str = "nanonets/Nanonets-OCR-s"):
    global _model, _tokenizer, _processor, _device

    if _model is None:
        _device = "cuda" if torch.cuda.is_available() else "cpu"
        _model = AutoModelForImageTextToText.from_pretrained(
            model_path,
            torch_dtype="auto",
            device_map="auto",
            # attn_implementation="flash_attention_2"
        )
        _model.eval()
        _tokenizer = AutoTokenizer.from_pretrained(model_path)
        _processor = AutoProcessor.from_pretrained(model_path)

    return _model, _tokenizer, _processor


async def run_nanonetsocr(pdf_path: str, page_num: int = 1, model_path: str = "nanonets/Nanonets-OCR-s", max_new_tokens: int = 4096, **kwargs) -> str:
    """
    Convert a page of a PDF file to markdown using Nanonets-OCR.

    This function renders the requested page of the PDF to an image, runs OCR on that image,
    and returns the OCR result as a markdown-formatted string.

    Args:
        pdf_path (str): The local path to the PDF file.

    Returns:
        str: The OCR result in markdown format.
    """

    model, tokenizer, processor = load_model(model_path)

    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=1024)

    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
        image_data = base64.b64decode(image_base64)
        temp_file.write(image_data)
        temp_image_path = temp_file.name

    try:
        image = Image.open(temp_image_path)
        prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{temp_image_path}"},
                    {"type": "text", "text": prompt},
                ],
            },
        ]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt", use_fast=True)
        inputs = inputs.to(model.device)
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

        generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
        output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        cleaned_text = re.sub(r"<page_number>\d+</page_number>", "", output_text[0])

        return cleaned_text

    finally:
        try:
            os.unlink(temp_image_path)
        except Exception as e:
            print(f"Warning: Failed to remove temporary file {temp_image_path}: {e}")
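Because run_nanonetsocr is an async function, it needs an event loop to run. A hedged usage sketch (the first call downloads the Nanonets-OCR-s weights, and the PDF path is just an example):

```python
import asyncio

from olmocr.bench.runners.run_nanonetsocr import run_nanonetsocr

# OCR page 1 of a local PDF with Nanonets-OCR-s and print the markdown result.
markdown = asyncio.run(run_nanonetsocr("tests/gnarly_pdfs/ambiguous.pdf", page_num=1))
print(markdown[:500])
```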
@@ -123,6 +123,8 @@ def normalize_text(md_content: str) -> str:
    # Remove markdown bold formatting (** or __ for bold)
    md_content = re.sub(r"\*\*(.*?)\*\*", r"\1", md_content)
    md_content = re.sub(r"__(.*?)__", r"\1", md_content)
    md_content = re.sub(r"</?b>", "", md_content)  # Remove <b> tags if they exist
    md_content = re.sub(r"</?i>", "", md_content)  # Remove <i> tags if they exist

    # Remove markdown italics formatting (* or _ for italics)
    md_content = re.sub(r"\*(.*?)\*", r"\1", md_content)
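To see what these substitutions do, here is a small self-contained sketch that applies only the regexes shown in this hunk (not the full normalize_text function) to a sample string:

```python
import re

def strip_emphasis(md_content: str) -> str:
    # Same substitutions as the hunk above: markdown bold/italics plus literal <b>/<i> tags.
    md_content = re.sub(r"\*\*(.*?)\*\*", r"\1", md_content)
    md_content = re.sub(r"__(.*?)__", r"\1", md_content)
    md_content = re.sub(r"</?b>", "", md_content)
    md_content = re.sub(r"</?i>", "", md_content)
    md_content = re.sub(r"\*(.*?)\*", r"\1", md_content)
    return md_content

print(strip_emphasis("**Total:** <b>42</b> items, *approximately*"))
# -> Total: 42 items, approximately
```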
@@ -1,86 +0,0 @@
import json
from concurrent.futures import ProcessPoolExecutor, as_completed

import boto3
from tqdm import tqdm

# Configuration
BUCKET = "ai2-llm"
PREFIX = "pretraining-data/sources/soldni-open-access-books/v0/pipeline/results"
OUTPUT_FILENAME = "all_completed_files.txt"


def process_file(key: str):
    """
    Process a single S3 file given by its key.
    Reads a jsonl file from S3, decodes each line,
    extracts the 'Source-File' from the 'metadata' field,
    and returns a list of these source file strings.
    """
    # Create a new S3 client in the worker process (clients are not shared across workers)
    s3 = boto3.client("s3")
    extracted_lines = []
    try:
        response = s3.get_object(Bucket=BUCKET, Key=key)
        for raw_line in response["Body"].iter_lines():
            try:
                # Decode the line from bytes to text
                line_str = raw_line.decode("utf-8")
            except UnicodeDecodeError as e:
                print(f"Skipping a line in {key} due to decode error: {e}")
                continue
            try:
                data = json.loads(line_str)
            except json.JSONDecodeError as e:
                print(f"Skipping a malformed json line in {key}: {e}")
                continue
            # Extract 'Source-File' from metadata if present
            metadata = data.get("metadata", {})
            source_file = metadata.get("Source-File")
            if source_file:
                extracted_lines.append(source_file)
    except Exception as e:
        print(f"Error processing file {key}: {e}")
    return extracted_lines


def main():
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    page_iterator = paginator.paginate(Bucket=BUCKET, Prefix=PREFIX)

    # Gather all S3 object keys under the specified prefix
    keys = []
    for page in page_iterator:
        if "Contents" not in page:
            continue
        for obj in page["Contents"]:
            keys.append(obj["Key"])

    print(f"Found {len(keys)} files to process.")

    # Open the output file for writing
    with open(OUTPUT_FILENAME, "w", encoding="utf-8") as output_file:
        # Create a process pool to handle files concurrently.
        # Adjust max_workers based on your environment and workload.
        with ProcessPoolExecutor() as executor:
            # Submit all processing jobs and map each future to its key
            future_to_key = {executor.submit(process_file, key): key for key in keys}
            # Use tqdm to wrap the as_completed iterator for progress display
            for future in tqdm(as_completed(future_to_key), total=len(future_to_key), desc="Processing files"):
                try:
                    source_files = future.result()
                    # Write each extracted line to the output file as soon as the future completes
                    for source in source_files:
                        output_file.write(source + "\n")
                    # Optionally flush after each completed task
                    output_file.flush()
                except Exception as e:
                    key = future_to_key[future]
                    print(f"Exception occurred for file {key}: {e}")

    print(f"Finished writing the source file names to {OUTPUT_FILENAME}")


if __name__ == "__main__":
    main()
@@ -32,13 +32,12 @@ from tqdm import tqdm

from olmocr.check import (
    check_poppler_version,
    check_sglang_version,
    check_torch_gpu_available,
)
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.filter.filter import Language, PdfFilter
from olmocr.image_utils import convert_image_to_pdf_bytes, is_jpeg, is_png
from olmocr.metrics import MetricsKeeper, WorkerTracker, cpu_vs_wall
from olmocr.metrics import MetricsKeeper, WorkerTracker
from olmocr.prompts import PageResponse, build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
from olmocr.s3_utils import (
@@ -331,7 +330,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:

async def process_pdf(args, worker_id: int, pdf_orig_path: str):
    with tempfile.NamedTemporaryFile("wb+", suffix=".pdf") as tf:
    with tempfile.NamedTemporaryFile("wb+", suffix=".pdf", delete=False) as tf:
        try:
            data = await asyncio.to_thread(lambda: get_s3_bytes_with_backoff(pdf_s3, pdf_orig_path))
            tf.write(data)
@@ -349,6 +348,7 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str):
            tf.write(convert_image_to_pdf_bytes(tf.name))
            tf.flush()

        try:
            try:
                reader = PdfReader(tf.name)
                num_pages = reader.get_num_pages()
@@ -400,6 +400,9 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str):
            # You can't build a dolma doc with even 1 failed page, so just get out of here
            # However, you don't want to propagate an exception higher up and cancel the entire work_group
            return None
        finally:
            if os.path.exists(tf.name):
                os.unlink(tf.name)


def build_dolma_document(pdf_orig_path, page_results):
@@ -705,19 +708,31 @@ async def vllm_server_ready():
    raise Exception("vllm server did not become ready after waiting.")


async def download_model(model_name_or_path: str):
    if model_name_or_path.startswith("s3://") or model_name_or_path.startswith("gs://") or model_name_or_path.startswith("weka://"):
        logger.info(f"Downloading model directory from '{model_name_or_path}'")
        model_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "olmocr", "model")
        download_directory([model_name_or_path], model_cache_dir)
        return model_cache_dir
    elif os.path.isabs(model_name_or_path) and os.path.isdir(model_name_or_path):
        logger.info(f"Using local model path at '{model_name_or_path}'")
        return model_name_or_path
    else:
        logger.info(f"Downloading model with hugging face '{model_name_or_path}'")
        snapshot_download(repo_id=model_name_or_path)
        return model_name_or_path
async def download_model(model_name_or_path: str, max_retries: int = 5):
    for retry in range(max_retries):
        try:
            if model_name_or_path.startswith("s3://") or model_name_or_path.startswith("gs://") or model_name_or_path.startswith("weka://"):
                logger.info(f"Downloading model directory from '{model_name_or_path}'")
                model_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "olmocr", "model")
                # Delete existing model cache directory if it exists
                if os.path.exists(model_cache_dir):
                    shutil.rmtree(model_cache_dir)
                download_directory([model_name_or_path], model_cache_dir)
                return model_cache_dir
            elif os.path.isabs(model_name_or_path) and os.path.isdir(model_name_or_path):
                logger.info(f"Using local model path at '{model_name_or_path}'")
                return model_name_or_path
            else:
                logger.info(f"Downloading model with hugging face '{model_name_or_path}'")
                snapshot_download(repo_id=model_name_or_path)
                return model_name_or_path
        except Exception:
            if retry == max_retries - 1:
                raise  # Raise on final attempt and fail the job

            sleep_time = random.randrange(2, 20) * 2**retry
            logger.exception(f"Could not download model, sleeping for {sleep_time} seconds to retry ({retry + 1}/{max_retries})")
            await asyncio.sleep(random.randrange(10, 30) * 2**retry)


async def metrics_reporter(work_queue):
@@ -906,6 +921,7 @@ def print_stats(args, root_work_queue):
            logger.warning(f"Error processing {s3_path}: {e}")
            return 0, 0, 0, 0, 0, set(), 0, 0

    print(f"\nCompleted work items {completed_items:,} out of {total_items:,}: {completed_items/total_items*100:.2f}%")
    print("\nProcessing output files...")
    docs_total = 0
    input_tokens_total = 0
@@ -1033,8 +1049,8 @@ async def main():

        # Wait a little bit so that not all beaker jobs in a task start at the same time and download the model at the same time
        replica_count = int(os.environ.get("BEAKER_REPLICA_COUNT", "1"))
        interval = 10 if (replica_count - 1) * 10 <= 240 else 240 / max(1, replica_count - 1)
        sleep_time = int(int(os.environ.get("BEAKER_REPLICA_RANK", "0")) * interval)
        interval = 10 if (replica_count - 1) * 10 <= 30 else 30 / max(1, replica_count - 1)
        sleep_time = int(os.environ.get("BEAKER_REPLICA_RANK", "0")) * interval
        logger.info(f"Beaker job sleeping for {sleep_time} seconds to stagger model downloads")
        await asyncio.sleep(sleep_time)

@@ -1155,7 +1171,6 @@ async def main():
    await vllm_server_ready()

    metrics_task = asyncio.create_task(metrics_reporter(work_queue))
    cpu_monitor_task = asyncio.create_task(cpu_vs_wall(10))

    # Create worker tasks to process the queue concurrently.
    worker_tasks = []
@@ -1171,7 +1186,6 @@ async def main():

    vllm_server.cancel()
    metrics_task.cancel()
    cpu_monitor_task.cancel()

    # Output final metrics summary
    metrics_summary = metrics.get_metrics_summary()
@@ -1201,14 +1215,14 @@ async def main():
        logger.info(f"Server Input tokens/sec rate: {rates['server_input_tokens_per_sec']:.2f}")
    if "server_output_tokens_per_sec" in rates:
        logger.info(f"Server Output tokens/sec rate: {rates['server_output_tokens_per_sec']:.2f}")
    if "finished_input_tokens" in rates:
        logger.info(f"Finished Input tokens/sec rate: {rates['finished_input_tokens']:.2f}")
    if "finished_output_tokens" in rates:
        logger.info(f"Finished Output tokens/sec rate: {rates['finished_output_tokens']:.2f}")
    if "finished_input_tokens_per_sec" in rates:
        logger.info(f"Finished Input tokens/sec rate: {rates['finished_input_tokens_per_sec']:.2f}")
    if "finished_output_tokens_per_sec" in rates:
        logger.info(f"Finished Output tokens/sec rate: {rates['finished_output_tokens_per_sec']:.2f}")

    logger.info("=" * 80)
    logger.info("Work done")


if __name__ == "__main__":
    asyncio.run(main())
    asyncio.run(main())
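The change to the stagger logic caps the total spread of replica start times at 30 seconds instead of 240. A small sketch of the arithmetic using the formula from this hunk, evaluated for a few hypothetical replica counts:

```python
def stagger_sleep(replica_rank: int, replica_count: int) -> float:
    # Same formula as the updated pipeline code: 10s per rank, with the whole ramp capped at 30s.
    interval = 10 if (replica_count - 1) * 10 <= 30 else 30 / max(1, replica_count - 1)
    return replica_rank * interval

print([stagger_sleep(r, 4) for r in range(4)])    # [0, 10, 20, 30]: up to 4 replicas keep the full 10s step
print([stagger_sleep(r, 16) for r in range(16)])  # 2.0s steps, so the last replica still starts only 30s after the first
```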
olmocr/train/compressqwen2checkpoint.py (new file, 28 lines)
@@ -0,0 +1,28 @@
# pip install llmcompressor
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from transformers import AutoTokenizer, Qwen2VLForConditionalGeneration

MODEL_ID = "/home/ubuntu/olmocr/olmOCR-7B-0225-preview"

model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Configure the simple PTQ quantization
# recipe = QuantizationModifier(
#     targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

# Configure pre-defined qwen2vl recipe
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["re:.*lm_head", "re:visual.*"],
)

# Apply the quantization algorithm.
oneshot(model=model, recipe=recipe)

# Save the model. Use the last path component so a local checkpoint path yields a sensible directory name.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic-Recipe"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
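The SAVE_DIR written here is a compressed-tensors FP8 checkpoint, which is meant to be served by vLLM, the backend the pipeline now uses. A hedged sketch of loading it; this assumes vLLM accepts llmcompressor output directly, and a Qwen2-VL checkpoint may need additional multimodal settings:

```python
from vllm import LLM, SamplingParams

# Assumed path: the directory written by the compression script above.
llm = LLM(model="olmOCR-7B-0225-preview-FP8-Dynamic-Recipe")

# Quick text-only smoke test of the quantized checkpoint.
outputs = llm.generate(["Summarize what olmOCR does in one sentence."], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```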
@@ -2,7 +2,7 @@ _MAJOR = "0"
_MINOR = "1"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "71"
_PATCH = "76"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""
@@ -1,12 +0,0 @@
#!/bin/bash

set -e

VERSION=$(python -c 'import olmocr.version; print(olmocr.version.VERSION)')
echo "$VERSION"

docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-inference -t olmocr-inference-$VERSION .
beaker image create --workspace ai2/oe-data-pdf --name olmocr-inference-$VERSION olmocr-inference-$VERSION

docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-tagging -t olmocr-tagging-$VERSION .
beaker image create --workspace ai2/oe-data-pdf --name olmocr-tagging-$VERSION olmocr-tagging-$VERSION
scripts/pareto/ocr_pareto.pdf (new binary file, not shown)
scripts/pareto/ocr_pareto.png (new binary file, not shown; 294 KiB)
@@ -64,12 +64,12 @@ data = {
        "MinerU",
        "Gemini Flash 2",
        "Gemini Flash 2 (Batch)",
        "Marker v1.6.2",
        "Marker v1.7.5",
        "Ours",
        "Qwen 2 VL",
        "Qwen 2.5 VL",
    ],
    COST_COLUMN_NAME: [12480, 6240, 1000, 596, 499, 249, 235, 178, 178, 178],  # Same cost as Ours # Same cost as Ours
    COST_COLUMN_NAME: [12480, 6240, 1000, 596, 499, 249, 1492, 178, 178, 178],  # Same cost as Ours # Same cost as Ours
    PERF_COLUMN_NAME: [
        69.9,  # GPT-4o (Anchored)
        69.9,  # Same performance for batch
@@ -77,8 +77,8 @@ data = {
        61.5,  # MinerU
        63.8,  # Gemini Flash 2 (Anchored)
        63.8,  # Same performance for batch
        59.4,  # marker v1.6.2
        77.4,  # Ours (performance is the same across hardware)
        70.1,  # marker v1.7.5 base
        75.5,  # Ours (performance is the same across hardware)
        31.5,  # Qwen2VL
        65.5,  # Qwen2.5VL
    ],
@@ -94,7 +94,7 @@ model_categories = {
    "MinerU": "Open Source Tool",
    "Gemini Flash 2": "Commercial VLM",
    "Gemini Flash 2 (Batch)": "Commercial VLM",
    "Marker v1.6.2": "Open Source Tool",
    "Marker v1.7.5": "Open Source Tool",
    "Ours": "Ours",
    "Qwen 2 VL": "Open VLM",
    "Qwen 2.5 VL": "Open VLM",
@@ -131,8 +131,8 @@ model_label_offsets = {
    "Mistral OCR": [-20, 10],
    "MinerU": [-15, -20],
    "Gemini Flash 2": [-10, 10],
    "Gemini Flash 2 (Batch)": [-50, -15],
    "Marker v1.6.2": [-35, -20],
    "Gemini Flash 2 (Batch)": [-50, -20],
    "Marker v1.7.5": [-25, -20],
    "Ours": [-20, 10],
    "Qwen 2 VL": [-35, 10],
    "Qwen 2.5 VL": [-35, 10],
scripts/run_marker_benchmark.sh (new executable file, 202 lines)
@@ -0,0 +1,202 @@
#!/bin/bash

# Runs the marker benchmark, measuring both olmOCR-bench performance and per-document processing performance
# ./scripts/run_marker_benchmark.sh
# ./scripts/run_marker_benchmark.sh 1.7.5

set -e

# Parse command line arguments
MARKER_VERSION="${1:-1.7.5}"
echo "Using marker version: $MARKER_VERSION"

# Check for uncommitted changes
if ! git diff-index --quiet HEAD --; then
    echo "Error: There are uncommitted changes in the repository."
    echo "Please commit or stash your changes before running the benchmark."
    echo ""
    echo "Uncommitted changes:"
    git status --short
    exit 1
fi

# Use conda environment Python if available, otherwise use system Python
if [ -n "$CONDA_PREFIX" ]; then
    PYTHON="$CONDA_PREFIX/bin/python"
    echo "Using conda Python from: $CONDA_PREFIX"
else
    PYTHON="python"
    echo "Warning: No conda environment detected, using system Python"
fi

# Get version from version.py
VERSION=$($PYTHON -c 'import olmocr.version; print(olmocr.version.VERSION)')
echo "OlmOCR version: $VERSION"

# Get first 10 characters of git hash
GIT_HASH=$(git rev-parse HEAD | cut -c1-10)
echo "Git hash: $GIT_HASH"

# Get current git branch name
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
echo "Git branch: $GIT_BRANCH"

# Create full image tag
IMAGE_TAG="olmocr-benchmark-${VERSION}-${GIT_HASH}"
echo "Building Docker image with tag: $IMAGE_TAG"

# Build the Docker image
echo "Building Docker image..."
docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG .

# Get Beaker username
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
echo "Beaker user: $BEAKER_USER"

# Push image to beaker
echo "Trying to push image to Beaker..."
if ! beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG 2>/dev/null; then
    echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image."
fi

# Create Python script to run beaker experiment
cat << 'EOF' > /tmp/run_benchmark_experiment.py
import sys
from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar

# Get image tag, beaker user, git branch, git hash, and marker version from command line
image_tag = sys.argv[1]
beaker_user = sys.argv[2]
git_branch = sys.argv[3]
git_hash = sys.argv[4]
marker_version = sys.argv[5]

# Initialize Beaker client
b = Beaker.from_env(default_workspace="ai2/olmocr")

# Check if AWS credentials secret exists
aws_creds_secret = f"{beaker_user}-AWS_CREDENTIALS_FILE"
try:
    # Try to get the secret to see if it exists
    b.secret.get(aws_creds_secret, workspace="ai2/olmocr")
    has_aws_creds = True
    print(f"Found AWS credentials secret: {aws_creds_secret}")
except:
    has_aws_creds = False
    print(f"AWS credentials secret not found: {aws_creds_secret}")

# First experiment: Original benchmark job
commands = []
if has_aws_creds:
    commands.extend([
        "mkdir -p ~/.aws",
        'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
    ])
commands.extend([
    "git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
    "cd olmOCR-bench && git lfs pull && cd ..",
    f"pip install marker-pdf=={marker_version}",
    "pip install --upgrade torchvision",
    "python -m olmocr.bench.convert marker --dir ./olmOCR-bench/bench_data",
    "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data"
])

# Build task spec with optional env vars
task_spec_args = {
    "name": "marker-benchmark",
    "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"),
    "command": [
        "bash", "-c",
        " && ".join(commands)
    ],
    "context": TaskContext(
        priority=Priority.normal,
        preemptible=True,
    ),
    "resources": TaskResources(gpu_count=1),
    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
    "result": ResultSpec(path="/noop-results"),
}

# Add env vars if AWS credentials exist
if has_aws_creds:
    task_spec_args["env_vars"] = [
        EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret)
    ]

# Create first experiment spec
experiment_spec = ExperimentSpec(
    description=f"Marker {marker_version} Benchmark Run - Branch: {git_branch}, Commit: {git_hash}",
    budget="ai2/oe-data",
    tasks=[TaskSpec(**task_spec_args)],
)

# Create the first experiment
experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr")
print(f"Created benchmark experiment: {experiment.id}")
print(f"View at: https://beaker.org/ex/{experiment.id}")
print("-------")
print("")


perf_commands = []
if has_aws_creds:
    perf_commands.extend([
        "mkdir -p ~/.aws",
        'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
    ])
perf_commands.extend([
    f"pip install marker-pdf=={marker_version}",
    "pip install --upgrade torchvision",
    "pip install awscli",
    "aws s3 cp --recursive s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/ /root/olmOCR-mix-0225_benchmark_set/",
    # Tried with workers 8, but it was taking an extremely long time
    #"time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8"
    "time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker"
])

# Build performance task spec
perf_task_spec_args = {
    "name": "marker-performance",
    "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"),
    "command": [
        "bash", "-c",
        " && ".join(perf_commands)
    ],
    "context": TaskContext(
        priority=Priority.normal,
        preemptible=True,
    ),
    "resources": TaskResources(gpu_count=1),
    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
    "result": ResultSpec(path="/noop-results"),
}

# Add env vars if AWS credentials exist
if has_aws_creds:
    perf_task_spec_args["env_vars"] = [
        EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret)
    ]

# Create performance experiment spec
perf_experiment_spec = ExperimentSpec(
    description=f"Marker {marker_version} Performance Test - Branch: {git_branch}, Commit: {git_hash}",
    budget="ai2/oe-data",
    tasks=[TaskSpec(**perf_task_spec_args)],
)

# Create the performance experiment
perf_experiment = b.experiment.create(spec=perf_experiment_spec, workspace="ai2/olmocr")
print(f"Created performance experiment: {perf_experiment.id}")
print(f"View at: https://beaker.org/ex/{perf_experiment.id}")
EOF

# Run the Python script to create the experiments
echo "Creating Beaker experiments..."
$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH $MARKER_VERSION

# Clean up temporary file
rm /tmp/run_benchmark_experiment.py

echo "Benchmark experiments submitted successfully!"
scripts/sync_beaker_image.sh (new executable file, 9 lines)
@@ -0,0 +1,9 @@
#!/bin/bash

set -e

VERSION=$(python -c 'import olmocr.version; print(olmocr.version.VERSION)')
echo "$VERSION"

docker pull alleninstituteforai/olmocr:v$VERSION
beaker image create --workspace ai2/oe-data-pdf --name olmocr-inference-$VERSION alleninstituteforai/olmocr:v$VERSION
@@ -1,400 +0,0 @@
# The idea is that you have a Qwen2-VL-7B model located here: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/

# You need to load it in both hugging face transformers, and send page 1 of edgar.pdf to it from tests/gnarly_pdfs
# Compare that the temperature 0 sampled result is the same

import asyncio
import base64
import json
import math
import os
import unittest
from io import BytesIO
from pathlib import Path
from unittest.mock import AsyncMock, patch

import numpy as np
import pytest
import torch
import torch.nn.functional as F
from httpx import AsyncClient
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration

from olmocr.pipeline import (
    SGLANG_SERVER_PORT,
    build_page_query,
    get_anchor_text,
    render_pdf_to_base64png,
    sglang_server_ready,
    sglang_server_task,
)
from olmocr.prompts import PageResponse

MODEL_FINETUNED_PATH = (
    "s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/"
)


@pytest.mark.nonci
class TestSglangServer(unittest.IsolatedAsyncioTestCase):
    async def asyncSetUp(self):
        # Mock arguments
        self.args = AsyncMock()
        self.args.workspace = "/tmp/test_workspace"
        self.args.model = [MODEL_FINETUNED_PATH]
        self.args.model_chat_template = "qwen2-vl"
        self.args.target_longest_image_dim = 1024
        self.args.target_anchor_text_len = 6000
        self.args.model_max_context = 8192

        # Create a temporary workspace directory
        os.makedirs(self.args.workspace, exist_ok=True)

        # Set up a semaphore for server tasks
        self.semaphore = asyncio.Semaphore(1)
        self.maxDiff = None

        # # Start the sglang server
        # self.my_server_task = asyncio.create_task(sglang_server_task(self.args, self.semaphore))

        # # Wait for the server to become ready
        # await sglang_server_ready()

    async def test_sglang_server_initialization_and_request(self):
        # Mock data paths
        self.test_pdf_path = Path(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ambiguous.pdf"))

        # Send a single request to the sglang server for page 1
        async with AsyncClient(timeout=600) as session:
            query = await build_page_query(
                str(self.test_pdf_path),
                page=1,
                target_longest_image_dim=self.args.target_longest_image_dim,
                target_anchor_text_len=self.args.target_anchor_text_len,
            )
            COMPLETION_URL = f"http://localhost:{30000}/v1/chat/completions"

            query["temperature"] = 0.0
            query["logprobs"] = True
            query["top_logprobs"] = 5
            response = await session.post(COMPLETION_URL, json=query)

        print(response.text)

        # Check the server response
        self.assertEqual(response.status_code, 200)
        response_data = response.json()
        self.assertIn("choices", response_data)
        self.assertGreater(len(response_data["choices"]), 0)

        model_response_json = json.loads(response_data["choices"][0]["message"]["content"])
        page_response = PageResponse(**model_response_json)

        print(page_response)

        self.assertEqual(page_response.natural_text, EDGAR_TEXT)

    async def asyncTearDown(self):
        pass
        # # Shut down the server
        # self.my_server_task.cancel()
        # with self.assertRaises(asyncio.CancelledError):
        #     await self.my_server_task

        # # Cleanup temporary workspace
        # if os.path.exists(self.args.workspace):
        #     for root, _, files in os.walk(self.args.workspace):
        #         for file in files:
        #             os.unlink(os.path.join(root, file))
        #     os.rmdir(self.args.workspace)


@pytest.mark.nonci
class TestHuggingFaceModel(unittest.IsolatedAsyncioTestCase):
    async def asyncSetUp(self):
        # Set up the Hugging Face model and tokenizer
        model_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "olmocr", "model")
        download_directory([MODEL_FINETUNED_PATH], model_cache_dir)

        # Check the rope config and make sure it's got the proper key
        with open(os.path.join(model_cache_dir, "config.json"), "r") as cfin:
            config_data = json.load(cfin)

        if "rope_type" in config_data["rope_scaling"]:
            del config_data["rope_scaling"]["rope_type"]
            config_data["rope_scaling"]["type"] = "mrope"

            with open(os.path.join(model_cache_dir, "config.json"), "w") as cfout:
                json.dump(config_data, cfout)

        self.tokenizer = AutoTokenizer.from_pretrained(model_cache_dir, trust_remote_code=True)
        self.image_token_id = self.tokenizer.encode("<|image_pad|>")[0]

        self.model = Qwen2VLForConditionalGeneration.from_pretrained(model_cache_dir, torch_dtype=torch.bfloat16, trust_remote_code=True).eval()
        self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Path to the test PDF
        self.test_pdf_path = Path(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ambiguous.pdf"))
        self.maxDiff = None

    async def test_hugging_face_generation(self):
        query = await build_page_query(
            str(self.test_pdf_path),
            page=1,
            target_longest_image_dim=1024,
            target_anchor_text_len=6000,
        )

        messages = query["messages"]

        # Apply chat template to get the text
        text = self.processor.apply_chat_template(query["messages"], tokenize=False, add_generation_prompt=True)

        image_url = query["messages"][0]["content"][1]["image_url"]["url"]

        # Remove the "data:image/png;base64," prefix
        base64_image = image_url.split(",")[1]

        # Decode the base64 string into bytes
        image_data = base64.b64decode(base64_image)

        # Create a BytesIO object and load it into a PIL image
        main_image = Image.open(BytesIO(image_data))

        # Process inputs using processor
        inputs = self.processor(
            text=[text],
            images=[main_image],
            padding=True,
            return_tensors="pt",
        )

        image_indices = [idx for idx, token in enumerate(inputs["input_ids"][0]) if token.item() == self.image_token_id]

        print("IMAGE INDICES", image_indices)

        print(f"image_grid_thw - {inputs['image_grid_thw'].shape} {inputs['image_grid_thw']}")
        print(f"pixel_values - {inputs['pixel_values'].shape} {inputs['pixel_values'].detach().cpu().numpy()}")
        np.save("/root/pixel_values.npy", inputs["pixel_values"].detach().cpu().numpy())

        inputs = {key: value.to(self.device) for (key, value) in inputs.items()}

        generated_tokens = []
        max_steps = 50

        top_logprobs_hf = []

        for step in range(max_steps):
            # Generate the output with temperature=0
            generation_output = self.model.generate(
                **inputs,
                temperature=0.0,
                max_new_tokens=1,
                # max_length=8192,
                num_return_sequences=1,
                do_sample=False,
                output_scores=True,
                return_dict_in_generate=True,
            )

            # Extract the generated token's log probabilities
            scores = generation_output.scores  # Tuple of length 1
            logits = scores[0]  # Tensor of shape (batch_size, vocab_size)
            log_probs = F.log_softmax(logits, dim=-1)  # Apply log softmax to get log probabilities

            # Get top 5 tokens and their log probabilities
            topk_log_probs, topk_indices = torch.topk(log_probs[0], k=5)
            topk_tokens = self.tokenizer.convert_ids_to_tokens(topk_indices.tolist())

            top_logprobs_hf.append((topk_tokens, topk_log_probs.tolist()))

            # Pick the top token
            next_token_id = topk_indices[0].unsqueeze(0).unsqueeze(0)  # Shape: (1, 1)
            next_token_str = self.tokenizer.convert_ids_to_tokens([next_token_id.item()])[0]

            generated_tokens.append(next_token_id.item())

            # Append the next token to input_ids and update attention_mask
            inputs["input_ids"] = torch.cat([inputs["input_ids"], next_token_id], dim=-1)
            inputs["attention_mask"] = torch.cat([inputs["attention_mask"], torch.ones((1, 1), dtype=inputs["attention_mask"].dtype).to(self.device)], dim=-1)

        print(self.tokenizer.decode(generated_tokens))

        # Now take all the input ids and run them through sglang as a comparison
        async with AsyncClient(timeout=600) as session:
            query["temperature"] = 0.0
            query["max_tokens"] = max_steps
            query["logprobs"] = True
            query["top_logprobs"] = 5
            COMPLETION_URL = f"http://localhost:{30000}/v1/chat/completions"
            response = await session.post(COMPLETION_URL, json=query)

        response_data = response.json()

        for step, lptok in enumerate(response_data["choices"][0]["logprobs"]["content"]):
            print("\nTop 5 tokens and their log probabilities:")
            (topk_tokens, topk_log_probs) = top_logprobs_hf[step]
            for token, log_prob, lptokcur in zip(topk_tokens, topk_log_probs, lptok["top_logprobs"]):
                print(
                    f"HF Token: {token} Log Prob: {log_prob:.2f} Prob {math.exp(log_prob)*100:.2f}% SGLANG Token {lptokcur['token']} Logprob {lptokcur['logprob']:.2f} Prob {math.exp(lptokcur['logprob'])*100:.2f}%"
                )

    async def asyncTearDown(self):
        # Clean up the model and tokenizer
        del self.model
        del self.tokenizer
        torch.cuda.empty_cache()


@pytest.mark.nonci
class RawSGLangTest(unittest.IsolatedAsyncioTestCase):
    def setUp(self):
        # Set up the Hugging Face model and tokenizer
        model_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "olmocr", "model")
        download_directory([MODEL_FINETUNED_PATH], model_cache_dir)

        # Check the rope config and make sure it's got the proper key
        with open(os.path.join(model_cache_dir, "config.json"), "r") as cfin:
            config_data = json.load(cfin)

        if "rope_type" in config_data["rope_scaling"]:
            del config_data["rope_scaling"]["rope_type"]
            config_data["rope_scaling"]["type"] = "mrope"

            with open(os.path.join(model_cache_dir, "config.json"), "w") as cfout:
                json.dump(config_data, cfout)

        self.model_cache_dir = model_cache_dir

        self.tokenizer = AutoTokenizer.from_pretrained(model_cache_dir, trust_remote_code=True)
        self.image_token_id = self.tokenizer.encode("<|image_pad|>")[0]

        self.model = Qwen2VLForConditionalGeneration.from_pretrained(model_cache_dir, torch_dtype=torch.bfloat16, trust_remote_code=True).eval()
        self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Path to the test PDF
        self.test_pdf_path = Path(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ambiguous.pdf"))
        self.maxDiff = None

    async def test_vision_encoder(self):
        query = await build_page_query(
            str(self.test_pdf_path),
            page=1,
            target_longest_image_dim=1024,
            target_anchor_text_len=6000,
        )

        messages = query["messages"]

        # Apply chat template to get the text
        text = self.processor.apply_chat_template(query["messages"], tokenize=False, add_generation_prompt=True)

        image_url = query["messages"][0]["content"][1]["image_url"]["url"]

        # Remove the "data:image/png;base64," prefix
        base64_image = image_url.split(",")[1]

        # Decode the base64 string into bytes
        image_data = base64.b64decode(base64_image)

        # Create a BytesIO object and load it into a PIL image
        main_image = Image.open(BytesIO(image_data))

        # Process inputs using processor
        inputs = self.processor(
            text=[text],
            images=[main_image],
            padding=True,
            return_tensors="pt",
        )

        with torch.no_grad():
            hf_output = self.model.visual(inputs["pixel_values"].to(self.device), grid_thw=inputs["image_grid_thw"].to(self.device))

        print("HF", hf_output, hf_output.shape)

        from sglang.srt.configs.model_config import ModelConfig
        from sglang.srt.hf_transformers_utils import get_tokenizer
        from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
        from sglang.srt.model_executor.forward_batch_info import ForwardBatch
        from sglang.srt.model_executor.model_runner import ModelRunner
        from sglang.srt.sampling.sampling_params import SamplingParams
        from sglang.srt.server_args import PortArgs, ServerArgs

        model_config = ModelConfig(self.model_cache_dir, model_override_args="{}")

        server_args = ServerArgs(model_path=self.model_cache_dir)
        # Initialize model runner
        model_runner = ModelRunner(
            model_config=model_config,
            mem_fraction_static=0.8,
            gpu_id=0,
            tp_rank=0,
            tp_size=1,
            nccl_port=12435,
            server_args=server_args,
        )

        print(model_runner)
        with torch.no_grad():
            sglang_output = model_runner.model.visual(inputs["pixel_values"].to(self.device), grid_thw=inputs["image_grid_thw"].to(self.device))

        print("SGLANG", sglang_output, sglang_output.shape)

        # Convert to float32 for numerical stability if needed
        hf = hf_output.float()
        sg = sglang_output.float()

        # Basic shape and dtype comparison
        print("\n=== Basic Properties ===")
        print(f"Shapes match: {hf.shape == sg.shape}")
        print(f"HF shape: {hf.shape}, SGLang shape: {sg.shape}")
        print(f"HF dtype: {hf.dtype}, SGLang dtype: {sg.dtype}")

        # Move tensors to CPU for numpy operations
        hf_np = hf.cpu().numpy()
        sg_np = sg.cpu().numpy()

        # Statistical metrics
        print("\n=== Statistical Metrics ===")
        print(f"Mean absolute difference: {torch.mean(torch.abs(hf - sg)).item():.6f}")
        print(f"Max absolute difference: {torch.max(torch.abs(hf - sg)).item():.6f}")
        print(f"Mean squared error: {torch.mean((hf - sg) ** 2).item():.6f}")
        print(f"Root mean squared error: {torch.sqrt(torch.mean((hf - sg) ** 2)).item():.6f}")

        # Cosine similarity (across feature dimension)
        cos_sim = F.cosine_similarity(hf, sg)
        print(f"Mean cosine similarity: {torch.mean(cos_sim).item():.6f}")
        print(f"Min cosine similarity: {torch.min(cos_sim).item():.6f}")

        # Find largest absolute differences
        print("\n=== Largest Absolute Differences ===")
        diffs = torch.abs(hf - sg)
        flat_diffs = diffs.flatten()

        # Get indices of top 10 differences
        top_k = 10
        top_values, top_flat_indices = torch.topk(flat_diffs, top_k)

        # Convert flat indices to multidimensional indices
        top_indices = np.unravel_index(top_flat_indices.cpu().numpy(), diffs.shape)

        print(f"\nTop {top_k} largest absolute differences:")
        print("Index".ljust(30) + "Difference".ljust(15) + "HF Value".ljust(15) + "SGLang Value")
        print("-" * 75)

        for i in range(top_k):
            # Get the index tuple for this difference
            idx = tuple(dim[i] for dim in top_indices)
            diff_val = top_values[i].item()
            hf_val = hf[idx].item()
            sg_val = sg[idx].item()

            # Format the index tuple and values
            idx_str = str(idx)
            print(f"{idx_str:<30}{diff_val:<15.6f}{hf_val:<15.6f}{sg_val:.6f}")