Jake Poznanski 2025-10-15 21:14:53 +00:00
parent 5695e46a21
commit 80f18cc2bc
6 changed files with 34 additions and 43 deletions

View File

@@ -209,6 +209,20 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs tests/gnarly_pdfs/*
With the addition of the `--markdown` flag, results will be stored as markdown files inside `./localworkspace/markdown/`.
#### Viewing Results
The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
```bash
cat localworkspace/markdown/olmocr-sample.md
```
```
olmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models
...
```
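The Dolma half of the output is JSONL, one record per document. A quick way to peek at a record (a sketch: the `results/output_*.jsonl` naming assumes the default workspace layout):
```bash
cat localworkspace/results/output_*.jsonl | head -n 1 | python -m json.tool
```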
### Using an Inference Provider or External Server
If you have a vLLM server already running elsewhere (or any inference platform implementing the OpenAI API), you can point olmOCR to use it instead of spawning a local instance:
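For example (a sketch, not verbatim from the docs: the endpoint URL is a placeholder, and while `--api_key` appears in this commit's argument parser below, the exact `--server` flag name is an assumption):
```bash
python -m olmocr.pipeline ./localworkspace --markdown \
  --server http://your-inference-host:30024/v1 \
  --api_key $YOUR_API_KEY \
  --pdfs tests/gnarly_pdfs/*.pdf
```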
@@ -241,20 +255,6 @@ Notes on arguments
- Other arguments work the same as with local inference
#### Viewing Results
The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
```bash
cat localworkspace/markdown/olmocr-sample.md
```
```
olmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models
...
```
### Multi-node / Cluster Usage
If you want to convert millions of PDFs across multiple nodes running in parallel, olmOCR supports

View File

@@ -1,7 +1,5 @@
import argparse
import asyncio
import base64
import tempfile
import glob
import hashlib
import json
@@ -10,6 +8,7 @@ import os
import random
import re
import subprocess
import tempfile
import uuid
from collections import defaultdict
from typing import Dict, List
@@ -37,13 +36,7 @@ total_output_tokens = 0
def get_git_commit_hash():
"""Get the current git commit hash, if available."""
try:
result = subprocess.run(
["git", "rev-parse", "HEAD"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True
)
result = subprocess.run(["git", "rev-parse", "HEAD"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
return result.stdout.strip()
except (subprocess.CalledProcessError, FileNotFoundError):
# Git not available or not a git repository
@@ -427,7 +420,7 @@ async def generate_html_from_image(client, image_base64):
)
# Check if response was complete
if hasattr(analysis_response, 'stop_reason') and analysis_response.stop_reason != 'end_turn':
if hasattr(analysis_response, "stop_reason") and analysis_response.stop_reason != "end_turn":
print(f"Warning: Analysis response incomplete (stop_reason: {analysis_response.stop_reason})")
return None
@@ -472,7 +465,7 @@ async def generate_html_from_image(client, image_base64):
)
# Check if response was complete
if hasattr(initial_response, 'stop_reason') and initial_response.stop_reason != 'end_turn':
if hasattr(initial_response, "stop_reason") and initial_response.stop_reason != "end_turn":
print(f"Warning: Initial HTML response incomplete (stop_reason: {initial_response.stop_reason})")
return None
@@ -492,7 +485,6 @@ async def generate_html_from_image(client, image_base64):
print("Warning: No HTML code block found in initial response")
return None
# Step 3: Render the initial HTML to PDF and then back to PNG for comparison
# Create a temporary PDF file
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
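The rendering calls themselves fall outside this hunk. A minimal sketch of the HTML → PDF → PNG roundtrip the comment describes, assuming `weasyprint` and `pdf2image` as stand-in renderers (the script's actual rendering dependencies are not shown here):
```python
import base64
import io
import tempfile

from pdf2image import convert_from_path  # requires poppler to be installed
from weasyprint import HTML


def render_html_to_png_base64(html: str, dpi: int = 150) -> str:
    """Render HTML to a one-page PDF, rasterize page 1, return base64 PNG."""
    with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp_pdf:
        HTML(string=html).write_pdf(tmp_pdf.name)
        page = convert_from_path(tmp_pdf.name, dpi=dpi)[0]
        buf = io.BytesIO()
        page.save(buf, format="PNG")
        return base64.b64encode(buf.getvalue()).decode("ascii")
```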
@@ -520,15 +512,15 @@ async def generate_html_from_image(client, image_base64):
model="claude-sonnet-4-5-20250929",
max_tokens=40000,
temperature=1.0,
thinking={
"type": "enabled",
"budget_tokens": 12000
},
thinking={"type": "enabled", "budget_tokens": 12000},
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original."},
{
"type": "text",
"text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original.",
},
{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
{"type": "text", "text": "Above is the ORIGINAL document."},
{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": rendered_image_base64}},
@@ -546,7 +538,7 @@ async def generate_html_from_image(client, image_base64):
f"The webpage will be viewed at {png_width}x{png_height} pixels.\n\n"
"Provide a REVISED version of the HTML that corrects any issues you identified. "
"Make sure all important elements are visible and the layout matches the original as closely as possible.\n"
"Output the complete revised HTML in a ```html code block."
"Output the complete revised HTML in a ```html code block.",
},
],
}
@@ -559,7 +551,7 @@ async def generate_html_from_image(client, image_base64):
refinement_response = await refinement_stream.get_final_message()
# Check if refinement response was complete
if hasattr(refinement_response, 'stop_reason') and refinement_response.stop_reason != 'end_turn':
if hasattr(refinement_response, "stop_reason") and refinement_response.stop_reason != "end_turn":
print(f"Warning: Refinement response incomplete (stop_reason: {refinement_response.stop_reason})")
# Return initial HTML as fallback since it was complete
return initial_html
@@ -1022,7 +1014,6 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, rand
# So add in the bulk of the test cases back in now
tests.extend(table_tests)
# Step 3: Generate TextPresenceTests and OrderingTests from markdown content
# Convert HTML to markdown to get cleaner text for presence and ordering tests
markdown_content = html_to_markdown_with_frontmatter(html_content)
@@ -1290,7 +1281,7 @@ async def process_pdf(pdf_info, args, client, pdf_filter=None):
if not html_content:
print(f"Failed to generate HTML for {pdf_path}, page {page_num}")
return None
# Add git commit meta tag if available
git_commit = get_git_commit_hash()
if git_commit:

View File

@@ -13,11 +13,12 @@ import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, Optional, Set, Tuple
from olmocr.image_utils import convert_image_to_pdf_bytes
import requests
from tqdm import tqdm
from olmocr.image_utils import convert_image_to_pdf_bytes
def fix_image_url(url: str) -> str:
"""Fix image URL to use full resolution instead of percentage-based sizing."""

View File

@@ -99,6 +99,7 @@ from tqdm import tqdm
from olmocr.image_utils import convert_image_to_pdf_bytes
def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool:
"""Download image from URL with exponential backoff retry logic."""
for attempt in range(max_retries):
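Only the loop header survives the hunk boundary. A self-contained sketch of the exponential-backoff pattern the docstring names (the delay constants and jitter are assumptions):
```python
import random
import time
from pathlib import Path

import requests


def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool:
    """Download a file, retrying on failure with exponential backoff."""
    for attempt in range(max_retries):
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            output_path.write_bytes(resp.content)
            return True
        except requests.RequestException:
            if attempt == max_retries - 1:
                return False
            # Wait 1s, 2s, 4s, ... plus jitter so parallel workers desynchronize.
            time.sleep(2**attempt + random.random())
    return False
```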

View File

@@ -113,7 +113,7 @@ def parse_jsonl_entry(entry: Dict) -> Optional[Dict]:
"source_file": source_file,
"metadata": metadata,
"pdf_page_numbers": pdf_page_numbers,
"page_response_data": page_response_data
"page_response_data": page_response_data,
}
except Exception as e:
logger.error(f"Error parsing JSONL entry: {e}")

View File

@@ -214,17 +214,17 @@ async def apost(url, json_data, api_key=None):
# Read chunk size line
size_line = await reader.readline()
chunk_size = int(size_line.strip(), 16) # Hex format
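# A zero-size chunk terminates a chunked HTTP body (RFC 9112 §7.1)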
if chunk_size == 0:
await reader.readline() # Read final CRLF
break
chunk_data = await reader.readexactly(chunk_size)
chunks.append(chunk_data)
# Read trailing CRLF after chunk data
await reader.readline()
response_body = b"".join(chunks)
elif headers.get("connection", "") == "close":
# Read until connection closes
@@ -1121,7 +1121,6 @@ async def main():
)
server_group.add_argument("--api_key", type=str, default=None, help="API key for authenticated remote servers (e.g., DeepInfra)")
vllm_group = parser.add_argument_group(
"VLLM arguments", "These arguments are passed to vLLM. Any unrecognized arguments are also automatically forwarded to vLLM."
)
@@ -1133,7 +1132,6 @@ async def main():
vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM")
vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server")
# Beaker/job running stuff
beaker_group = parser.add_argument_group("beaker/cluster execution")
beaker_group.add_argument("--beaker", action="store_true", help="Submit this job to beaker instead of running locally")