Jake Poznanski 2025-10-15 21:14:53 +00:00
parent 5695e46a21
commit 80f18cc2bc
6 changed files with 34 additions and 43 deletions

View File

@@ -209,6 +209,20 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs tests/gnarly_pdfs/*
With the addition of the `--markdown` flag, results will be stored as markdown files inside of `./localworkspace/markdown/`.
+#### Viewing Results
+The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
+```bash
+cat localworkspace/markdown/olmocr-sample.md
+```
+```
+olmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models
+...
+```
### Using an Inference Provider or External Server
If you have a vLLM server already running elsewhere (or any inference platform implementing the OpenAI API), you can point olmOCR to use it instead of spawning a local instance:
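For illustration only (this command is not part of the diff): pointing the pipeline at an already-running OpenAI-compatible endpoint generally means passing the server URL plus, for authenticated providers, the `--api_key` flag that appears in pipeline.py's server argument group later in this commit. The `--server` flag name and the URL below are assumptions, not values taken from this diff:

```bash
# Sketch only: --server is an assumed flag name, and the endpoint URL and key are placeholders.
# --api_key is shown in pipeline.py's server argument group elsewhere in this commit.
python -m olmocr.pipeline ./localworkspace \
  --server http://remote-host:30024/v1 \
  --api_key "$OPENAI_COMPATIBLE_API_KEY" \
  --markdown \
  --pdfs tests/gnarly_pdfs/*.pdf
```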
@@ -241,20 +255,6 @@ Notes on arguments
- Other arguments work the same as with local inference
-#### Viewing Results
-The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
-```bash
-cat localworkspace/markdown/olmocr-sample.md
-```
-```
-olmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models
-...
-```
### Multi-node / Cluster Usage
If you want to convert millions of PDFs, using multiple nodes running in parallel, then olmOCR supports
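The multi-node sentence above is truncated at the hunk boundary, and the details live outside this diff. Purely as a sketch of the usual pattern (bucket and path names are hypothetical), each node is pointed at the same shared S3 workspace and given the identical command, so the nodes draw work items from one queue:

```bash
# Hypothetical sketch: run the identical command on every node; the shared S3 workspace
# acts as the common work queue. Bucket and path names are placeholders, not from this diff.
python -m olmocr.pipeline s3://my-bucket/pdf-workspace \
  --markdown \
  --pdfs s3://my-bucket/input-pdfs/*.pdf
```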

View File

@@ -1,7 +1,5 @@
import argparse
import asyncio
-import base64
-import tempfile
import glob
import hashlib
import json
@@ -10,6 +8,7 @@ import os
import random
import re
import subprocess
+import tempfile
import uuid
from collections import defaultdict
from typing import Dict, List
@@ -37,13 +36,7 @@ total_output_tokens = 0
def get_git_commit_hash():
    """Get the current git commit hash, if available."""
    try:
-        result = subprocess.run(
-            ["git", "rev-parse", "HEAD"],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            check=True
-        )
+        result = subprocess.run(["git", "rev-parse", "HEAD"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
        return result.stdout.strip()
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Git not available or not a git repository
@@ -427,7 +420,7 @@ async def generate_html_from_image(client, image_base64):
    )
    # Check if response was complete
-    if hasattr(analysis_response, 'stop_reason') and analysis_response.stop_reason != 'end_turn':
+    if hasattr(analysis_response, "stop_reason") and analysis_response.stop_reason != "end_turn":
        print(f"Warning: Analysis response incomplete (stop_reason: {analysis_response.stop_reason})")
        return None
@@ -472,7 +465,7 @@ async def generate_html_from_image(client, image_base64):
    )
    # Check if response was complete
-    if hasattr(initial_response, 'stop_reason') and initial_response.stop_reason != 'end_turn':
+    if hasattr(initial_response, "stop_reason") and initial_response.stop_reason != "end_turn":
        print(f"Warning: Initial HTML response incomplete (stop_reason: {initial_response.stop_reason})")
        return None
@@ -492,7 +485,6 @@ async def generate_html_from_image(client, image_base64):
        print("Warning: No HTML code block found in initial response")
        return None
    # Step 3: Render the initial HTML to PDF and then back to PNG for comparison
    # Create a temporary PDF file
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
@@ -520,15 +512,15 @@ async def generate_html_from_image(client, image_base64):
            model="claude-sonnet-4-5-20250929",
            max_tokens=40000,
            temperature=1.0,
-            thinking={
-                "type": "enabled",
-                "budget_tokens": 12000
-            },
+            thinking={"type": "enabled", "budget_tokens": 12000},
            messages=[
                {
                    "role": "user",
                    "content": [
-                        {"type": "text", "text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original."},
+                        {
+                            "type": "text",
+                            "text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original.",
+                        },
                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
                        {"type": "text", "text": "Above is the ORIGINAL document."},
                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": rendered_image_base64}},
@@ -546,7 +538,7 @@ async def generate_html_from_image(client, image_base64):
                            f"The webpage will be viewed at {png_width}x{png_height} pixels.\n\n"
                            "Provide a REVISED version of the HTML that corrects any issues you identified. "
                            "Make sure all important elements are visible and the layout matches the original as closely as possible.\n"
-                            "Output the complete revised HTML in a ```html code block."
+                            "Output the complete revised HTML in a ```html code block.",
                        },
                    ],
                }
@@ -559,7 +551,7 @@ async def generate_html_from_image(client, image_base64):
    refinement_response = await refinement_stream.get_final_message()
    # Check if refinement response was complete
-    if hasattr(refinement_response, 'stop_reason') and refinement_response.stop_reason != 'end_turn':
+    if hasattr(refinement_response, "stop_reason") and refinement_response.stop_reason != "end_turn":
        print(f"Warning: Refinement response incomplete (stop_reason: {refinement_response.stop_reason})")
        # Return initial HTML as fallback since it was complete
        return initial_html
@@ -1022,7 +1014,6 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, rand
    # So add in the bulk of the test cases back in now
    tests.extend(table_tests)
    # Step 3: Generate TextPresenceTests and OrderingTests from markdown content
    # Convert HTML to markdown to get cleaner text for presence and ordering tests
    markdown_content = html_to_markdown_with_frontmatter(html_content)

View File

@@ -13,11 +13,12 @@ import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, Optional, Set, Tuple
-from olmocr.image_utils import convert_image_to_pdf_bytes
import requests
from tqdm import tqdm
+from olmocr.image_utils import convert_image_to_pdf_bytes
def fix_image_url(url: str) -> str:
    """Fix image URL to use full resolution instead of percentage-based sizing."""

View File

@@ -99,6 +99,7 @@ from tqdm import tqdm
from olmocr.image_utils import convert_image_to_pdf_bytes
def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool:
    """Download image from URL with exponential backoff retry logic."""
    for attempt in range(max_retries):

View File

@@ -113,7 +113,7 @@ def parse_jsonl_entry(entry: Dict) -> Optional[Dict]:
            "source_file": source_file,
            "metadata": metadata,
            "pdf_page_numbers": pdf_page_numbers,
-            "page_response_data": page_response_data
+            "page_response_data": page_response_data,
        }
    except Exception as e:
        logger.error(f"Error parsing JSONL entry: {e}")

View File

@@ -1121,7 +1121,6 @@ async def main():
    )
    server_group.add_argument("--api_key", type=str, default=None, help="API key for authenticated remote servers (e.g., DeepInfra)")
    vllm_group = parser.add_argument_group(
        "VLLM arguments", "These arguments are passed to vLLM. Any unrecognized arguments are also automatically forwarded to vLLM."
    )
@@ -1133,7 +1132,6 @@ async def main():
    vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM")
    vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server")
    # Beaker/job running stuff
    beaker_group = parser.add_argument_group("beaker/cluster execution")
    beaker_group.add_argument("--beaker", action="store_true", help="Submit this job to beaker instead of running locally")