Mirror of https://github.com/allenai/olmocr.git (synced 2025-10-20 12:39:23 +00:00)
Fixes

This commit is contained in:
parent 5695e46a21
commit 80f18cc2bc
README.md: 28 changed lines (+14, -14)
@@ -209,6 +209,20 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs tests/gnarly_pdfs/*
 
 With the addition of the `--markdown` flag, results will be stored as markdown files inside of `./localworkspace/markdown/`.
 
+#### Viewing Results
+
+The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
+
+```bash
+cat localworkspace/markdown/olmocr-sample.md
+```
+
+```
+olmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models
+...
+```
+
+
 ### Using an Inference Provider or External Server
 
 If you have a vLLM server already running elsewhere (or any inference platform implementing the OpenAI API), you can point olmOCR to use it instead of spawning a local instance:
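The hunk's context ends just before the README's example invocation. As a hedged sketch only (the `--server` flag and the host URL are assumptions; `--api_key` and the default port 30024 do appear in the argparse hunks further down in this diff), pointing olmOCR at an external OpenAI-compatible endpoint would look roughly like:

```bash
# Sketch, not verbatim README content: use an already-running
# OpenAI-compatible server instead of spawning a local vLLM instance.
# --server and the host are assumptions; --api_key and port 30024 are
# taken from the argparse definitions later in this diff.
python -m olmocr.pipeline ./localworkspace \
    --server http://remote-host:30024 \
    --api_key "$API_KEY" \
    --markdown \
    --pdfs tests/gnarly_pdfs/*.pdf
```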
@@ -241,20 +255,6 @@ Notes on arguments
 
 - Other arguments work the same as with local inference
 
-#### Viewing Results
-
-The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
-
-```bash
-cat localworkspace/markdown/olmocr-sample.md
-```
-
-```
-olmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models
-...
-```
-
-
 ### Multi-node / Cluster Usage
 
 If you want to convert millions of PDFs, using multiple nodes running in parallel, then olmOCR supports
@@ -1,7 +1,5 @@
 import argparse
 import asyncio
 import base64
-import tempfile
 import glob
 import hashlib
 import json
@@ -10,6 +8,7 @@ import os
 import random
 import re
 import subprocess
+import tempfile
 import uuid
 from collections import defaultdict
 from typing import Dict, List
@@ -37,13 +36,7 @@ total_output_tokens = 0
 def get_git_commit_hash():
     """Get the current git commit hash, if available."""
     try:
-        result = subprocess.run(
-            ["git", "rev-parse", "HEAD"],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            check=True
-        )
+        result = subprocess.run(["git", "rev-parse", "HEAD"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
         return result.stdout.strip()
     except (subprocess.CalledProcessError, FileNotFoundError):
         # Git not available or not a git repository
@@ -427,7 +420,7 @@ async def generate_html_from_image(client, image_base64):
         )
 
         # Check if response was complete
-        if hasattr(analysis_response, 'stop_reason') and analysis_response.stop_reason != 'end_turn':
+        if hasattr(analysis_response, "stop_reason") and analysis_response.stop_reason != "end_turn":
             print(f"Warning: Analysis response incomplete (stop_reason: {analysis_response.stop_reason})")
             return None
 
@@ -472,7 +465,7 @@ async def generate_html_from_image(client, image_base64):
         )
 
         # Check if response was complete
-        if hasattr(initial_response, 'stop_reason') and initial_response.stop_reason != 'end_turn':
+        if hasattr(initial_response, "stop_reason") and initial_response.stop_reason != "end_turn":
             print(f"Warning: Initial HTML response incomplete (stop_reason: {initial_response.stop_reason})")
             return None
 
@@ -492,7 +485,6 @@ async def generate_html_from_image(client, image_base64):
             print("Warning: No HTML code block found in initial response")
             return None
 
-
         # Step 3: Render the initial HTML to PDF and then back to PNG for comparison
         # Create a temporary PDF file
         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
@@ -520,15 +512,15 @@ async def generate_html_from_image(client, image_base64):
             model="claude-sonnet-4-5-20250929",
             max_tokens=40000,
             temperature=1.0,
-            thinking={
-                "type": "enabled",
-                "budget_tokens": 12000
-            },
+            thinking={"type": "enabled", "budget_tokens": 12000},
             messages=[
                 {
                     "role": "user",
                     "content": [
-                        {"type": "text", "text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original."},
+                        {
+                            "type": "text",
+                            "text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original.",
+                        },
                         {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
                         {"type": "text", "text": "Above is the ORIGINAL document."},
                         {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": rendered_image_base64}},
@@ -546,7 +538,7 @@ async def generate_html_from_image(client, image_base64):
                             f"The webpage will be viewed at {png_width}x{png_height} pixels.\n\n"
                             "Provide a REVISED version of the HTML that corrects any issues you identified. "
                             "Make sure all important elements are visible and the layout matches the original as closely as possible.\n"
-                            "Output the complete revised HTML in a ```html code block."
+                            "Output the complete revised HTML in a ```html code block.",
                         },
                     ],
                 }
@@ -559,7 +551,7 @@ async def generate_html_from_image(client, image_base64):
         refinement_response = await refinement_stream.get_final_message()
 
         # Check if refinement response was complete
-        if hasattr(refinement_response, 'stop_reason') and refinement_response.stop_reason != 'end_turn':
+        if hasattr(refinement_response, "stop_reason") and refinement_response.stop_reason != "end_turn":
             print(f"Warning: Refinement response incomplete (stop_reason: {refinement_response.stop_reason})")
             # Return initial HTML as fallback since it was complete
             return initial_html
@@ -1022,7 +1014,6 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, rand
     # So add in the bulk of the test cases back in now
     tests.extend(table_tests)
 
-
     # Step 3: Generate TextPresenceTests and OrderingTests from markdown content
     # Convert HTML to markdown to get cleaner text for presence and ordering tests
     markdown_content = html_to_markdown_with_frontmatter(html_content)
@@ -1290,7 +1281,7 @@ async def process_pdf(pdf_info, args, client, pdf_filter=None):
     if not html_content:
         print(f"Failed to generate HTML for {pdf_path}, page {page_num}")
         return None
 
     # Add git commit meta tag if available
     git_commit = get_git_commit_hash()
     if git_commit:
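The hunk above cuts off before the stamping logic. Purely as an illustrative sketch (the helper below is hypothetical, not code from this commit), embedding the hash as a `<meta>` tag could look like:

```python
# Hypothetical helper, not part of this diff: stamp a commit hash into
# generated HTML as a <meta> tag inside <head>.
def add_commit_meta(html: str, commit: str) -> str:
    meta = f'<meta name="olmocr-git-commit" content="{commit}">'
    # Insert right after the opening <head>; leave the HTML unchanged if absent.
    return html.replace("<head>", f"<head>\n    {meta}", 1)
```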
@@ -13,11 +13,12 @@ import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import Dict, Optional, Set, Tuple
-from olmocr.image_utils import convert_image_to_pdf_bytes
 
 import requests
 from tqdm import tqdm
 
+from olmocr.image_utils import convert_image_to_pdf_bytes
+
 
 def fix_image_url(url: str) -> str:
     """Fix image URL to use full resolution instead of percentage-based sizing."""
@@ -99,6 +99,7 @@ from tqdm import tqdm
 
+from olmocr.image_utils import convert_image_to_pdf_bytes
 
 
 def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool:
     """Download image from URL with exponential backoff retry logic."""
     for attempt in range(max_retries):
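The body of `download_image` is elided by the hunk; below is a self-contained sketch of the retry-with-exponential-backoff pattern its docstring names (the delay schedule, jitter, and timeout are assumptions, not this file's actual values):

```python
# Generic exponential-backoff download sketch, not the diff's implementation.
import random
import time
from pathlib import Path

import requests


def download_with_backoff(url: str, output_path: Path, max_retries: int = 5) -> bool:
    for attempt in range(max_retries):
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            output_path.write_bytes(resp.content)
            return True
        except requests.RequestException:
            if attempt == max_retries - 1:
                return False
            # Sleep 2^attempt seconds plus jitter before retrying.
            time.sleep(2**attempt + random.random())
    return False
```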
@@ -113,7 +113,7 @@ def parse_jsonl_entry(entry: Dict) -> Optional[Dict]:
             "source_file": source_file,
             "metadata": metadata,
             "pdf_page_numbers": pdf_page_numbers,
-            "page_response_data": page_response_data
+            "page_response_data": page_response_data,
         }
     except Exception as e:
         logger.error(f"Error parsing JSONL entry: {e}")
@@ -214,17 +214,17 @@ async def apost(url, json_data, api_key=None):
                 # Read chunk size line
                 size_line = await reader.readline()
                 chunk_size = int(size_line.strip(), 16)  # Hex format
 
                 if chunk_size == 0:
                     await reader.readline()  # Read final CRLF
                     break
 
                 chunk_data = await reader.readexactly(chunk_size)
                 chunks.append(chunk_data)
 
                 # Read trailing CRLF after chunk data
                 await reader.readline()
 
             response_body = b"".join(chunks)
         elif headers.get("connection", "") == "close":
             # Read until connection closes
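For reference, the loop above parses HTTP/1.1 chunked transfer encoding: a hex chunk-size line, the chunk payload, a trailing CRLF, and a zero-size chunk as terminator. A minimal synchronous sketch of the same decode (a standalone helper, not code from this commit; chunk extensions are not handled):

```python
def decode_chunked(raw: bytes) -> bytes:
    """Decode an HTTP/1.1 chunked body; mirrors the async loop above."""
    body = bytearray()
    pos = 0
    while True:
        eol = raw.index(b"\r\n", pos)
        size = int(raw[pos:eol], 16)  # chunk-size line is hex-encoded
        pos = eol + 2  # skip the size line's CRLF
        if size == 0:
            break  # zero-size chunk terminates the body
        body += raw[pos : pos + size]
        pos += size + 2  # skip the payload and its trailing CRLF
    return bytes(body)

# Example: decode_chunked(b"4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n") == b"Wikipedia"
```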
@@ -1121,7 +1121,6 @@ async def main():
     )
     server_group.add_argument("--api_key", type=str, default=None, help="API key for authenticated remote servers (e.g., DeepInfra)")
 
-
     vllm_group = parser.add_argument_group(
         "VLLM arguments", "These arguments are passed to vLLM. Any unrecognized arguments are also automatically forwarded to vLLM."
     )
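The vLLM group's help text says unrecognized arguments are forwarded to vLLM; the standard argparse idiom for that is `parse_known_args` (a generic sketch, not this file's code):

```python
import argparse

# Sketch of the pass-through idiom: parse_known_args() returns the
# recognized namespace plus the leftover argv tokens, which a caller
# can forward to vLLM untouched.
parser = argparse.ArgumentParser()
vllm_group = parser.add_argument_group("VLLM arguments")
vllm_group.add_argument("--port", type=int, default=30024)

args, unknown_args = parser.parse_known_args()
print(args.port, unknown_args)
```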
@@ -1133,7 +1132,6 @@ async def main():
     vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM")
     vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server")
 
-
     # Beaker/job running stuff
     beaker_group = parser.add_argument_group("beaker/cluster execution")
     beaker_group.add_argument("--beaker", action="store_true", help="Submit this job to beaker instead of running locally")