mirror of https://github.com/allenai/olmocr.git
synced 2025-10-21 05:00:06 +00:00

Fixes

This commit is contained in:
parent 5695e46a21
commit 80f18cc2bc

README.md (28 changed lines)
@@ -209,6 +209,20 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs tests/gnarly_pdfs/*
 With the addition of the `--markdown` flag, results will be stored as markdown files inside of `./localworkspace/markdown/`.
 
+
+#### Viewing Results
+
+The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
+
+```bash
+cat localworkspace/markdown/olmocr-sample.md
+```
+
+```
+olmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models
+...
+```
 
 ### Using an Inference Provider or External Server
 
 If you have a vLLM server already running elsewhere (or any inference platform implementing the OpenAI API), you can point olmOCR to use it instead of spawning a local instance:
 
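For context on the "OpenAI API" phrasing in the section above: any server exposing that API, such as vLLM's `/v1` endpoint, can also be queried directly. A minimal sketch with the `openai` client follows; the host, port, and model name are placeholders for illustration, not values taken from this repo (olmOCR's pipeline does this wiring for you).

```python
# Sketch: query an OpenAI-compatible endpoint directly.
# Host, port, API key, and model name below are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://my-vllm-host:30024/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="allenai/olmOCR-7B-0825",  # whichever model the server is hosting
    messages=[{"role": "user", "content": "Hello"}],
)
print(response.choices[0].message.content)
```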
@@ -241,20 +255,6 @@ Notes on arguments
 - Other arguments work the same as with local inference
 
-
-#### Viewing Results
-
-The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
-
-```bash
-cat localworkspace/markdown/olmocr-sample.md
-```
-
-```
-olmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models
-...
-```
 
 ### Multi-node / Cluster Usage
 
 If you want to convert millions of PDFs, using multiple nodes running in parallel, then olmOCR supports
 
@@ -1,7 +1,5 @@
 import argparse
 import asyncio
-import base64
-import tempfile
 import glob
 import hashlib
 import json
@@ -10,6 +8,7 @@ import os
 import random
 import re
 import subprocess
+import tempfile
 import uuid
 from collections import defaultdict
 from typing import Dict, List
@@ -37,13 +36,7 @@ total_output_tokens = 0
 def get_git_commit_hash():
     """Get the current git commit hash, if available."""
     try:
-        result = subprocess.run(
-            ["git", "rev-parse", "HEAD"],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            check=True
-        )
+        result = subprocess.run(["git", "rev-parse", "HEAD"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
         return result.stdout.strip()
     except (subprocess.CalledProcessError, FileNotFoundError):
         # Git not available or not a git repository
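The hunk above truncates inside the `except` branch; presumably it falls through to returning `None`. A self-contained sketch of the collapsed helper, with that assumption made explicit:

```python
# Self-contained sketch of the helper above; the final `return None`
# is an assumption, since the hunk truncates inside the except branch.
import subprocess

def get_git_commit_hash():
    """Get the current git commit hash, if available."""
    try:
        result = subprocess.run(["git", "rev-parse", "HEAD"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
        return result.stdout.strip()
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Git not available or not a git repository
        return None

print(get_git_commit_hash() or "no git commit available")
```

`check=True` is what makes catching `CalledProcessError` meaningful here: without it, a nonzero exit would return an empty string instead of raising.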
@@ -427,7 +420,7 @@ async def generate_html_from_image(client, image_base64):
         )
 
         # Check if response was complete
-        if hasattr(analysis_response, 'stop_reason') and analysis_response.stop_reason != 'end_turn':
+        if hasattr(analysis_response, "stop_reason") and analysis_response.stop_reason != "end_turn":
             print(f"Warning: Analysis response incomplete (stop_reason: {analysis_response.stop_reason})")
             return None
 
@@ -472,7 +465,7 @@ async def generate_html_from_image(client, image_base64):
         )
 
         # Check if response was complete
-        if hasattr(initial_response, 'stop_reason') and initial_response.stop_reason != 'end_turn':
+        if hasattr(initial_response, "stop_reason") and initial_response.stop_reason != "end_turn":
             print(f"Warning: Initial HTML response incomplete (stop_reason: {initial_response.stop_reason})")
             return None
 
@@ -492,7 +485,6 @@ async def generate_html_from_image(client, image_base64):
             print("Warning: No HTML code block found in initial response")
             return None
 
-
         # Step 3: Render the initial HTML to PDF and then back to PNG for comparison
         # Create a temporary PDF file
         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
@@ -520,15 +512,15 @@ async def generate_html_from_image(client, image_base64):
             model="claude-sonnet-4-5-20250929",
             max_tokens=40000,
             temperature=1.0,
-            thinking={
-                "type": "enabled",
-                "budget_tokens": 12000
-            },
+            thinking={"type": "enabled", "budget_tokens": 12000},
             messages=[
                 {
                     "role": "user",
                     "content": [
-                        {"type": "text", "text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original."},
+                        {
+                            "type": "text",
+                            "text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original.",
+                        },
                         {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
                         {"type": "text", "text": "Above is the ORIGINAL document."},
                         {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": rendered_image_base64}},
@@ -546,7 +538,7 @@ async def generate_html_from_image(client, image_base64):
                             f"The webpage will be viewed at {png_width}x{png_height} pixels.\n\n"
                             "Provide a REVISED version of the HTML that corrects any issues you identified. "
                             "Make sure all important elements are visible and the layout matches the original as closely as possible.\n"
-                            "Output the complete revised HTML in a ```html code block."
+                            "Output the complete revised HTML in a ```html code block.",
                         },
                     ],
                 }
@@ -559,7 +551,7 @@ async def generate_html_from_image(client, image_base64):
         refinement_response = await refinement_stream.get_final_message()
 
         # Check if refinement response was complete
-        if hasattr(refinement_response, 'stop_reason') and refinement_response.stop_reason != 'end_turn':
+        if hasattr(refinement_response, "stop_reason") and refinement_response.stop_reason != "end_turn":
             print(f"Warning: Refinement response incomplete (stop_reason: {refinement_response.stop_reason})")
             # Return initial HTML as fallback since it was complete
             return initial_html
@@ -1022,7 +1014,6 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, rand
     # So add in the bulk of the test cases back in now
     tests.extend(table_tests)
 
-
     # Step 3: Generate TextPresenceTests and OrderingTests from markdown content
     # Convert HTML to markdown to get cleaner text for presence and ordering tests
     markdown_content = html_to_markdown_with_frontmatter(html_content)
@@ -13,11 +13,12 @@ import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import Dict, Optional, Set, Tuple
-from olmocr.image_utils import convert_image_to_pdf_bytes
 
 import requests
 from tqdm import tqdm
 
+from olmocr.image_utils import convert_image_to_pdf_bytes
+
 
 def fix_image_url(url: str) -> str:
     """Fix image URL to use full resolution instead of percentage-based sizing."""
@@ -99,6 +99,7 @@ from tqdm import tqdm
 
 from olmocr.image_utils import convert_image_to_pdf_bytes
 
+
 def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool:
     """Download image from URL with exponential backoff retry logic."""
     for attempt in range(max_retries):
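The body of `download_image` is truncated after the `for` line. A typical shape for the exponential-backoff retry logic its docstring describes, as a sketch rather than the file's actual implementation:

```python
# Sketch of exponential-backoff retries (1s, 2s, 4s, ... between attempts).
# Not the file's actual body, which the hunk truncates.
import time
from pathlib import Path

import requests


def download_with_backoff(url: str, output_path: Path, max_retries: int = 5) -> bool:
    for attempt in range(max_retries):
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            output_path.write_bytes(resp.content)
            return True
        except requests.RequestException:
            if attempt == max_retries - 1:
                return False  # retries exhausted
            time.sleep(2**attempt)  # back off: 1, 2, 4, 8 seconds
    return False
```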
@@ -113,7 +113,7 @@ def parse_jsonl_entry(entry: Dict) -> Optional[Dict]:
             "source_file": source_file,
             "metadata": metadata,
             "pdf_page_numbers": pdf_page_numbers,
-            "page_response_data": page_response_data
+            "page_response_data": page_response_data,
         }
     except Exception as e:
         logger.error(f"Error parsing JSONL entry: {e}")
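The keys in this hunk describe the per-entry dict that `parse_jsonl_entry` returns. A sketch of walking such a JSONL file; the file name and the exact schema are assumptions for illustration, not taken from the repo:

```python
# Sketch: iterate a JSONL file and read fields named in the hunk above.
# "results.jsonl" and the field set are assumptions for illustration.
import json
from pathlib import Path


def iter_jsonl(path: Path):
    with path.open() as f:
        for line in f:
            if line.strip():  # skip blank lines
                yield json.loads(line)


for entry in iter_jsonl(Path("results.jsonl")):
    print(entry.get("source_file"), entry.get("pdf_page_numbers"))
```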
@@ -1121,7 +1121,6 @@ async def main():
     )
     server_group.add_argument("--api_key", type=str, default=None, help="API key for authenticated remote servers (e.g., DeepInfra)")
 
-
     vllm_group = parser.add_argument_group(
         "VLLM arguments", "These arguments are passed to vLLM. Any unrecognized arguments are also automatically forwarded to vLLM."
     )
@@ -1133,7 +1132,6 @@ async def main():
     vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM")
     vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server")
 
-
     # Beaker/job running stuff
     beaker_group = parser.add_argument_group("beaker/cluster execution")
     beaker_group.add_argument("--beaker", action="store_true", help="Submit this job to beaker instead of running locally")