diff --git a/README.md b/README.md
index 0be6b8f..03d0273 100644
--- a/README.md
+++ b/README.md
@@ -209,6 +209,20 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs tests/gnarly_pdfs/*
 
 With the addition of the `--markdown` flag, results will be stored as markdown files inside of `./localworkspace/markdown/`.
 
+#### Viewing Results
+
+The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
+
+
+```bash
+cat localworkspace/markdown/olmocr-sample.md
+```
+
+```
+olmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models
+...
+```
+
 ### Using an Inference Provider or External Server
 
 If you have a vLLM server already running elsewhere (or any inference platform implementing the OpenAI API), you can point olmOCR to use it instead of spawning a local instance:
@@ -241,20 +255,6 @@ Notes on arguments
 
 - Other arguments work the same as with local inference
 
-#### Viewing Results
-
-The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
-
-
-```bash
-cat localworkspace/markdown/olmocr-sample.md
-```
-
-```
-olmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models
-...
-```
-
 ### Multi-node / Cluster Usage
 
 If you want to convert millions of PDFs, using multiple nodes running in parallel, then olmOCR supports
diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py
index 399810f..eade358 100644
--- a/olmocr/bench/synth/mine_html_templates.py
+++ b/olmocr/bench/synth/mine_html_templates.py
@@ -1,7 +1,5 @@
 import argparse
 import asyncio
-import base64
-import tempfile
 import glob
 import hashlib
 import json
@@ -10,6 +8,7 @@ import os
 import random
 import re
 import subprocess
+import tempfile
 import uuid
 from collections import defaultdict
 from typing import Dict, List
@@ -37,13 +36,7 @@ total_output_tokens = 0
 def get_git_commit_hash():
     """Get the current git commit hash, if available."""
     try:
-        result = subprocess.run(
-            ["git", "rev-parse", "HEAD"],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            check=True
-        )
+        result = subprocess.run(["git", "rev-parse", "HEAD"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
         return result.stdout.strip()
     except (subprocess.CalledProcessError, FileNotFoundError):
         # Git not available or not a git repository
@@ -427,7 +420,7 @@ async def generate_html_from_image(client, image_base64):
         )
 
         # Check if response was complete
-        if hasattr(analysis_response, 'stop_reason') and analysis_response.stop_reason != 'end_turn':
+        if hasattr(analysis_response, "stop_reason") and analysis_response.stop_reason != "end_turn":
             print(f"Warning: Analysis response incomplete (stop_reason: {analysis_response.stop_reason})")
             return None
 
@@ -472,7 +465,7 @@ async def generate_html_from_image(client, image_base64):
         )
 
         # Check if response was complete
-        if hasattr(initial_response, 'stop_reason') and initial_response.stop_reason != 'end_turn':
+        if hasattr(initial_response, "stop_reason") and initial_response.stop_reason != "end_turn":
             print(f"Warning: Initial HTML response incomplete (stop_reason: {initial_response.stop_reason})")
             return None
 
@@ -492,7 +485,6 @@ async def generate_html_from_image(client, image_base64):
             print("Warning: No HTML code block found in initial response")
             return None
 
-
         # Step 3: Render the initial HTML to PDF and then back to PNG for comparison
         # Create a temporary PDF file
         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
@@ -520,15 +512,15 @@ async def generate_html_from_image(client, image_base64):
             model="claude-sonnet-4-5-20250929",
             max_tokens=40000,
             temperature=1.0,
-            thinking={
-                "type": "enabled",
-                "budget_tokens": 12000
-            },
+            thinking={"type": "enabled", "budget_tokens": 12000},
             messages=[
                 {
                     "role": "user",
                     "content": [
-                        {"type": "text", "text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original."},
+                        {
+                            "type": "text",
+                            "text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original.",
+                        },
                         {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
                         {"type": "text", "text": "Above is the ORIGINAL document."},
                         {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": rendered_image_base64}},
@@ -546,7 +538,7 @@ async def generate_html_from_image(client, image_base64):
                             f"The webpage will be viewed at {png_width}x{png_height} pixels.\n\n"
                             "Provide a REVISED version of the HTML that corrects any issues you identified. "
                             "Make sure all important elements are visible and the layout matches the original as closely as possible.\n"
-                            "Output the complete revised HTML in a ```html code block."
+                            "Output the complete revised HTML in a ```html code block.",
                         },
                     ],
                 }
@@ -559,7 +551,7 @@ async def generate_html_from_image(client, image_base64):
             refinement_response = await refinement_stream.get_final_message()
 
         # Check if refinement response was complete
-        if hasattr(refinement_response, 'stop_reason') and refinement_response.stop_reason != 'end_turn':
+        if hasattr(refinement_response, "stop_reason") and refinement_response.stop_reason != "end_turn":
             print(f"Warning: Refinement response incomplete (stop_reason: {refinement_response.stop_reason})")
             # Return initial HTML as fallback since it was complete
             return initial_html
@@ -1022,7 +1014,6 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, rand
     # So add in the bulk of the test cases back in now
     tests.extend(table_tests)
 
-
     # Step 3: Generate TextPresenceTests and OrderingTests from markdown content
     # Convert HTML to markdown to get cleaner text for presence and ordering tests
     markdown_content = html_to_markdown_with_frontmatter(html_content)
@@ -1290,7 +1281,7 @@ async def process_pdf(pdf_info, args, client, pdf_filter=None):
     if not html_content:
         print(f"Failed to generate HTML for {pdf_path}, page {page_num}")
         return None
-    
+
     # Add git commit meta tag if available
     git_commit = get_git_commit_hash()
     if git_commit:
diff --git a/olmocr/data/prepare_loc_transcripts.py b/olmocr/data/prepare_loc_transcripts.py
index 366c2b5..afdb5d4 100644
--- a/olmocr/data/prepare_loc_transcripts.py
+++ b/olmocr/data/prepare_loc_transcripts.py
@@ -13,11 +13,12 @@ import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import Dict, Optional, Set, Tuple
-from olmocr.image_utils import convert_image_to_pdf_bytes
 
 import requests
 from tqdm import tqdm
 
+from olmocr.image_utils import convert_image_to_pdf_bytes
+
 
 def fix_image_url(url: str) -> str:
     """Fix image URL to use full resolution instead of percentage-based sizing."""
diff --git a/olmocr/data/prepare_national_archive_transcripts.py b/olmocr/data/prepare_national_archive_transcripts.py
index 40986df..23d38e0 100644
--- a/olmocr/data/prepare_national_archive_transcripts.py
+++ b/olmocr/data/prepare_national_archive_transcripts.py
@@ -99,6 +99,7 @@ from tqdm import tqdm
 
 from olmocr.image_utils import convert_image_to_pdf_bytes
 
+
 def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool:
     """Download image from URL with exponential backoff retry logic."""
     for attempt in range(max_retries):
diff --git a/olmocr/data/prepare_workspace.py b/olmocr/data/prepare_workspace.py
index fda5b7e..ea00860 100755
--- a/olmocr/data/prepare_workspace.py
+++ b/olmocr/data/prepare_workspace.py
@@ -113,7 +113,7 @@ def parse_jsonl_entry(entry: Dict) -> Optional[Dict]:
             "source_file": source_file,
             "metadata": metadata,
             "pdf_page_numbers": pdf_page_numbers,
-            "page_response_data": page_response_data
+            "page_response_data": page_response_data,
         }
     except Exception as e:
         logger.error(f"Error parsing JSONL entry: {e}")
diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py
index 2845d8a..5bb78ca 100644
--- a/olmocr/pipeline.py
+++ b/olmocr/pipeline.py
@@ -214,17 +214,17 @@ async def apost(url, json_data, api_key=None):
                 # Read chunk size line
                 size_line = await reader.readline()
                 chunk_size = int(size_line.strip(), 16)  # Hex format
-                
+
                 if chunk_size == 0:
                     await reader.readline()  # Read final CRLF
                     break
-                
+
                 chunk_data = await reader.readexactly(chunk_size)
                 chunks.append(chunk_data)
-                
+
                 # Read trailing CRLF after chunk data
                 await reader.readline()
-                
+
             response_body = b"".join(chunks)
         elif headers.get("connection", "") == "close":
             # Read until connection closes
@@ -1121,7 +1121,6 @@ async def main():
     )
     server_group.add_argument("--api_key", type=str, default=None, help="API key for authenticated remote servers (e.g., DeepInfra)")
 
-
     vllm_group = parser.add_argument_group(
         "VLLM arguments", "These arguments are passed to vLLM. Any unrecognized arguments are also automatically forwarded to vLLM."
     )
@@ -1133,7 +1132,6 @@ async def main():
     vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM")
     vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server")
 
-
    # Beaker/job running stuff
     beaker_group = parser.add_argument_group("beaker/cluster execution")
     beaker_group.add_argument("--beaker", action="store_true", help="Submit this job to beaker instead of running locally")
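
On the README change: alongside the markdown files shown in the relocated "Viewing Results" section, the workspace also holds the Dolma-format JSONL output. A sketch for inspecting it from Python, assuming a `results/*.jsonl` layout and the standard Dolma `text` field (both are assumptions here, not shown in the diff):

```python
import glob
import json

# Assumed layout: Dolma documents stored as JSONL under the workspace's results/ folder.
for path in glob.glob("localworkspace/results/*.jsonl"):
    with open(path) as f:
        for line in f:
            doc = json.loads(line)
            print(doc["text"][:200])  # Dolma documents keep the extracted text in "text"
```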
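The quote-style changes in `mine_html_templates.py` all touch the same truncation guard, repeated at three call sites: check the Anthropic Messages API `stop_reason` and bail out (or fall back to the initial HTML) when a response did not finish naturally. A minimal sketch of that check as a reusable helper; the `response_is_complete` name is hypothetical, the real code inlines the condition:

```python
def response_is_complete(response) -> bool:
    """True if a Messages API response ended naturally rather than being cut off.

    A stop_reason other than "end_turn" (e.g. "max_tokens") suggests the
    generated HTML may be truncated and should not be trusted.
    """
    return not hasattr(response, "stop_reason") or response.stop_reason == "end_turn"
```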
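The whitespace-only changes in `apost` touch the pipeline's hand-rolled HTTP client. For readers following that hunk, the loop implements standard HTTP/1.1 chunked transfer-encoding parsing: a hex size line, the chunk bytes, then a trailing CRLF, terminated by a zero-size chunk. A minimal standalone sketch of the same logic; the `read_chunked_body` wrapper is hypothetical, the real code inlines this inside `apost`:

```python
import asyncio


async def read_chunked_body(reader: asyncio.StreamReader) -> bytes:
    """Collect an HTTP/1.1 chunked transfer-encoded body from a stream."""
    chunks = []
    while True:
        # Each chunk is preceded by its size as a hex string, terminated by CRLF
        size_line = await reader.readline()
        chunk_size = int(size_line.strip(), 16)
        if chunk_size == 0:
            await reader.readline()  # consume the final CRLF after the zero chunk
            break
        chunk_data = await reader.readexactly(chunk_size)
        chunks.append(chunk_data)
        await reader.readline()  # consume the trailing CRLF after the chunk data
    return b"".join(chunks)
```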