mirror of https://github.com/allenai/olmocr.git
synced 2025-10-21 05:00:06 +00:00

Fixes

This commit is contained in:
parent 5695e46a21
commit 80f18cc2bc

README.md (28 changed lines)
@@ -209,6 +209,20 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs tests/gnarly_pdfs/*
 With the addition of the `--markdown` flag, results will be stored as markdown files inside of `./localworkspace/markdown/`.
 
+
+#### Viewing Results
+
+The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
+
+```bash
+cat localworkspace/markdown/olmocr-sample.md
+```
+
+```
+olmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models
+...
+```
 
 ### Using an Inference Provider or External Server
 
 If you have a vLLM server already running elsewhere (or any inference platform implementing the OpenAI API), you can point olmOCR to use it instead of spawning a local instance:
 
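For context on the "OpenAI API" phrasing in the section above: any server exposing that API, such as vLLM's `/v1` endpoint, can also be queried directly. A minimal sketch with the `openai` client follows; the host, port, and model name are placeholders for illustration, not values taken from this repo (olmOCR's pipeline does this wiring for you).

```python
# Sketch: query an OpenAI-compatible endpoint directly.
# Host, port, API key, and model name below are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://my-vllm-host:30024/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="allenai/olmOCR-7B-0825",  # whichever model the server is hosting
    messages=[{"role": "user", "content": "Hello"}],
)
print(response.choices[0].message.content)
```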
@@ -241,20 +255,6 @@ Notes on arguments
 - Other arguments work the same as with local inference
 
-
-#### Viewing Results
-
-The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
-
-```bash
-cat localworkspace/markdown/olmocr-sample.md
-```
-
-```
-olmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models
-...
-```
 
 ### Multi-node / Cluster Usage
 
 If you want to convert millions of PDFs, using multiple nodes running in parallel, then olmOCR supports
 
@@ -1,7 +1,5 @@
 import argparse
 import asyncio
-import base64
-import tempfile
 import glob
 import hashlib
 import json
@@ -10,6 +8,7 @@ import os
 import random
 import re
 import subprocess
+import tempfile
 import uuid
 from collections import defaultdict
 from typing import Dict, List
@@ -37,13 +36,7 @@ total_output_tokens = 0
 def get_git_commit_hash():
     """Get the current git commit hash, if available."""
     try:
-        result = subprocess.run(
-            ["git", "rev-parse", "HEAD"],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            check=True
-        )
+        result = subprocess.run(["git", "rev-parse", "HEAD"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
         return result.stdout.strip()
     except (subprocess.CalledProcessError, FileNotFoundError):
         # Git not available or not a git repository
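The hunk above truncates inside the `except` branch; presumably it falls through to returning `None`. A self-contained sketch of the collapsed helper, with that assumption made explicit:

```python
# Self-contained sketch of the helper above; the final `return None`
# is an assumption, since the hunk truncates inside the except branch.
import subprocess

def get_git_commit_hash():
    """Get the current git commit hash, if available."""
    try:
        result = subprocess.run(["git", "rev-parse", "HEAD"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
        return result.stdout.strip()
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Git not available or not a git repository
        return None

print(get_git_commit_hash() or "no git commit available")
```

`check=True` is what makes catching `CalledProcessError` meaningful here: without it, a nonzero exit would return an empty string instead of raising.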
@@ -427,7 +420,7 @@ async def generate_html_from_image(client, image_base64):
         )
 
         # Check if response was complete
-        if hasattr(analysis_response, 'stop_reason') and analysis_response.stop_reason != 'end_turn':
+        if hasattr(analysis_response, "stop_reason") and analysis_response.stop_reason != "end_turn":
             print(f"Warning: Analysis response incomplete (stop_reason: {analysis_response.stop_reason})")
             return None
 
@@ -472,7 +465,7 @@ async def generate_html_from_image(client, image_base64):
         )
 
         # Check if response was complete
-        if hasattr(initial_response, 'stop_reason') and initial_response.stop_reason != 'end_turn':
+        if hasattr(initial_response, "stop_reason") and initial_response.stop_reason != "end_turn":
             print(f"Warning: Initial HTML response incomplete (stop_reason: {initial_response.stop_reason})")
             return None
 
@@ -492,7 +485,6 @@ async def generate_html_from_image(client, image_base64):
             print("Warning: No HTML code block found in initial response")
             return None
 
-
         # Step 3: Render the initial HTML to PDF and then back to PNG for comparison
         # Create a temporary PDF file
         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
@@ -520,15 +512,15 @@ async def generate_html_from_image(client, image_base64):
             model="claude-sonnet-4-5-20250929",
             max_tokens=40000,
             temperature=1.0,
-            thinking={
-                "type": "enabled",
-                "budget_tokens": 12000
-            },
+            thinking={"type": "enabled", "budget_tokens": 12000},
             messages=[
                 {
                     "role": "user",
                     "content": [
-                        {"type": "text", "text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original."},
+                        {
+                            "type": "text",
+                            "text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original.",
+                        },
                         {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
                         {"type": "text", "text": "Above is the ORIGINAL document."},
                         {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": rendered_image_base64}},
@@ -546,7 +538,7 @@ async def generate_html_from_image(client, image_base64):
                             f"The webpage will be viewed at {png_width}x{png_height} pixels.\n\n"
                             "Provide a REVISED version of the HTML that corrects any issues you identified. "
                             "Make sure all important elements are visible and the layout matches the original as closely as possible.\n"
-                            "Output the complete revised HTML in a ```html code block."
+                            "Output the complete revised HTML in a ```html code block.",
                         },
                     ],
                 }
@@ -559,7 +551,7 @@ async def generate_html_from_image(client, image_base64):
         refinement_response = await refinement_stream.get_final_message()
 
         # Check if refinement response was complete
-        if hasattr(refinement_response, 'stop_reason') and refinement_response.stop_reason != 'end_turn':
+        if hasattr(refinement_response, "stop_reason") and refinement_response.stop_reason != "end_turn":
             print(f"Warning: Refinement response incomplete (stop_reason: {refinement_response.stop_reason})")
             # Return initial HTML as fallback since it was complete
             return initial_html
@@ -1022,7 +1014,6 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, rand
     # So add in the bulk of the test cases back in now
     tests.extend(table_tests)
 
-
     # Step 3: Generate TextPresenceTests and OrderingTests from markdown content
     # Convert HTML to markdown to get cleaner text for presence and ordering tests
     markdown_content = html_to_markdown_with_frontmatter(html_content)
@@ -13,11 +13,12 @@ import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import Dict, Optional, Set, Tuple
-from olmocr.image_utils import convert_image_to_pdf_bytes
 
 import requests
 from tqdm import tqdm
 
+from olmocr.image_utils import convert_image_to_pdf_bytes
+
 
 def fix_image_url(url: str) -> str:
     """Fix image URL to use full resolution instead of percentage-based sizing."""
@@ -99,6 +99,7 @@ from tqdm import tqdm
 
 from olmocr.image_utils import convert_image_to_pdf_bytes
 
+
 def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool:
     """Download image from URL with exponential backoff retry logic."""
     for attempt in range(max_retries):
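The body of `download_image` is truncated after the `for` line. A typical shape for the exponential-backoff retry logic its docstring describes, as a sketch rather than the file's actual implementation:

```python
# Sketch of exponential-backoff retries (1s, 2s, 4s, ... between attempts).
# Not the file's actual body, which the hunk truncates.
import time
from pathlib import Path

import requests


def download_with_backoff(url: str, output_path: Path, max_retries: int = 5) -> bool:
    for attempt in range(max_retries):
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            output_path.write_bytes(resp.content)
            return True
        except requests.RequestException:
            if attempt == max_retries - 1:
                return False  # retries exhausted
            time.sleep(2**attempt)  # back off: 1, 2, 4, 8 seconds
    return False
```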
@@ -113,7 +113,7 @@ def parse_jsonl_entry(entry: Dict) -> Optional[Dict]:
             "source_file": source_file,
             "metadata": metadata,
             "pdf_page_numbers": pdf_page_numbers,
-            "page_response_data": page_response_data
+            "page_response_data": page_response_data,
         }
     except Exception as e:
         logger.error(f"Error parsing JSONL entry: {e}")
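The keys in this hunk describe the per-entry dict that `parse_jsonl_entry` returns. A sketch of walking such a JSONL file; the file name and the exact schema are assumptions for illustration, not taken from the repo:

```python
# Sketch: iterate a JSONL file and read fields named in the hunk above.
# "results.jsonl" and the field set are assumptions for illustration.
import json
from pathlib import Path


def iter_jsonl(path: Path):
    with path.open() as f:
        for line in f:
            if line.strip():  # skip blank lines
                yield json.loads(line)


for entry in iter_jsonl(Path("results.jsonl")):
    print(entry.get("source_file"), entry.get("pdf_page_numbers"))
```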
@@ -1121,7 +1121,6 @@ async def main():
     )
     server_group.add_argument("--api_key", type=str, default=None, help="API key for authenticated remote servers (e.g., DeepInfra)")
 
-
     vllm_group = parser.add_argument_group(
         "VLLM arguments", "These arguments are passed to vLLM. Any unrecognized arguments are also automatically forwarded to vLLM."
     )
@@ -1133,7 +1132,6 @@ async def main():
     vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM")
     vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server")
 
-
     # Beaker/job running stuff
     beaker_group = parser.add_argument_group("beaker/cluster execution")
     beaker_group.add_argument("--beaker", action="store_true", help="Submit this job to beaker instead of running locally")