Jake Poznanski 2025-10-15 21:14:53 +00:00
parent 5695e46a21
commit 80f18cc2bc
6 changed files with 34 additions and 43 deletions

View File

@@ -209,6 +209,20 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs tests/gnarly_pdfs/*
With the addition of the `--markdown` flag, results will be stored as markdown files inside `./localworkspace/markdown/`.
#### Viewing Results
The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
```bash
cat localworkspace/markdown/olmocr-sample.md
```
```
olmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models
...
```
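The Dolma half of the output is JSONL, one record per document. A quick way to peek at a record (a sketch: the `results/output_*.jsonl` naming assumes the default workspace layout):
```bash
cat localworkspace/results/output_*.jsonl | head -n 1 | python -m json.tool
```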
### Using an Inference Provider or External Server
If you have a vLLM server already running elsewhere (or any inference platform implementing the OpenAI API), you can point olmOCR to use it instead of spawning a local instance:
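For example (a sketch, not verbatim from the docs: the endpoint URL is a placeholder, and while `--api_key` appears in this commit's argument parser below, the exact `--server` flag name is an assumption):
```bash
python -m olmocr.pipeline ./localworkspace --markdown \
  --server http://your-inference-host:30024/v1 \
  --api_key $YOUR_API_KEY \
  --pdfs tests/gnarly_pdfs/*.pdf
```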
@@ -241,20 +255,6 @@ Notes on arguments
- Other arguments work the same as with local inference
#### Viewing Results
The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
```bash
cat localworkspace/markdown/olmocr-sample.md
```
```
olmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models
...
```
### Multi-node / Cluster Usage
If you want to convert millions of PDFs across multiple nodes running in parallel, olmOCR supports

View File

@@ -1,7 +1,5 @@
import argparse
import asyncio
import base64
import tempfile
import glob
import hashlib
import json
@@ -10,6 +8,7 @@ import os
import random
import re
import subprocess
import tempfile
import uuid
from collections import defaultdict
from typing import Dict, List
@@ -37,13 +36,7 @@ total_output_tokens = 0
def get_git_commit_hash():
"""Get the current git commit hash, if available."""
try:
result = subprocess.run(
["git", "rev-parse", "HEAD"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True
)
result = subprocess.run(["git", "rev-parse", "HEAD"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
return result.stdout.strip()
except (subprocess.CalledProcessError, FileNotFoundError):
# Git not available or not a git repository
@@ -427,7 +420,7 @@ async def generate_html_from_image(client, image_base64):
)
# Check if response was complete
if hasattr(analysis_response, 'stop_reason') and analysis_response.stop_reason != 'end_turn':
if hasattr(analysis_response, "stop_reason") and analysis_response.stop_reason != "end_turn":
print(f"Warning: Analysis response incomplete (stop_reason: {analysis_response.stop_reason})")
return None
@@ -472,7 +465,7 @@ async def generate_html_from_image(client, image_base64):
)
# Check if response was complete
if hasattr(initial_response, 'stop_reason') and initial_response.stop_reason != 'end_turn':
if hasattr(initial_response, "stop_reason") and initial_response.stop_reason != "end_turn":
print(f"Warning: Initial HTML response incomplete (stop_reason: {initial_response.stop_reason})")
return None
@@ -492,7 +485,6 @@ async def generate_html_from_image(client, image_base64):
print("Warning: No HTML code block found in initial response")
return None
# Step 3: Render the initial HTML to PDF and then back to PNG for comparison
# Create a temporary PDF file
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
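The rendering calls themselves fall outside this hunk. A minimal sketch of the HTML → PDF → PNG roundtrip the comment describes, assuming `weasyprint` and `pdf2image` as stand-in renderers (the script's actual rendering dependencies are not shown here):
```python
import base64
import io
import tempfile

from pdf2image import convert_from_path  # requires poppler to be installed
from weasyprint import HTML


def render_html_to_png_base64(html: str, dpi: int = 150) -> str:
    """Render HTML to a one-page PDF, rasterize page 1, return base64 PNG."""
    with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp_pdf:
        HTML(string=html).write_pdf(tmp_pdf.name)
        page = convert_from_path(tmp_pdf.name, dpi=dpi)[0]
        buf = io.BytesIO()
        page.save(buf, format="PNG")
        return base64.b64encode(buf.getvalue()).decode("ascii")
```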
@@ -520,15 +512,15 @@ async def generate_html_from_image(client, image_base64):
model="claude-sonnet-4-5-20250929",
max_tokens=40000,
temperature=1.0,
thinking={
"type": "enabled",
"budget_tokens": 12000
},
thinking={"type": "enabled", "budget_tokens": 12000},
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original."},
{
"type": "text",
"text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original.",
},
{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
{"type": "text", "text": "Above is the ORIGINAL document."},
{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": rendered_image_base64}},
@@ -546,7 +538,7 @@ async def generate_html_from_image(client, image_base64):
f"The webpage will be viewed at {png_width}x{png_height} pixels.\n\n"
"Provide a REVISED version of the HTML that corrects any issues you identified. "
"Make sure all important elements are visible and the layout matches the original as closely as possible.\n"
"Output the complete revised HTML in a ```html code block."
"Output the complete revised HTML in a ```html code block.",
},
],
}
@@ -559,7 +551,7 @@ async def generate_html_from_image(client, image_base64):
refinement_response = await refinement_stream.get_final_message()
# Check if refinement response was complete
if hasattr(refinement_response, 'stop_reason') and refinement_response.stop_reason != 'end_turn':
if hasattr(refinement_response, "stop_reason") and refinement_response.stop_reason != "end_turn":
print(f"Warning: Refinement response incomplete (stop_reason: {refinement_response.stop_reason})")
# Return initial HTML as fallback since it was complete
return initial_html
@@ -1022,7 +1014,6 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, rand
# So add in the bulk of the test cases back in now
tests.extend(table_tests)
# Step 3: Generate TextPresenceTests and OrderingTests from markdown content
# Convert HTML to markdown to get cleaner text for presence and ordering tests
markdown_content = html_to_markdown_with_frontmatter(html_content)
@@ -1290,7 +1281,7 @@ async def process_pdf(pdf_info, args, client, pdf_filter=None):
if not html_content:
print(f"Failed to generate HTML for {pdf_path}, page {page_num}")
return None
# Add git commit meta tag if available
git_commit = get_git_commit_hash()
if git_commit:

View File

@@ -13,11 +13,12 @@ import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, Optional, Set, Tuple
from olmocr.image_utils import convert_image_to_pdf_bytes
import requests
from tqdm import tqdm
from olmocr.image_utils import convert_image_to_pdf_bytes
def fix_image_url(url: str) -> str:
"""Fix image URL to use full resolution instead of percentage-based sizing."""

View File

@@ -99,6 +99,7 @@ from tqdm import tqdm
from olmocr.image_utils import convert_image_to_pdf_bytes
def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool:
"""Download image from URL with exponential backoff retry logic."""
for attempt in range(max_retries):
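Only the loop header survives the hunk boundary. A self-contained sketch of the exponential-backoff pattern the docstring names (the delay constants and jitter are assumptions):
```python
import random
import time
from pathlib import Path

import requests


def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool:
    """Download a file, retrying on failure with exponential backoff."""
    for attempt in range(max_retries):
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            output_path.write_bytes(resp.content)
            return True
        except requests.RequestException:
            if attempt == max_retries - 1:
                return False
            # Wait 1s, 2s, 4s, ... plus jitter so parallel workers desynchronize.
            time.sleep(2**attempt + random.random())
    return False
```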

View File

@@ -113,7 +113,7 @@ def parse_jsonl_entry(entry: Dict) -> Optional[Dict]:
"source_file": source_file,
"metadata": metadata,
"pdf_page_numbers": pdf_page_numbers,
"page_response_data": page_response_data
"page_response_data": page_response_data,
}
except Exception as e:
logger.error(f"Error parsing JSONL entry: {e}")

View File

@@ -214,17 +214,17 @@ async def apost(url, json_data, api_key=None):
# Read chunk size line
size_line = await reader.readline()
chunk_size = int(size_line.strip(), 16) # Hex format
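# A zero-size chunk terminates a chunked HTTP body (RFC 9112 §7.1)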
if chunk_size == 0:
await reader.readline() # Read final CRLF
break
chunk_data = await reader.readexactly(chunk_size)
chunks.append(chunk_data)
# Read trailing CRLF after chunk data
await reader.readline()
response_body = b"".join(chunks)
elif headers.get("connection", "") == "close":
# Read until connection closes
@@ -1121,7 +1121,6 @@ async def main():
)
server_group.add_argument("--api_key", type=str, default=None, help="API key for authenticated remote servers (e.g., DeepInfra)")
vllm_group = parser.add_argument_group(
"VLLM arguments", "These arguments are passed to vLLM. Any unrecognized arguments are also automatically forwarded to vLLM."
)
@@ -1133,7 +1132,6 @@ async def main():
vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM")
vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server")
# Beaker/job running stuff
beaker_group = parser.add_argument_group("beaker/cluster execution")
beaker_group.add_argument("--beaker", action="store_true", help="Submit this job to beaker instead of running locally")