From cb4f23dc0ca926c0afe88085074bf566b5348fbe Mon Sep 17 00:00:00 2001 From: Tong Liang Date: Sat, 16 Aug 2025 21:48:07 -0400 Subject: [PATCH 01/40] Fix typo in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b4c1327..fdda60b 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Features: - (Based on a 7B parameter VLM, so it requires a GPU) ### News - - August 13, 2024 - v0.3.0 - [New model release](https://huggingface.co/allenai/olmOCR-7B-0825-FP8), fixes auto-rotation detection, and hallucinations on blank documents. + - August 13, 2025 - v0.3.0 - [New model release](https://huggingface.co/allenai/olmOCR-7B-0825-FP8), fixes auto-rotation detection, and hallucinations on blank documents. - July 24, 2025 - v0.2.1 - [New model release](https://huggingface.co/allenai/olmOCR-7B-0725-FP8), scores 3 points higher on [olmOCR-Bench](https://github.com/allenai/olmocr/tree/main/olmocr/bench), also runs significantly faster because it's default FP8, and needs much fewer retries per document. - July 23, 2025 - v0.2.0 - New cleaned up [trainer code](https://github.com/allenai/olmocr/tree/main/olmocr/train), makes it much simpler to train olmOCR models yourself. - June 17, 2025 - v0.1.75 - Switch from sglang to vllm based inference pipeline, updated docker image to CUDA 12.8. From b8a2b92174350dfbfdaf00015446567764988a59 Mon Sep 17 00:00:00 2001 From: Haydn Jones Date: Wed, 20 Aug 2025 19:21:38 -0400 Subject: [PATCH 02/40] External vLLM --- olmocr/pipeline.py | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 640a0aa..73d6603 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -213,7 +213,10 @@ async def apost(url, json_data): async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: str, page_num: int) -> PageResult: - COMPLETION_URL = f"http://localhost:{BASE_SERVER_PORT}/v1/chat/completions" + if args.external_vllm_url: + COMPLETION_URL = f"{args.external_vllm_url.rstrip('/')}/v1/chat/completions" + else: + COMPLETION_URL = f"http://localhost:{BASE_SERVER_PORT}/v1/chat/completions" MAX_RETRIES = args.max_page_retries MODEL_MAX_CONTEXT = 16384 TEMPERATURE_BY_ATTEMPT = [0.1, 0.1, 0.2, 0.3, 0.5, 0.8, 0.9, 1.0] @@ -607,6 +610,7 @@ async def vllm_server_task(model_name_or_path, args, semaphore, unknown_args=Non if unknown_args: cmd.extend(unknown_args) + breakpoint() proc = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, @@ -730,10 +734,13 @@ async def vllm_server_host(model_name_or_path, args, semaphore, unknown_args=Non sys.exit(1) -async def vllm_server_ready(): +async def vllm_server_ready(args): max_attempts = 300 delay_sec = 1 - url = f"http://localhost:{BASE_SERVER_PORT}/v1/models" + if args.external_vllm_url: + url = f"{args.external_vllm_url.rstrip('/')}/v1/models" + else: + url = f"http://localhost:{BASE_SERVER_PORT}/v1/models" for attempt in range(1, max_attempts + 1): try: @@ -1069,6 +1076,9 @@ async def main(): vllm_group.add_argument("--tensor-parallel-size", "-tp", type=int, default=1, help="Tensor parallel size for vLLM") vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM") vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server") + vllm_group.add_argument( + "--external-vllm-url", type=str, help="URL of external vLLM server (e.g., http://hostname:port). 
If provided, skips spawning local vLLM instance" + ) # Beaker/job running stuff beaker_group = parser.add_argument_group("beaker/cluster execution") @@ -1207,12 +1217,17 @@ async def main(): # If you get this far, then you are doing inference and need a GPU # check_sglang_version() - check_torch_gpu_available() + if not args.external_vllm_url: + check_torch_gpu_available() logger.info(f"Starting pipeline with PID {os.getpid()}") # Download the model before you do anything else - model_name_or_path = await download_model(args.model) + if args.external_vllm_url: + logger.info(f"Using external vLLM server at {args.external_vllm_url}") + model_name_or_path = None + else: + model_name_or_path = await download_model(args.model) # Initialize the work queue qsize = await work_queue.initialize_queue() @@ -1226,9 +1241,12 @@ async def main(): # As soon as one worker is no longer saturating the gpu, the next one can start sending requests semaphore = asyncio.Semaphore(1) - vllm_server = asyncio.create_task(vllm_server_host(model_name_or_path, args, semaphore, unknown_args)) + # Start local vLLM instance if not using external one + vllm_server = None + if not args.external_vllm_url: + vllm_server = asyncio.create_task(vllm_server_host(model_name_or_path, args, semaphore, unknown_args)) - await vllm_server_ready() + await vllm_server_ready(args) metrics_task = asyncio.create_task(metrics_reporter(work_queue)) @@ -1241,11 +1259,16 @@ async def main(): # Wait for all worker tasks to finish await asyncio.gather(*worker_tasks) - vllm_server.cancel() + # Cancel vLLM server if it was started + if vllm_server is not None: + vllm_server.cancel() metrics_task.cancel() # Wait for cancelled tasks to complete - await asyncio.gather(vllm_server, metrics_task, return_exceptions=True) + tasks_to_wait = [metrics_task] + if vllm_server is not None: + tasks_to_wait.append(vllm_server) + await asyncio.gather(*tasks_to_wait, return_exceptions=True) # Output final metrics summary metrics_summary = metrics.get_metrics_summary() From b34c3611e1101830d7245c40174f3d358462dc9b Mon Sep 17 00:00:00 2001 From: Haydn Jones Date: Wed, 20 Aug 2025 19:22:48 -0400 Subject: [PATCH 03/40] oopsy woopsy --- olmocr/pipeline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 73d6603..143664d 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -610,7 +610,6 @@ async def vllm_server_task(model_name_or_path, args, semaphore, unknown_args=Non if unknown_args: cmd.extend(unknown_args) - breakpoint() proc = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, From 261c722f561e06f100174ebd17e5b1b0aea5c1e2 Mon Sep 17 00:00:00 2001 From: Haydn Jones Date: Thu, 21 Aug 2025 17:49:07 -0400 Subject: [PATCH 04/40] Update README + arg name --- README.md | 21 +++++++++++++++++++-- olmocr/pipeline.py | 18 +++++++++--------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index fdda60b..024b641 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,20 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs tests/gnarly_pdfs/* With the addition of the `--markdown` flag, results will be stored as markdown files inside of `./localworkspace/markdown/`. 
+### Using External vLLM Server + +If you have a vLLM server already running elsewhere (or any inference platform implementing the relevant subset of the OpenAI API), you can point olmOCR to use it instead of spawning a local instance: + +```bash +# Use external vLLM server instead of local one +python -m olmocr.pipeline ./localworkspace --server http://remote-server:8000 --markdown --pdfs tests/gnarly_pdfs/*.pdf +``` + +The served model name should be `olmocr`. An example vLLM launch command would be: +```bash +vllm serve allenai/olmOCR-7B-0825-FP8 --served-model-name olmocr --max-model-len 16384 +``` + #### Viewing Results The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`). @@ -271,7 +285,7 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs olmocr-sample.pdf python -m olmocr.pipeline --help usage: pipeline.py [-h] [--pdfs [PDFS ...]] [--model MODEL] [--workspace_profile WORKSPACE_PROFILE] [--pdf_profile PDF_PROFILE] [--pages_per_group PAGES_PER_GROUP] [--max_page_retries MAX_PAGE_RETRIES] [--max_page_error_rate MAX_PAGE_ERROR_RATE] [--workers WORKERS] [--apply_filter] [--stats] [--markdown] [--target_longest_image_dim TARGET_LONGEST_IMAGE_DIM] [--target_anchor_text_len TARGET_ANCHOR_TEXT_LEN] [--guided_decoding] [--gpu-memory-utilization GPU_MEMORY_UTILIZATION] [--max_model_len MAX_MODEL_LEN] - [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--data-parallel-size DATA_PARALLEL_SIZE] [--port PORT] [--beaker] [--beaker_workspace BEAKER_WORKSPACE] [--beaker_cluster BEAKER_CLUSTER] [--beaker_gpus BEAKER_GPUS] [--beaker_priority BEAKER_PRIORITY] + [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--data-parallel-size DATA_PARALLEL_SIZE] [--port PORT] [--server SERVER] [--beaker] [--beaker_workspace BEAKER_WORKSPACE] [--beaker_cluster BEAKER_CLUSTER] [--beaker_gpus BEAKER_GPUS] [--beaker_priority BEAKER_PRIORITY] workspace Manager for running millions of PDFs through a batch inference pipeline @@ -303,7 +317,7 @@ options: Maximum amount of anchor text to use (characters), not used for new models --guided_decoding Enable guided decoding for model YAML type outputs -VLLM Forwarded arguments: +VLLM arguments: --gpu-memory-utilization GPU_MEMORY_UTILIZATION Fraction of VRAM vLLM may pre-allocate for KV-cache (passed through to vllm serve). --max_model_len MAX_MODEL_LEN @@ -313,6 +327,9 @@ VLLM Forwarded arguments: --data-parallel-size DATA_PARALLEL_SIZE, -dp DATA_PARALLEL_SIZE Data parallel size for vLLM --port PORT Port to use for the VLLM server + --server SERVER URL of external vLLM (or other compatible provider) + server (e.g., http://hostname:port). 
If provided, + skips spawning local vLLM instance beaker/cluster execution: --beaker Submit this job to beaker instead of running locally diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 143664d..78ca51b 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -213,8 +213,8 @@ async def apost(url, json_data): async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: str, page_num: int) -> PageResult: - if args.external_vllm_url: - COMPLETION_URL = f"{args.external_vllm_url.rstrip('/')}/v1/chat/completions" + if args.server: + COMPLETION_URL = f"{args.server.rstrip('/')}/v1/chat/completions" else: COMPLETION_URL = f"http://localhost:{BASE_SERVER_PORT}/v1/chat/completions" MAX_RETRIES = args.max_page_retries @@ -736,8 +736,8 @@ async def vllm_server_host(model_name_or_path, args, semaphore, unknown_args=Non async def vllm_server_ready(args): max_attempts = 300 delay_sec = 1 - if args.external_vllm_url: - url = f"{args.external_vllm_url.rstrip('/')}/v1/models" + if args.server: + url = f"{args.server.rstrip('/')}/v1/models" else: url = f"http://localhost:{BASE_SERVER_PORT}/v1/models" @@ -1076,7 +1076,7 @@ async def main(): vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM") vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server") vllm_group.add_argument( - "--external-vllm-url", type=str, help="URL of external vLLM server (e.g., http://hostname:port). If provided, skips spawning local vLLM instance" + "--server", type=str, help="URL of external vLLM (or other compatible provider) server (e.g., http://hostname:port). If provided, skips spawning local vLLM instance" ) # Beaker/job running stuff @@ -1216,14 +1216,14 @@ async def main(): # If you get this far, then you are doing inference and need a GPU # check_sglang_version() - if not args.external_vllm_url: + if not args.server: check_torch_gpu_available() logger.info(f"Starting pipeline with PID {os.getpid()}") # Download the model before you do anything else - if args.external_vllm_url: - logger.info(f"Using external vLLM server at {args.external_vllm_url}") + if args.server: + logger.info(f"Using external server at {args.server}") model_name_or_path = None else: model_name_or_path = await download_model(args.model) @@ -1242,7 +1242,7 @@ async def main(): # Start local vLLM instance if not using external one vllm_server = None - if not args.external_vllm_url: + if not args.server: vllm_server = asyncio.create_task(vllm_server_host(model_name_or_path, args, semaphore, unknown_args)) await vllm_server_ready(args) From 2c638366489ad989919da84381d9b39c8458f824 Mon Sep 17 00:00:00 2001 From: Haydn Jones Date: Sat, 23 Aug 2025 20:07:05 -0400 Subject: [PATCH 05/40] Black and mock --- olmocr/pipeline.py | 4 +++- tests/test_pipeline.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 78ca51b..04a2170 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -1076,7 +1076,9 @@ async def main(): vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM") vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server") vllm_group.add_argument( - "--server", type=str, help="URL of external vLLM (or other compatible provider) server (e.g., http://hostname:port). 
If provided, skips spawning local vLLM instance" + "--server", + type=str, + help="URL of external vLLM (or other compatible provider) server (e.g., http://hostname:port). If provided, skips spawning local vLLM instance", ) # Beaker/job running stuff diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index e0d69d9..600753d 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -192,6 +192,7 @@ class MockArgs: max_page_retries: int = 8 target_longest_image_dim: int = 1288 guided_decoding: bool = False + server: str | None = None class TestRotationCorrection: From c7aa217281eccb7bc5a921757b2e2b1c9df761e0 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 25 Aug 2025 20:12:10 +0000 Subject: [PATCH 06/40] Scripts to run benchmarks better --- .gitignore | 3 ++ scripts/run_benchmark.sh | 68 +++++++++++++++++++++++++++++++++------- 2 files changed, 60 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index da29165..8f9d32b 100644 --- a/.gitignore +++ b/.gitignore @@ -22,8 +22,11 @@ table_data*/ /synth*/ dolma_samples/* old_train/ +filtered_items/ +filtered_items_prefilter/ augraphy_cache/ /*.html +html_templates*/ scoreelo.csv debug.log birrpipeline-debug.log diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index 0c9863a..8725cb7 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -10,15 +10,25 @@ set -e # Parse command line arguments MODEL="" +B200_MODE="" +BENCH_BRANCH="" while [[ $# -gt 0 ]]; do case $1 in --model) MODEL="$2" shift 2 ;; + --b200) + B200_MODE="true" + shift + ;; + --benchbranch) + BENCH_BRANCH="$2" + shift 2 + ;; *) echo "Unknown option: $1" - echo "Usage: $0 [--model MODEL_NAME]" + echo "Usage: $0 [--model MODEL_NAME] [--b200] [--benchbranch BRANCH_NAME]" exit 1 ;; esac @@ -78,12 +88,27 @@ cat << 'EOF' > /tmp/run_benchmark_experiment.py import sys from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar -# Get image tag, beaker user, git branch, git hash, and optional model from command line +# Get image tag, beaker user, git branch, git hash, optional model, b200 mode, and bench branch from command line image_tag = sys.argv[1] beaker_user = sys.argv[2] git_branch = sys.argv[3] git_hash = sys.argv[4] -model = sys.argv[5] if len(sys.argv) > 5 else None +model = None +b200_mode = False +bench_branch = None + +# Parse remaining arguments +arg_idx = 5 +while arg_idx < len(sys.argv): + if sys.argv[arg_idx] == "--b200": + b200_mode = True + arg_idx += 1 + elif sys.argv[arg_idx] == "--benchbranch": + bench_branch = sys.argv[arg_idx + 1] + arg_idx += 2 + else: + model = sys.argv[arg_idx] + arg_idx += 1 # Initialize Beaker client b = Beaker.from_env(default_workspace="ai2/olmocr") @@ -111,11 +136,18 @@ if has_aws_creds: "mkdir -p ~/.aws", 'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials' ]) + +# Build git clone command with optional branch +git_clone_cmd = "git clone https://huggingface.co/datasets/allenai/olmOCR-bench" +if bench_branch: + git_clone_cmd += f" -b {bench_branch}" + commands.extend([ - "git clone https://huggingface.co/datasets/allenai/olmOCR-bench", + git_clone_cmd, "cd olmOCR-bench && git lfs pull && cd ..", pipeline_cmd, "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/", + "aws s3 cp --recursive localworkspace/ s3://ai2-oe-data/jakep/olmocr-bench-runs/$BEAKER_WORKLOAD_ID/", "python -m olmocr.bench.benchmark --dir 
./olmOCR-bench/bench_data" ]) @@ -132,7 +164,7 @@ task_spec_args = { preemptible=True, ), "resources": TaskResources(gpu_count=1), - "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), + "constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), "result": ResultSpec(path="/noop-results"), } @@ -181,9 +213,9 @@ perf_task_spec_args = { priority=Priority.normal, preemptible=True, ), - # Need to reserve all 8 gpus for performance spec or else benchmark results can be off - "resources": TaskResources(gpu_count=8), - "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), + # Need to reserve all 8 gpus for performance spec or else benchmark results can be off (1 for b200 mode) + "resources": TaskResources(gpu_count=1 if b200_mode else 8), + "constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), "result": ResultSpec(path="/noop-results"), } @@ -208,13 +240,27 @@ EOF # Run the Python script to create the experiments echo "Creating Beaker experiments..." + +# Build command with appropriate arguments +CMD="$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH" + if [ -n "$MODEL" ]; then echo "Using model: $MODEL" - $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH "$MODEL" -else - $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH + CMD="$CMD $MODEL" fi +if [ -n "$B200_MODE" ]; then + echo "Using B200 mode: ai2/titan-cirrascale cluster with 1 GPU for perf task" + CMD="$CMD --b200" +fi + +if [ -n "$BENCH_BRANCH" ]; then + echo "Using bench branch: $BENCH_BRANCH" + CMD="$CMD --benchbranch $BENCH_BRANCH" +fi + +eval $CMD + # Clean up temporary file rm /tmp/run_benchmark_experiment.py From ad33672781f7cef03949d36e0aab46545d0365b7 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 25 Aug 2025 21:04:53 +0000 Subject: [PATCH 07/40] fix --- scripts/run_benchmark.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index 8725cb7..e0eb6e4 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -147,7 +147,8 @@ commands.extend([ "cd olmOCR-bench && git lfs pull && cd ..", pipeline_cmd, "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/", - "aws s3 cp --recursive localworkspace/ s3://ai2-oe-data/jakep/olmocr-bench-runs/$BEAKER_WORKLOAD_ID/", + "pip install s5cmd", + "s5cmd cp localworkspace/ s3://ai2-oe-data/jakep/olmocr-bench-runs/$BEAKER_WORKLOAD_ID/", "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data" ]) From 6be12c2e06fe6036a968bfd05daca2ca29e4d3ed Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 25 Aug 2025 22:01:24 +0000 Subject: [PATCH 08/40] Baseline tests for blanks --- olmocr/bench/tests.py | 75 ++++++++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 19 deletions(-) diff --git a/olmocr/bench/tests.py b/olmocr/bench/tests.py index 320d31a..dc461f0 100644 --- a/olmocr/bench/tests.py +++ b/olmocr/bench/tests.py @@ -5,7 +5,7 @@ import unicodedata from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import asdict, dataclass, field from enum import Enum -from typing import List, Optional, Set, Tuple +from typing import Dict, List, Optional, Set, 
Tuple, Union import numpy as np from bs4 import BeautifulSoup @@ -130,7 +130,7 @@ def normalize_text(md_content: str) -> str: md_content = re.sub(r"\*(.*?)\*", r"\1", md_content) md_content = re.sub(r"_(.*?)_", r"\1", md_content) - # Convert down to a consistent unicode form, so é == e + accent, unicode forms + # Convert down to a consistent unicode form, so é == e + accent, unicode forms md_content = unicodedata.normalize("NFC", md_content) # Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too @@ -867,11 +867,22 @@ class BaselineTest(BasePDFTest): """ + max_length: Optional[int] = None # Used to implement blank page checks + max_repeats: int = 30 check_disallowed_characters: bool = True def run(self, content: str) -> Tuple[bool, str]: - if len("".join(c for c in content if c.isalnum()).strip()) == 0: + base_content_len = len("".join(c for c in content if c.isalnum()).strip()) + + # If this a blank page check, then it short circuits the rest of the checks + if self.max_length is not None: + if base_content_len > self.max_length: + return False, f"{base_content_len} characters were output for a page we expected to be blank" + else: + return True, "" + + if base_content_len == 0: return False, "The text contains no alpha numeric characters" # Makes sure that the content has no egregious repeated ngrams at the end, which indicate a degradation of quality @@ -965,6 +976,45 @@ class MathTest(BasePDFTest): return False, f"No match found for {self.math} anywhere in content" +def load_single_test(data: Union[str, Dict]) -> BasePDFTest: + """ + Load a single test from a JSON line string or JSON object. + + Args: + data: Either a JSON string to parse or a dictionary containing test data. + + Returns: + A test object of the appropriate type. + + Raises: + ValidationError: If the test type is unknown or data is invalid. + json.JSONDecodeError: If the string cannot be parsed as JSON. + """ + # Handle JSON string input + if isinstance(data, str): + data = data.strip() + if not data: + raise ValueError("Empty string provided") + data = json.loads(data) + + # Process the test data + test_type = data.get("type") + if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}: + test = TextPresenceTest(**data) + elif test_type == TestType.ORDER.value: + test = TextOrderTest(**data) + elif test_type == TestType.TABLE.value: + test = TableTest(**data) + elif test_type == TestType.MATH.value: + test = MathTest(**data) + elif test_type == TestType.BASELINE.value: + test = BaselineTest(**data) + else: + raise ValidationError(f"Unknown test type: {test_type}") + + return test + + def load_tests(jsonl_file: str) -> List[BasePDFTest]: """ Load tests from a JSONL file using parallel processing with a ThreadPoolExecutor. @@ -976,7 +1026,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]: A list of test objects. """ - def process_line(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]: + def process_line_with_number(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]: """ Process a single line from the JSONL file and return a tuple of (line_number, test object). Returns None for empty lines. 
@@ -987,20 +1037,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]: return None try: - data = json.loads(line) - test_type = data.get("type") - if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}: - test = TextPresenceTest(**data) - elif test_type == TestType.ORDER.value: - test = TextOrderTest(**data) - elif test_type == TestType.TABLE.value: - test = TableTest(**data) - elif test_type == TestType.MATH.value: - test = MathTest(**data) - elif test_type == TestType.BASELINE.value: - test = BaselineTest(**data) - else: - raise ValidationError(f"Unknown test type: {test_type}") + test = load_single_test(line) return (line_number, test) except json.JSONDecodeError as e: print(f"Error parsing JSON on line {line_number}: {e}") @@ -1021,7 +1058,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]: # Use a ThreadPoolExecutor to process each line in parallel. with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 1, 64)) as executor: # Submit all tasks concurrently. - futures = {executor.submit(process_line, item): item[0] for item in lines} + futures = {executor.submit(process_line_with_number, item): item[0] for item in lines} # Use tqdm to show progress as futures complete. for future in tqdm(as_completed(futures), total=len(futures), desc="Loading tests"): result = future.result() From 3eec58012c7b96f0880a8ec5f89b2bf5af2655a2 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Tue, 26 Aug 2025 17:52:50 +0000 Subject: [PATCH 09/40] Docker ignore --- .dockerignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index 6f6d633..919c2da 100644 --- a/.dockerignore +++ b/.dockerignore @@ -9,6 +9,9 @@ __pycache__ # Let's not copy any bash scripts from the scripts folder over, otherwise trashing the docker image too much with recent changes scripts/*.sh +scripts/**/*.sh # Nor copy any olmocr bench files -olmOCR-bench/ \ No newline at end of file +olmOCR-bench/ +olmOCR-bench*/ +html_templates*/ \ No newline at end of file From 03c7479a17014f4e104756c863a22678c9ae26c2 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Wed, 27 Aug 2025 16:33:37 +0000 Subject: [PATCH 10/40] VLLM version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 163c5eb..694c0de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ Changelog = "https://github.com/allenai/olmocr/blob/main/CHANGELOG.md" [project.optional-dependencies] gpu = [ - "vllm==0.10.0" + "vllm==0.10.1.1" ] dev = [ From 27792664bfdb5c6a2a1a457206ff396edd7ac5e7 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Wed, 27 Aug 2025 16:35:51 +0000 Subject: [PATCH 11/40] Transformers version bump needed also --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 694c0de..370be61 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "boto3", "httpx", "torch>=2.7.0", - "transformers==4.53.2", + "transformers==4.55.2", "img2pdf", "beaker-py", ] From edd098093b31de31431e2c9a95ad4b53f7b4efe2 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Wed, 27 Aug 2025 18:55:26 +0000 Subject: [PATCH 12/40] Reverting version changes that broke, vllm 0.10.1 is not good --- olmOCR-bench-0825 | 1 + olmOCR-bench-snapshot-082225 | 1 + pyproject.toml | 4 ++-- 3 files changed, 4 insertions(+), 2 deletions(-) create mode 160000 olmOCR-bench-0825 create mode 160000 olmOCR-bench-snapshot-082225 diff --git a/olmOCR-bench-0825 
b/olmOCR-bench-0825 new file mode 160000 index 0000000..a0100ab --- /dev/null +++ b/olmOCR-bench-0825 @@ -0,0 +1 @@ +Subproject commit a0100ab4cce52d7419cc09cce21aa42226118df2 diff --git a/olmOCR-bench-snapshot-082225 b/olmOCR-bench-snapshot-082225 new file mode 160000 index 0000000..eaa8289 --- /dev/null +++ b/olmOCR-bench-snapshot-082225 @@ -0,0 +1 @@ +Subproject commit eaa828947384ffce68f08c223a0f5f4e2f2df624 diff --git a/pyproject.toml b/pyproject.toml index 370be61..163c5eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "boto3", "httpx", "torch>=2.7.0", - "transformers==4.55.2", + "transformers==4.53.2", "img2pdf", "beaker-py", ] @@ -51,7 +51,7 @@ Changelog = "https://github.com/allenai/olmocr/blob/main/CHANGELOG.md" [project.optional-dependencies] gpu = [ - "vllm==0.10.1.1" + "vllm==0.10.0" ] dev = [ From f3cdc78b4f4a524889e3f9a48bd3b99330c928aa Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Sun, 31 Aug 2025 03:12:30 +0000 Subject: [PATCH 13/40] Pushing new version --- olmocr/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/olmocr/version.py b/olmocr/version.py index bf1c1af..79a242e 100644 --- a/olmocr/version.py +++ b/olmocr/version.py @@ -2,7 +2,7 @@ _MAJOR = "0" _MINOR = "3" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "3" +_PATCH = "4" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = "" From 56b08d5aa4edd9de841b24184411a90e92a9e065 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Sun, 31 Aug 2025 03:12:39 +0000 Subject: [PATCH 14/40] Bump version to v0.3.4 for release --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 86f22d1..f711eb7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +## [v0.3.4](https://github.com/allenai/olmocr/releases/tag/v0.3.4) - 2025-08-31 + ## [v0.3.3](https://github.com/allenai/olmocr/releases/tag/v0.3.3) - 2025-08-15 ## [v0.3.2](https://github.com/allenai/olmocr/releases/tag/v0.3.2) - 2025-08-14 From c720c02d832d0e56e81fad7b83022996254c7040 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Tue, 2 Sep 2025 06:45:24 +0000 Subject: [PATCH 15/40] Cleaning up repo a bit --- .gitignore | 2 ++ olmOCR-bench-0825 | 1 - olmOCR-bench-snapshot-082225 | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) delete mode 160000 olmOCR-bench-0825 delete mode 160000 olmOCR-bench-snapshot-082225 diff --git a/.gitignore b/.gitignore index 8f9d32b..1b40340 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,8 @@ old_train/ gpt4otestset_output/* pdfs/* olmOCR-bench/* +olmOCR-bench-0825/ +olmOCR-bench-snapshot-082225/ table_data*/ /synth*/ dolma_samples/* diff --git a/olmOCR-bench-0825 b/olmOCR-bench-0825 deleted file mode 160000 index a0100ab..0000000 --- a/olmOCR-bench-0825 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a0100ab4cce52d7419cc09cce21aa42226118df2 diff --git a/olmOCR-bench-snapshot-082225 b/olmOCR-bench-snapshot-082225 deleted file mode 160000 index eaa8289..0000000 --- a/olmOCR-bench-snapshot-082225 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit eaa828947384ffce68f08c223a0f5f4e2f2df624 From 8f88a98e5d10d9b2fc8bdba18a88f2fb62ce78fc Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 4 Sep 2025 22:15:55 +0000 Subject: [PATCH 16/40] prepare checkpoint script fixes 
--- olmocr/train/prepare_checkpoint.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/olmocr/train/prepare_checkpoint.py b/olmocr/train/prepare_checkpoint.py index 1ebe4c6..e8b6443 100755 --- a/olmocr/train/prepare_checkpoint.py +++ b/olmocr/train/prepare_checkpoint.py @@ -31,6 +31,7 @@ Examples: import argparse import concurrent.futures +import fnmatch import json import os import shutil @@ -59,11 +60,20 @@ TOKENIZER_FILES = ["chat_template.json", "merges.txt", "preprocessor_config.json SUPPORTED_ARCHITECTURES = ["Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration"] # Files to exclude from copying (training-related files) -EXCLUDED_FILES = {"optimizer.pt", "scheduler.pt", "rng_state.pth", "trainer_state.json", "training_args.bin"} +# Supports exact matches and glob patterns +EXCLUDED_FILES = {"optimizer.pt", "scheduler.pt", "rng_state.pth", "trainer_state.json", "training_args.bin", "*.pt", "*.pth"} s3_client = boto3.client("s3") +def should_exclude_file(filename: str) -> bool: + """Check if a file should be excluded based on EXCLUDED_FILES patterns.""" + for pattern in EXCLUDED_FILES: + if fnmatch.fnmatch(filename, pattern): + return True + return False + + def is_s3_path(path: str) -> bool: """Check if a path is an S3 path.""" return path.startswith("s3://") @@ -123,7 +133,7 @@ def copy_local_to_local(source_dir: str, dest_dir: str) -> None: files_to_copy = [] for root, _, files in os.walk(source_dir): for file in files: - if file in EXCLUDED_FILES: + if should_exclude_file(file): print(f"Skipping excluded file: {file}") continue src_path = os.path.join(root, file) @@ -164,7 +174,7 @@ def copy_s3_to_local(source_bucket: str, source_prefix: str, dest_dir: str) -> N continue filename = os.path.basename(key) - if filename in EXCLUDED_FILES: + if should_exclude_file(filename): print(f"Skipping excluded file: {filename}") continue @@ -187,7 +197,7 @@ def copy_local_to_s3(source_dir: str, dest_bucket: str, dest_prefix: str) -> Non upload_tasks = [] for root, _, files in os.walk(source_dir): for file in files: - if file in EXCLUDED_FILES: + if should_exclude_file(file): print(f"Skipping excluded file: {file}") continue local_path = os.path.join(root, file) @@ -218,7 +228,7 @@ def copy_s3_to_s3(source_bucket: str, source_prefix: str, dest_bucket: str, dest continue filename = os.path.basename(key) - if filename in EXCLUDED_FILES: + if should_exclude_file(filename): print(f"Skipping excluded file: {filename}") continue From fe425fde209b14d2801f885c45e9ec129f16c586 Mon Sep 17 00:00:00 2001 From: Charitarth Chugh <37895518+charitarthchugh@users.noreply.github.com> Date: Thu, 25 Sep 2025 14:29:49 -0400 Subject: [PATCH 17/40] Add chunked prefill and limit mm per prompt options --- olmocr/pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 04a2170..65ea7f1 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -599,6 +599,8 @@ async def vllm_server_task(model_name_or_path, args, semaphore, unknown_args=Non str(args.tensor_parallel_size), "--data-parallel-size", str(args.data_parallel_size), + "--enable-chunked-prefill", + "--limit-mm-per-prompt '{\"video\": 0}'" ] if args.gpu_memory_utilization is not None: From 7fe3f65de7da70e0bedf42462dc2abadf9750499 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 11:06:51 -0700 Subject: [PATCH 18/40] added support for deepinfra --- DEEPINFRA_SETUP.md | 64 +++++++++++++++++++++++++++++++++++++ olmocr/pipeline.py | 78 
+++++++++++++++++++++++++++++++++++----------- 2 files changed, 124 insertions(+), 18 deletions(-) create mode 100644 DEEPINFRA_SETUP.md diff --git a/DEEPINFRA_SETUP.md b/DEEPINFRA_SETUP.md new file mode 100644 index 0000000..5e18381 --- /dev/null +++ b/DEEPINFRA_SETUP.md @@ -0,0 +1,64 @@ +# Using olmOCR with DeepInfra + +This guide explains how to use olmOCR with DeepInfra's hosted API service for cloud-based inference. + +## Prerequisites + +1. **DeepInfra Account**: Sign up at https://deepinfra.com/ +2. **API Key**: Get your API key from the DeepInfra dashboard +3. **olmOCR**: Ensure you have the modified version with authentication support + +## Setup + +### 1. Get your DeepInfra API Key + +1. Log in to https://deepinfra.com/ +2. Navigate to your dashboard +3. Generate or copy your API key +4. Store it securely (recommended: as an environment variable) + +```bash +export DEEPINFRA_API_KEY="your-api-key-here" +``` + +### 2. Usage + +Run olmOCR with the DeepInfra server endpoint: + +```bash +python -m olmocr.pipeline ./localworkspace \ + --server https://api.deepinfra.com/v1/openai \ + --api_key $DEEPINFRA_API_KEY \ + --model allenai/olmOCR-7B-0725-FP8 \ + --markdown \ + --pdfs path/to/your/*.pdf +``` + +### Command Line Arguments + +- `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai` +- `--api_key`: Your DeepInfra API key (or use environment variable) +- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8` +- Other arguments work the same as with local inference + +### Example with S3 Storage + +For large-scale processing with S3: + +```bash +python -m olmocr.pipeline s3://your-bucket/workspace \ + --server https://api.deepinfra.com/v1/openai \ + --api_key $DEEPINFRA_API_KEY \ + --model allenai/olmOCR-7B-0725-FP8 \ + --pdfs s3://your-bucket/pdfs/*.pdf \ + --workers 10 \ + --markdown +``` + +## Pricing + +As of 2024, DeepInfra charges for the olmOCR model: +- Input tokens: ~$0.27 per million tokens +- Output tokens: ~$0.81 per million tokens + +Check current pricing at: https://deepinfra.com/pricing \ No newline at end of file diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 04a2170..2b57a94 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -11,6 +11,7 @@ import os import random import re import shutil +import ssl import sys import tempfile import time @@ -104,7 +105,7 @@ class PageResult: is_fallback: bool -async def build_page_query(local_pdf_path: str, page: int, target_longest_image_dim: int, image_rotation: int = 0) -> dict: +async def build_page_query(local_pdf_path: str, page: int, target_longest_image_dim: int, image_rotation: int = 0, model_name: str = "olmocr") -> dict: MAX_TOKENS = 4500 assert image_rotation in [0, 90, 180, 270], "Invalid image rotation provided in build_page_query" @@ -132,7 +133,7 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_ image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") return { - "model": "olmocr", + "model": model_name, "messages": [ { "role": "user", @@ -151,25 +152,44 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_ # It feels strange perhaps, but httpx and aiohttp are very complex beasts # Ex. 
the sessionpool in httpcore has 4 different locks in it, and I've noticed # that at the scale of 100M+ requests, that they deadlock in different strange ways -async def apost(url, json_data): +async def apost(url, json_data, api_key=None): parsed_url = urlparse(url) host = parsed_url.hostname - port = parsed_url.port or 80 + # Default to 443 for HTTPS, 80 for HTTP + if parsed_url.scheme == 'https': + port = parsed_url.port or 443 + use_ssl = True + else: + port = parsed_url.port or 80 + use_ssl = False path = parsed_url.path or "/" writer = None try: - reader, writer = await asyncio.open_connection(host, port) + if use_ssl: + ssl_context = ssl.create_default_context() + reader, writer = await asyncio.open_connection(host, port, ssl=ssl_context) + else: + reader, writer = await asyncio.open_connection(host, port) json_payload = json.dumps(json_data) - request = ( - f"POST {path} HTTP/1.1\r\n" - f"Host: {host}\r\n" - f"Content-Type: application/json\r\n" - f"Content-Length: {len(json_payload)}\r\n" - f"Connection: close\r\n\r\n" - f"{json_payload}" - ) + + # Build request headers + headers = [ + f"POST {path} HTTP/1.1", + f"Host: {host}", + f"Content-Type: application/json", + f"Content-Length: {len(json_payload)}", + ] + + # Add Authorization header if API key is provided + if api_key: + headers.append(f"Authorization: Bearer {api_key}") + + headers.append("Connection: close") + + # Construct the full request + request = "\r\n".join(headers) + "\r\n\r\n" + json_payload writer.write(request.encode()) await writer.drain() @@ -214,7 +234,13 @@ async def apost(url, json_data): async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: str, page_num: int) -> PageResult: if args.server: - COMPLETION_URL = f"{args.server.rstrip('/')}/v1/chat/completions" + server_url = args.server.rstrip('/') + # Check if the server URL already contains '/v1/openai' (DeepInfra case) + if '/v1/openai' in server_url: + COMPLETION_URL = f"{server_url}/chat/completions" + else: + COMPLETION_URL = f"{server_url}/v1/chat/completions" + logger.debug(f"Using completion URL: {COMPLETION_URL}") else: COMPLETION_URL = f"http://localhost:{BASE_SERVER_PORT}/v1/chat/completions" MAX_RETRIES = args.max_page_retries @@ -227,11 +253,14 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: while attempt < MAX_RETRIES: lookup_attempt = min(attempt, len(TEMPERATURE_BY_ATTEMPT) - 1) + # Use the model name from args if provided, otherwise default to 'olmocr' + model_name = getattr(args, 'model', 'olmocr') if args.server else 'olmocr' query = await build_page_query( pdf_local_path, page_num, args.target_longest_image_dim, image_rotation=cumulative_rotation, + model_name=model_name, ) # Change temperature as number of attempts increases to overcome repetition issues at expense of quality query["temperature"] = TEMPERATURE_BY_ATTEMPT[lookup_attempt] @@ -245,7 +274,9 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: logger.debug(f"Built page query for {pdf_orig_path}-{page_num}") try: - status_code, response_body = await apost(COMPLETION_URL, json_data=query) + # Pass API key if provided + api_key = getattr(args, 'api_key', None) + status_code, response_body = await apost(COMPLETION_URL, json_data=query, api_key=api_key) if status_code == 400: raise ValueError(f"Got BadRequestError from server: {response_body}, skipping this response") @@ -737,14 +768,24 @@ async def vllm_server_ready(args): max_attempts = 300 delay_sec = 1 if args.server: - url = 
f"{args.server.rstrip('/')}/v1/models" + # Check if the server URL already contains '/v1/openai' (DeepInfra case) + server_url = args.server.rstrip('/') + if '/v1/openai' in server_url: + url = f"{server_url}/models" + else: + url = f"{server_url}/v1/models" else: url = f"http://localhost:{BASE_SERVER_PORT}/v1/models" for attempt in range(1, max_attempts + 1): try: + # Add authentication headers if API key is provided + headers = {} + if args.server and hasattr(args, 'api_key') and args.api_key: + headers['Authorization'] = f'Bearer {args.api_key}' + async with httpx.AsyncClient() as session: - response = await session.get(url) + response = await session.get(url, headers=headers) if response.status_code == 200: logger.info("vllm server is ready.") @@ -1064,7 +1105,8 @@ async def main(): parser.add_argument("--target_longest_image_dim", type=int, help="Dimension on longest side to use for rendering the pdf pages", default=1288) parser.add_argument("--target_anchor_text_len", type=int, help="Maximum amount of anchor text to use (characters), not used for new models", default=-1) parser.add_argument("--guided_decoding", action="store_true", help="Enable guided decoding for model YAML type outputs") - + parser.add_argument('--api_key', type=str, default=None, help='API key for authenticated remote servers (e.g., DeepInfra)') + vllm_group = parser.add_argument_group( "VLLM arguments", "These arguments are passed to vLLM. Any unrecognized arguments are also automatically forwarded to vLLM." ) From 2a5792e5ed9044f14577706f1e4ca942448a5bbf Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 13:29:48 -0700 Subject: [PATCH 19/40] add if else for vllm local usage bug for api argument --- olmocr/pipeline.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 2b57a94..90ffce0 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -253,8 +253,13 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: while attempt < MAX_RETRIES: lookup_attempt = min(attempt, len(TEMPERATURE_BY_ATTEMPT) - 1) - # Use the model name from args if provided, otherwise default to 'olmocr' - model_name = getattr(args, 'model', 'olmocr') if args.server else 'olmocr' + # For external servers (like DeepInfra), use the model name from args + # For local inference, always use 'olmocr' + if args.server and hasattr(args, 'model'): + model_name = args.model + else: + model_name = 'olmocr' + query = await build_page_query( pdf_local_path, page_num, @@ -274,8 +279,11 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: logger.debug(f"Built page query for {pdf_orig_path}-{page_num}") try: - # Pass API key if provided - api_key = getattr(args, 'api_key', None) + # Pass API key only for external servers that need authentication + if args.server and hasattr(args, 'api_key'): + api_key = args.api_key + else: + api_key = None status_code, response_body = await apost(COMPLETION_URL, json_data=query, api_key=api_key) if status_code == 400: From 90589e16de31dfc742bd53959100eaa2bea9824d Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 13:56:34 -0700 Subject: [PATCH 20/40] Added deepinfra usage to readme --- DEEPINFRA_SETUP.md | 64 ---------------------------------------------- README.md | 20 +++++++++++++++ 2 files changed, 20 insertions(+), 64 deletions(-) delete mode 100644 DEEPINFRA_SETUP.md diff --git a/DEEPINFRA_SETUP.md b/DEEPINFRA_SETUP.md deleted file mode 100644 index 
5e18381..0000000 --- a/DEEPINFRA_SETUP.md +++ /dev/null @@ -1,64 +0,0 @@ -# Using olmOCR with DeepInfra - -This guide explains how to use olmOCR with DeepInfra's hosted API service for cloud-based inference. - -## Prerequisites - -1. **DeepInfra Account**: Sign up at https://deepinfra.com/ -2. **API Key**: Get your API key from the DeepInfra dashboard -3. **olmOCR**: Ensure you have the modified version with authentication support - -## Setup - -### 1. Get your DeepInfra API Key - -1. Log in to https://deepinfra.com/ -2. Navigate to your dashboard -3. Generate or copy your API key -4. Store it securely (recommended: as an environment variable) - -```bash -export DEEPINFRA_API_KEY="your-api-key-here" -``` - -### 2. Usage - -Run olmOCR with the DeepInfra server endpoint: - -```bash -python -m olmocr.pipeline ./localworkspace \ - --server https://api.deepinfra.com/v1/openai \ - --api_key $DEEPINFRA_API_KEY \ - --model allenai/olmOCR-7B-0725-FP8 \ - --markdown \ - --pdfs path/to/your/*.pdf -``` - -### Command Line Arguments - -- `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai` -- `--api_key`: Your DeepInfra API key (or use environment variable) -- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8` -- Other arguments work the same as with local inference - -### Example with S3 Storage - -For large-scale processing with S3: - -```bash -python -m olmocr.pipeline s3://your-bucket/workspace \ - --server https://api.deepinfra.com/v1/openai \ - --api_key $DEEPINFRA_API_KEY \ - --model allenai/olmOCR-7B-0725-FP8 \ - --pdfs s3://your-bucket/pdfs/*.pdf \ - --workers 10 \ - --markdown -``` - -## Pricing - -As of 2024, DeepInfra charges for the olmOCR model: -- Input tokens: ~$0.27 per million tokens -- Output tokens: ~$0.81 per million tokens - -Check current pricing at: https://deepinfra.com/pricing \ No newline at end of file diff --git a/README.md b/README.md index 024b641..e385e70 100644 --- a/README.md +++ b/README.md @@ -249,6 +249,26 @@ For example: ```bash python -m olmocr.pipeline s3://my_s3_bucket/pdfworkspaces/exampleworkspace --pdfs s3://my_s3_bucket/jakep/gnarly_pdfs/*.pdf --beaker --beaker_gpus 4 ``` +### Using DeepInfra +Signup at [DeepInfra](https://deepinfra.com/) and get your API key from the DeepInfra dashboard. +Store the API key as an environment variable. 
+```bash +export DEEPINFRA_API_KEY="your-api-key-here" +``` +#### Run olmOCR with the DeepInfra server endpoint: +```bash +python -m olmocr.pipeline ./localworkspace \ + --server https://api.deepinfra.com/v1/openai \ + --api_key $DEEPINFRA_API_KEY \ + --model allenai/olmOCR-7B-0725-FP8 \ + --markdown \ + --pdfs path/to/your/*.pdf +``` +- `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai` +- `--api_key`: Your DeepInfra API key +- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8` +- Other arguments work the same as with local inference + ### Using Docker From e7ae5e6240c34a22bb32ba51fd59743c6be784f5 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 13:58:34 -0700 Subject: [PATCH 21/40] fixed style --- olmocr/pipeline.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 90ffce0..b64fb60 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -156,7 +156,7 @@ async def apost(url, json_data, api_key=None): parsed_url = urlparse(url) host = parsed_url.hostname # Default to 443 for HTTPS, 80 for HTTP - if parsed_url.scheme == 'https': + if parsed_url.scheme == "https": port = parsed_url.port or 443 use_ssl = True else: @@ -234,9 +234,9 @@ async def apost(url, json_data, api_key=None): async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: str, page_num: int) -> PageResult: if args.server: - server_url = args.server.rstrip('/') + server_url = args.server.rstrip("/") # Check if the server URL already contains '/v1/openai' (DeepInfra case) - if '/v1/openai' in server_url: + if "/v1/openai" in server_url: COMPLETION_URL = f"{server_url}/chat/completions" else: COMPLETION_URL = f"{server_url}/v1/chat/completions" @@ -255,10 +255,10 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: lookup_attempt = min(attempt, len(TEMPERATURE_BY_ATTEMPT) - 1) # For external servers (like DeepInfra), use the model name from args # For local inference, always use 'olmocr' - if args.server and hasattr(args, 'model'): + if args.server and hasattr(args, "model"): model_name = args.model else: - model_name = 'olmocr' + model_name = "olmocr" query = await build_page_query( pdf_local_path, @@ -280,7 +280,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: try: # Pass API key only for external servers that need authentication - if args.server and hasattr(args, 'api_key'): + if args.server and hasattr(args, "api_key"): api_key = args.api_key else: api_key = None @@ -777,8 +777,8 @@ async def vllm_server_ready(args): delay_sec = 1 if args.server: # Check if the server URL already contains '/v1/openai' (DeepInfra case) - server_url = args.server.rstrip('/') - if '/v1/openai' in server_url: + server_url = args.server.rstrip("/") + if "/v1/openai" in server_url: url = f"{server_url}/models" else: url = f"{server_url}/v1/models" @@ -789,8 +789,8 @@ async def vllm_server_ready(args): try: # Add authentication headers if API key is provided headers = {} - if args.server and hasattr(args, 'api_key') and args.api_key: - headers['Authorization'] = f'Bearer {args.api_key}' + if args.server and hasattr(args, "api_key") and args.api_key: + headers["Authorization"] = f"Bearer {args.api_key}" async with httpx.AsyncClient() as session: response = await session.get(url, headers=headers) @@ -1113,8 +1113,8 @@ async def main(): parser.add_argument("--target_longest_image_dim", type=int, 
help="Dimension on longest side to use for rendering the pdf pages", default=1288) parser.add_argument("--target_anchor_text_len", type=int, help="Maximum amount of anchor text to use (characters), not used for new models", default=-1) parser.add_argument("--guided_decoding", action="store_true", help="Enable guided decoding for model YAML type outputs") - parser.add_argument('--api_key', type=str, default=None, help='API key for authenticated remote servers (e.g., DeepInfra)') - + parser.add_argument("--api_key", type=str, default=None, help="API key for authenticated remote servers (e.g., DeepInfra)") + vllm_group = parser.add_argument_group( "VLLM arguments", "These arguments are passed to vLLM. Any unrecognized arguments are also automatically forwarded to vLLM." ) From 556ff26d585c860194c887c5042eadf5d2878bfb Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 14:08:40 -0700 Subject: [PATCH 22/40] fixed lint, style, ruff --- olmocr/pipeline.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index b64fb60..1c5febb 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -174,7 +174,6 @@ async def apost(url, json_data, api_key=None): json_payload = json.dumps(json_data) - # Build request headers headers = [ f"POST {path} HTTP/1.1", f"Host: {host}", @@ -182,18 +181,15 @@ async def apost(url, json_data, api_key=None): f"Content-Length: {len(json_payload)}", ] - # Add Authorization header if API key is provided if api_key: headers.append(f"Authorization: Bearer {api_key}") headers.append("Connection: close") - # Construct the full request request = "\r\n".join(headers) + "\r\n\r\n" + json_payload writer.write(request.encode()) await writer.drain() - # Read status line status_line = await reader.readline() if not status_line: raise ConnectionError("No response from server") @@ -279,7 +275,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: logger.debug(f"Built page query for {pdf_orig_path}-{page_num}") try: - # Pass API key only for external servers that need authentication + # Passing API key only for external servers that need authentication if args.server and hasattr(args, "api_key"): api_key = args.api_key else: @@ -787,7 +783,6 @@ async def vllm_server_ready(args): for attempt in range(1, max_attempts + 1): try: - # Add authentication headers if API key is provided headers = {} if args.server and hasattr(args, "api_key") and args.api_key: headers["Authorization"] = f"Bearer {args.api_key}" From 359abef6547dec814c4cbaaa57b1ef4e26641888 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 14:19:22 -0700 Subject: [PATCH 23/40] updated pytests --- tests/test_pipeline.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 600753d..71a2f3c 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -268,9 +268,9 @@ This is the corrected text from the document.""" build_page_query_calls = [] original_build_page_query = build_page_query - async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0): + async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0, model_name="olmocr"): build_page_query_calls.append(image_rotation) - return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation) + return await original_build_page_query(local_pdf_path, page, 
target_longest_image_dim, image_rotation, model_name) with patch("olmocr.pipeline.apost", side_effect=mock_apost): with patch("olmocr.pipeline.tracker", mock_tracker): @@ -376,9 +376,9 @@ Document is now correctly oriented after 180 degree rotation.""" build_page_query_calls = [] original_build_page_query = build_page_query - async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0): + async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0, model_name="olmocr"): build_page_query_calls.append(image_rotation) - return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation) + return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation, model_name) with patch("olmocr.pipeline.apost", side_effect=mock_apost): with patch("olmocr.pipeline.tracker", mock_tracker): @@ -482,9 +482,9 @@ Document correctly oriented at 90 degrees total rotation.""" build_page_query_calls = [] original_build_page_query = build_page_query - async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0): + async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0, model_name="olmocr"): build_page_query_calls.append(image_rotation) - return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation) + return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation, model_name) with patch("olmocr.pipeline.apost", side_effect=mock_apost): with patch("olmocr.pipeline.tracker", mock_tracker): From f3c4073395a967e038b0ab092b1a6d8ed12adcb5 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 14:25:25 -0700 Subject: [PATCH 24/40] added Api_key argument to pipeline pytests --- tests/test_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 71a2f3c..1541639 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -209,7 +209,7 @@ class TestRotationCorrection: # Counter to track number of API calls call_count = 0 - async def mock_apost(url, json_data): + async def mock_apost(url, json_data, api_key=None): nonlocal call_count call_count += 1 @@ -311,7 +311,7 @@ This is the corrected text from the document.""" # Counter to track number of API calls call_count = 0 - async def mock_apost(url, json_data): + async def mock_apost(url, json_data, api_key=None): nonlocal call_count call_count += 1 @@ -420,7 +420,7 @@ Document is now correctly oriented after 180 degree rotation.""" # Counter to track number of API calls call_count = 0 - async def mock_apost(url, json_data): + async def mock_apost(url, json_data, api_key=None): nonlocal call_count call_count += 1 From 9c750903cef946c6cdaff9e5ec4197c831cde970 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 29 Sep 2025 17:06:14 +0000 Subject: [PATCH 25/40] Ignore files --- .dockerignore | 3 ++- .gitignore | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.dockerignore b/.dockerignore index 919c2da..0f3fbb2 100644 --- a/.dockerignore +++ b/.dockerignore @@ -14,4 +14,5 @@ scripts/**/*.sh # Nor copy any olmocr bench files olmOCR-bench/ olmOCR-bench*/ -html_templates*/ \ No newline at end of file +html_templates*/ +olmocr-synthmix-*/ \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1b40340..77b9cb3 100644 --- a/.gitignore +++ b/.gitignore 
@@ -18,8 +18,6 @@ old_train/
 gpt4otestset_output/*
 pdfs/*
 olmOCR-bench/*
-olmOCR-bench-0825/
-olmOCR-bench-snapshot-082225/
 table_data*/
 /synth*/
 dolma_samples/*
@@ -29,6 +27,7 @@ filtered_items_prefilter/
 augraphy_cache/
 /*.html
 html_templates*/
+olmocr-synthmix*/
 scoreelo.csv
 debug.log
 birrpipeline-debug.log

From 0c6d88986352862381b29a4d2396aea6f8f1a7df Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Mon, 29 Sep 2025 17:26:22 +0000
Subject: [PATCH 26/40] Adding retry code on 429 errors from external providers

---
 README.md          | 2 ++
 olmocr/pipeline.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/README.md b/README.md
index e385e70..23a87a4 100644
--- a/README.md
+++ b/README.md
@@ -260,12 +260,14 @@ export DEEPINFRA_API_KEY="your-api-key-here"
 python -m olmocr.pipeline ./localworkspace \
     --server https://api.deepinfra.com/v1/openai \
     --api_key $DEEPINFRA_API_KEY \
+    --pages_per_group 100 \
     --model allenai/olmOCR-7B-0725-FP8 \
     --markdown \
     --pdfs path/to/your/*.pdf
 ```
 - `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai`
 - `--api_key`: Your DeepInfra API key
+- `--pages_per_group`: You may want a smaller number of pages per group as many external providers have lower concurrent request limits
 - `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8`
 - Other arguments work the same as with local inference
 
diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py
index 1c5febb..fb3893f 100644
--- a/olmocr/pipeline.py
+++ b/olmocr/pipeline.py
@@ -284,6 +284,8 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
 
         if status_code == 400:
             raise ValueError(f"Got BadRequestError from server: {response_body}, skipping this response")
+        elif status_code == 429:
+            raise ConnectionError(f"Too many requests, doing exponential backoff")
         elif status_code == 500:
             raise ValueError(f"Got InternalServerError from server: {response_body}, skipping this response")
         elif status_code != 200:

From a0bc5a46908d9f32d9c8360c7de1a07ca7588a25 Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Mon, 29 Sep 2025 17:29:28 +0000
Subject: [PATCH 27/40] Deepinfra readme

---
 README.md | 44 +++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 23a87a4..d12af55 100644
--- a/README.md
+++ b/README.md
@@ -210,6 +210,29 @@ The served model name should be `olmocr`. An example vLLM launch command would b
 vllm serve allenai/olmOCR-7B-0825-FP8 --served-model-name olmocr --max-model-len 16384
 ```
 
+#### Run olmOCR with the DeepInfra server endpoint:
+Signup at [DeepInfra](https://deepinfra.com/) and get your API key from the DeepInfra dashboard.
+Store the API key as an environment variable.
+```bash
+export DEEPINFRA_API_KEY="your-api-key-here"
+```
+
+```bash
+python -m olmocr.pipeline ./localworkspace \
+    --server https://api.deepinfra.com/v1/openai \
+    --api_key $DEEPINFRA_API_KEY \
+    --pages_per_group 100 \
+    --model allenai/olmOCR-7B-0725-FP8 \
+    --markdown \
+    --pdfs path/to/your/*.pdf
+```
+- `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai`
+- `--api_key`: Your DeepInfra API key
+- `--pages_per_group`: You may want a smaller number of pages per group as many external providers have lower concurrent request limits
+- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8`
+- Other arguments work the same as with local inference
+
+
 #### Viewing Results
 
 The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
@@ -249,27 +272,6 @@ For example:
 ```bash
 python -m olmocr.pipeline s3://my_s3_bucket/pdfworkspaces/exampleworkspace --pdfs s3://my_s3_bucket/jakep/gnarly_pdfs/*.pdf --beaker --beaker_gpus 4
 ```
-### Using DeepInfra
-Signup at [DeepInfra](https://deepinfra.com/) and get your API key from the DeepInfra dashboard.
-Store the API key as an environment variable.
-```bash
-export DEEPINFRA_API_KEY="your-api-key-here"
-```
-#### Run olmOCR with the DeepInfra server endpoint:
-```bash
-python -m olmocr.pipeline ./localworkspace \
-    --server https://api.deepinfra.com/v1/openai \
-    --api_key $DEEPINFRA_API_KEY \
-    --pages_per_group 100 \
-    --model allenai/olmOCR-7B-0725-FP8 \
-    --markdown \
-    --pdfs path/to/your/*.pdf
-```
-- `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai`
-- `--api_key`: Your DeepInfra API key
-- `--pages_per_group`: You may want a smaller number of pages per group as many external providers have lower concurrent request limits
-- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8`
-- Other arguments work the same as with local inference
 
 ### Using Docker
 
From c587eb90506c3ca8093fa85b1f5698b5c5cf5ee1 Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Mon, 29 Sep 2025 17:36:41 +0000
Subject: [PATCH 28/40] Ugh, release script adds all files by default

---
 olmocr/version.py  | 2 +-
 scripts/release.sh | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/olmocr/version.py b/olmocr/version.py
index 79a242e..cee5284 100644
--- a/olmocr/version.py
+++ b/olmocr/version.py
@@ -2,7 +2,7 @@ _MAJOR = "0"
 _MINOR = "3"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "4"
+_PATCH = "6"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
diff --git a/scripts/release.sh b/scripts/release.sh
index dc5ab60..718dc29 100755
--- a/scripts/release.sh
+++ b/scripts/release.sh
@@ -68,7 +68,6 @@ read -p "Creating new release for $TAG. Do you want to continue? 
[Y/n] " prompt if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then python scripts/prepare_changelog.py - git add -A git commit -m "Bump version to $TAG for release" || true && git push echo "Creating new git tag $TAG" git tag "$TAG" -m "$TAG" From fb1ef9e38af7f9dcfae2b8d8cf77663224c43f8a Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 29 Sep 2025 17:37:14 +0000 Subject: [PATCH 29/40] Release script fix --- scripts/release.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/release.sh b/scripts/release.sh index 718dc29..ef30083 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -68,6 +68,7 @@ read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then python scripts/prepare_changelog.py + git add CHANGELOG.md git commit -m "Bump version to $TAG for release" || true && git push echo "Creating new git tag $TAG" git tag "$TAG" -m "$TAG" From 8982bae756c738ad63962b0a60c304b037665197 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 29 Sep 2025 17:37:25 +0000 Subject: [PATCH 30/40] Bump version to v0.3.6 for release --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f711eb7..09fd707 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +## [v0.3.6](https://github.com/allenai/olmocr/releases/tag/v0.3.6) - 2025-09-29 + ## [v0.3.4](https://github.com/allenai/olmocr/releases/tag/v0.3.4) - 2025-08-31 ## [v0.3.3](https://github.com/allenai/olmocr/releases/tag/v0.3.3) - 2025-08-15 From f4356de0910ad3243ca57ced27ef730ec5434150 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 29 Sep 2025 17:56:03 +0000 Subject: [PATCH 31/40] deepinfra readme improved --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d12af55..08c0e81 100644 --- a/README.md +++ b/README.md @@ -222,14 +222,14 @@ python -m olmocr.pipeline ./localworkspace \ --server https://api.deepinfra.com/v1/openai \ --api_key $DEEPINFRA_API_KEY \ --pages_per_group 100 \ - --model allenai/olmOCR-7B-0725-FP8 \ + --model allenai/olmOCR-7B-0825 \ --markdown \ --pdfs path/to/your/*.pdf ``` - `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai` - `--api_key`: Your DeepInfra API key - `--pages_per_group`: You may want a smaller number of pages per group as many external provides have lower concurrent request limits -- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8` +- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0825` - Other arguments work the same as with local inference From 9feb41af82f40a0358614a896dcbaa6793cb7ea8 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 18:57:16 +0000 Subject: [PATCH 32/40] New docker file approach for vllm 0.11 --- Dockerfile | 88 ++++++++++++++++++++++---------------------------- pyproject.toml | 2 +- 2 files changed, 40 insertions(+), 50 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2ac06b0..515ecd7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,62 +1,52 @@ -ARG CUDA_VERSION=12.8.1 -FROM --platform=linux/amd64 nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 +FROM vllm/vllm-openai:v0.11.0 -# Needs to be repeated below the FROM, or else it's not picked up -ARG PYTHON_VERSION=3.12 -ARG CUDA_VERSION=12.8.1 +ENV 
PYTHON_VERSION=3.12 +ENV CUSTOM_PY="/usr/bin/python${PYTHON_VERSION}" -# Set environment variable to prevent interactive prompts -ENV DEBIAN_FRONTEND=noninteractive - -# From original VLLM dockerfile https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile -# Install Python and other dependencies -RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ - && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ - && apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl sudo python3-apt \ - && for i in 1 2 3; do \ - add-apt-repository -y ppa:deadsnakes/ppa && break || \ - { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ - done \ - && apt-get update -y \ - && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv - -# olmOCR Specific Installs - Install fonts BEFORE changing Python version -RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections && \ +# Workaround for installing fonts, which are needed for good rendering of documents +RUN DIST_PY=$(ls /usr/bin/python3.[0-9]* | sort -V | head -n1) && \ + # If a python alternative scheme already exists, remember its value so we \ + # can restore it later; otherwise, we will restore to CUSTOM_PY when we \ + # are done. \ + if update-alternatives --query python3 >/dev/null 2>&1; then \ + ORIGINAL_PY=$(update-alternatives --query python3 | awk -F": " '/Value:/ {print $2}'); \ + else \ + ORIGINAL_PY=$CUSTOM_PY; \ + fi && \ + # ---- APT operations that require the distro python3 ------------------- \ + echo "Temporarily switching python3 alternative to ${DIST_PY} so that APT scripts use the distro‑built Python runtime." 
&& \ + update-alternatives --install /usr/bin/python3 python3 ${DIST_PY} 1 && \ + update-alternatives --set python3 ${DIST_PY} && \ + update-alternatives --install /usr/bin/python python ${DIST_PY} 1 && \ + update-alternatives --set python ${DIST_PY} && \ apt-get update -y && \ - apt-get install -y --no-install-recommends poppler-utils fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools ttf-mscorefonts-installer - -# Now update Python alternatives -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ - && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ - && update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 \ - && update-alternatives --set python /usr/bin/python${PYTHON_VERSION} \ - && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ - && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ - && python3 --version && python3 -m pip --version - -# Install uv for faster pip installs -RUN --mount=type=cache,target=/root/.cache/uv python3 -m pip install uv - -# Install some helper utilities for things like the benchmark -RUN apt-get update -y && apt-get install -y --no-install-recommends \ - git \ - git-lfs \ - curl \ - wget \ - unzip - -ENV PYTHONUNBUFFERED=1 + apt-get remove -y python3-blinker || true && \ + # Pre‑seed the Microsoft Core Fonts EULA so the build is non‑interactive \ + echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + python3-apt \ + update-notifier-common \ + poppler-utils \ + fonts-crosextra-caladea \ + fonts-crosextra-carlito \ + gsfonts \ + lcdf-typetools \ + ttf-mscorefonts-installer && \ + # ---- Restore the original / custom Python alternative ----------------- \ + echo "Restoring python3 alternative to ${ORIGINAL_PY}" && \ + update-alternatives --install /usr/bin/python3 python3 ${ORIGINAL_PY} 1 && \ + update-alternatives --set python3 ${ORIGINAL_PY} && \ + update-alternatives --install /usr/bin/python python ${ORIGINAL_PY} 1 || true && \ + update-alternatives --set python ${ORIGINAL_PY} || true && \ + # Ensure pip is available for the restored Python \ + curl -sS https://bootstrap.pypa.io/get-pip.py | ${ORIGINAL_PY} # keep the build context clean WORKDIR /build COPY . 
/build - # Needed to resolve setuptools dependencies ENV UV_INDEX_STRATEGY="unsafe-best-match" -RUN uv pip install --system --no-cache ".[gpu]" --extra-index-url https://download.pytorch.org/whl/cu128 -RUN uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl RUN uv pip install --system --no-cache ".[bench]" RUN playwright install-deps diff --git a/pyproject.toml b/pyproject.toml index 163c5eb..3c40b75 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ Changelog = "https://github.com/allenai/olmocr/blob/main/CHANGELOG.md" [project.optional-dependencies] gpu = [ - "vllm==0.10.0" + "vllm==0.11.0" ] dev = [ From 9b517a02be7529f71aca399f589acd42aeeb79c8 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 19:47:19 +0000 Subject: [PATCH 33/40] Git lfs in docker image --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 515ecd7..7db0fea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,7 +31,8 @@ RUN DIST_PY=$(ls /usr/bin/python3.[0-9]* | sort -V | head -n1) && \ fonts-crosextra-carlito \ gsfonts \ lcdf-typetools \ - ttf-mscorefonts-installer && \ + ttf-mscorefonts-installer \ + git git-lfs curl wget unzip && \ # ---- Restore the original / custom Python alternative ----------------- \ echo "Restoring python3 alternative to ${ORIGINAL_PY}" && \ update-alternatives --install /usr/bin/python3 python3 ${ORIGINAL_PY} 1 && \ From 81be6f5c1f4d59511b7ea874408bbf4fc0fc2c19 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 19:52:55 +0000 Subject: [PATCH 34/40] Transformers version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3c40b75..52142a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "boto3", "httpx", "torch>=2.7.0", - "transformers==4.53.2", + "transformers==4.55.2", "img2pdf", "beaker-py", ] From c75f5b98a1d46cf0b75595491faf2c9e67626542 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 20:26:41 +0000 Subject: [PATCH 35/40] Cleaning up pr 341 arguments to match with vllm 0.11, which only has V1 engine and thus always does chunked prefill. And fixes arg syntax --- olmocr/pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 40f72aa..b388c68 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -636,8 +636,7 @@ async def vllm_server_task(model_name_or_path, args, semaphore, unknown_args=Non str(args.tensor_parallel_size), "--data-parallel-size", str(args.data_parallel_size), - "--enable-chunked-prefill", - "--limit-mm-per-prompt '{\"video\": 0}'" + "--limit-mm-per-prompt", "{\"video\": 0}" # Disabling video encoder saves RAM that you can put towards the KV cache, thanks @charitarthchugh ] if args.gpu_memory_utilization is not None: From 1951a849ec57cd8319ec8b88080345f3270c557a Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 21:10:00 +0000 Subject: [PATCH 36/40] Version bump with new vllm --- olmocr/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/olmocr/version.py b/olmocr/version.py index cee5284..02091a9 100644 --- a/olmocr/version.py +++ b/olmocr/version.py @@ -2,7 +2,7 @@ _MAJOR = "0" _MINOR = "3" # On main and in a nightly release the patch should be one ahead of the last # released build. 
-_PATCH = "6" +_PATCH = "7" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = "" From 9c7c670f1fd7d4e07ff02a5500606fb084fe72d1 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 21:10:07 +0000 Subject: [PATCH 37/40] Bump version to v0.3.7 for release --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09fd707..c089fea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +## [v0.3.7](https://github.com/allenai/olmocr/releases/tag/v0.3.7) - 2025-10-06 + ## [v0.3.6](https://github.com/allenai/olmocr/releases/tag/v0.3.6) - 2025-09-29 ## [v0.3.4](https://github.com/allenai/olmocr/releases/tag/v0.3.4) - 2025-08-31 From 7fe756fe6357aedf0beb89055c5c88ea7793a2cd Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 21:10:32 +0000 Subject: [PATCH 38/40] Formatting --- olmocr/pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index b388c68..4d3d1cb 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -636,7 +636,8 @@ async def vllm_server_task(model_name_or_path, args, semaphore, unknown_args=Non str(args.tensor_parallel_size), "--data-parallel-size", str(args.data_parallel_size), - "--limit-mm-per-prompt", "{\"video\": 0}" # Disabling video encoder saves RAM that you can put towards the KV cache, thanks @charitarthchugh + "--limit-mm-per-prompt", + '{"video": 0}', # Disabling video encoder saves RAM that you can put towards the KV cache, thanks @charitarthchugh ] if args.gpu_memory_utilization is not None: From e12941a608c59d2026ec1da1b04bade0b06e68dd Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 21:46:10 +0000 Subject: [PATCH 39/40] Version bump --- olmocr/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/olmocr/version.py b/olmocr/version.py index 02091a9..3bbaff7 100644 --- a/olmocr/version.py +++ b/olmocr/version.py @@ -2,7 +2,7 @@ _MAJOR = "0" _MINOR = "3" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "7" +_PATCH = "8" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = "" From c89787183a1f9b008d5ef643aa8822115ef4dbfd Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 21:46:18 +0000 Subject: [PATCH 40/40] Bump version to v0.3.8 for release --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c089fea..a1d37d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +## [v0.3.8](https://github.com/allenai/olmocr/releases/tag/v0.3.8) - 2025-10-06 + ## [v0.3.7](https://github.com/allenai/olmocr/releases/tag/v0.3.7) - 2025-10-06 ## [v0.3.6](https://github.com/allenai/olmocr/releases/tag/v0.3.6) - 2025-09-29
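
A note on the retry behavior referenced in PATCH 26/40 above: on an HTTP 429 the pipeline raises `ConnectionError` with a "doing exponential backoff" message so that the retry logic wrapping the request in `process_page` can wait before resending the page; that retry loop itself is outside the quoted hunks. The snippet below is only a generic, illustrative sketch of exponential backoff with jitter under that assumption; `call_with_backoff`, `send_request`, and the delay constants are hypothetical names, not olmOCR's actual implementation.

```python
import asyncio
import random


async def call_with_backoff(send_request, max_retries: int = 8, base_delay: float = 1.0, max_delay: float = 60.0):
    """Retry an async request with exponential backoff, e.g. after an HTTP 429.

    `send_request` is a hypothetical zero-argument coroutine function that performs
    one completion request and raises ConnectionError on 429s or transport failures.
    """
    for attempt in range(max_retries):
        try:
            return await send_request()
        except ConnectionError:
            # Sleep 1s, 2s, 4s, ... (scaled by random jitter), capped at max_delay, then retry.
            delay = min(max_delay, base_delay * (2**attempt) * (0.5 + random.random()))
            await asyncio.sleep(delay)
    raise ConnectionError(f"Request still failing after {max_retries} attempts")
```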