From f21ff08c2f9bc0b4e6cc34208416114c71af584d Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 4 Jun 2025 23:10:14 -0700 Subject: [PATCH 01/18] Fix marker benchmarks --- README.md | 26 ++++++++++---------- olmocr/bench/README.md | 38 ++++++++++++++++++++---------- olmocr/bench/runners/run_marker.py | 20 ++++++++++++++-- olmocr/bench/tests.py | 2 ++ scripts/pareto_plot.py | 8 +++---- 5 files changed, 62 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 6e95c54..ee0040b 100644 --- a/README.md +++ b/README.md @@ -61,18 +61,6 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro - - Marker v1.6.2 - 24.3 - 22.1 - 69.8 - 24.3 - 87.1 - 71.0 - 76.9 - 99.5 - 59.4 ± 1.1 - MinerU v1.3.10 75.4 @@ -87,7 +75,7 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro Mistral OCR API - 77.2 + 77.2 67.5 60.6 29.3 @@ -97,6 +85,18 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro 99.4 72.0 ± 1.1 + + Marker v1.7.4 (hybrid) + 77.7 + 71.2 + 78.1 + 32.3 + 83.4 + 73.8 + 79.0 + 99.2 + 74.3 ± 1.1 + olmOCR v0.1.68 (pipeline.py) 75.6 diff --git a/olmocr/bench/README.md b/olmocr/bench/README.md index 8cd0f72..8ab5f26 100644 --- a/olmocr/bench/README.md +++ b/olmocr/bench/README.md @@ -46,16 +46,28 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o 48.3 ± 1.1 - Marker v1.6.2 - 24.3 - 22.1 - 69.8 - 24.3 - 87.1 - 71.0 - 76.9 - 99.5 - 59.4 ± 1.1 + Marker v1.7.4 (base) + 77.7 + 59.6 + 57.9 + 27.8 + 85.3 + 73.5 + 78.7 + 99.1 + 70.0 ± 1.1 + + + Marker v1.7.4 (hybrid) + 77.7 + 71.2 + 78.1 + 32.3 + 83.4 + 73.8 + 79.0 + 99.2 + 74.3 ± 1.1 MinerU v1.3.10 @@ -71,14 +83,14 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o Mistral OCR API - 77.2 + 77.2 67.5 60.6 29.3 93.6 71.3 77.1 - 99.4 + 99.4 72.0 ± 1.1 @@ -121,7 +133,7 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o Gemini Flash 2 (Anchored) 54.5 56.1 - 72.1 + 72.1 34.2 64.7 61.5 diff --git a/olmocr/bench/runners/run_marker.py b/olmocr/bench/runners/run_marker.py index 58733cd..594a03d 100644 --- a/olmocr/bench/runners/run_marker.py +++ b/olmocr/bench/runners/run_marker.py @@ -4,6 +4,7 @@ import tempfile from marker.converters.pdf import PdfConverter from marker.models import create_model_dict from marker.output import text_from_rendered +from marker.config.parser import ConfigParser from pypdf import PdfReader, PdfWriter _marker_converter = None @@ -12,13 +13,28 @@ _marker_converter = None def run_marker(pdf_path: str, page_num: int = 1) -> str: global _marker_converter + google_key_exists = os.getenv("GOOGLE_API_KEY") is not None + if _marker_converter is None: # Create a configuration dictionary with the necessary settings config = { - "texify_inline_spans": True, # This enables conversion of inline math to LaTeX + "format_lines": True, # This enables conversion of inline math to LaTeX + "use_llm": google_key_exists, # Activate LLM mode if google key is specified + "disable_tqdm": True, # Disable tqdm for cleaner output + "recognition_batch_size": 256, + "layout_batch_size": 48, + "detection_batch_size": 48, + "equation_batch_size": 64, + "table_rec_batch_size": 48, + "ocr_error_batch_size": 64, } + config_parser = ConfigParser(config) - _marker_converter = PdfConverter(artifact_dict=create_model_dict(), config=config) + _marker_converter = PdfConverter( + artifact_dict=create_model_dict(), + config=config_parser.generate_config_dict(), + llm_service=config_parser.get_llm_service(), + ) # Extract the specific page from the PDF pdf_to_process = pdf_path diff --git a/olmocr/bench/tests.py b/olmocr/bench/tests.py index ec87313..320d31a 100644 --- a/olmocr/bench/tests.py +++ b/olmocr/bench/tests.py @@ -123,6 +123,8 @@ def normalize_text(md_content: str) -> str: # Remove markdown bold formatting (** or __ for bold) md_content = re.sub(r"\*\*(.*?)\*\*", r"\1", md_content) md_content = re.sub(r"__(.*?)__", r"\1", md_content) + md_content = re.sub(r"", "", md_content) # Remove tags if they exist + md_content = re.sub(r"", "", md_content) # Remove tags if they exist # Remove markdown italics formatting (* or _ for italics) md_content = re.sub(r"\*(.*?)\*", r"\1", md_content) diff --git a/scripts/pareto_plot.py b/scripts/pareto_plot.py index d3806df..a7a2d03 100644 --- a/scripts/pareto_plot.py +++ b/scripts/pareto_plot.py @@ -64,7 +64,7 @@ data = { "MinerU", "Gemini Flash 2", "Gemini Flash 2 (Batch)", - "Marker v1.6.2", + "Marker v1.7.4", "Ours", "Qwen 2 VL", "Qwen 2.5 VL", @@ -77,7 +77,7 @@ data = { 61.5, # MinerU 63.8, # Gemini Flash 2 (Anchored) 63.8, # Same performance for batch - 59.4, # marker v1.6.2 + 74.3, # marker v1.7.4 hybrid 77.4, # Ours (performance is the same across hardware) 31.5, # Qwen2VL 65.5, # Qwen2.5VL @@ -94,7 +94,7 @@ model_categories = { "MinerU": "Open Source Tool", "Gemini Flash 2": "Commercial VLM", "Gemini Flash 2 (Batch)": "Commercial VLM", - "Marker v1.6.2": "Open Source Tool", + "Marker v1.7.4": "Open Source Tool", "Ours": "Ours", "Qwen 2 VL": "Open VLM", "Qwen 2.5 VL": "Open VLM", @@ -132,7 +132,7 @@ model_label_offsets = { "MinerU": [-15, -20], "Gemini Flash 2": [-10, 10], "Gemini Flash 2 (Batch)": [-50, -15], - "Marker v1.6.2": [-35, -20], + "Marker v1.7.4": [-35, -20], "Ours": [-20, 10], "Qwen 2 VL": [-35, 10], "Qwen 2.5 VL": [-35, 10], From 9ffbe8df46de52f1f9eb4bc45186d1254368a71e Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 5 Jun 2025 15:58:19 +0000 Subject: [PATCH 02/18] Adding quick stats percentage done check --- olmocr/pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index f65e4fe..f804a5b 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -909,6 +909,7 @@ def print_stats(args, root_work_queue): logger.warning(f"Error processing {s3_path}: {e}") return 0, 0, 0, 0, 0, set(), 0, 0 + print(f"\nCompleted work items {completed_items:,} out of {total_items:,}: {completed_items/total_items*100:.2f}%") print("\nProcessing output files...") docs_total = 0 input_tokens_total = 0 From 267f52bd79c17538fac845200dd5b5738b543a3b Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 6 Jun 2025 13:47:29 -0400 Subject: [PATCH 03/18] Update marker cost --- olmocr/bench/runners/run_marker.py | 2 +- scripts/pareto_plot.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/olmocr/bench/runners/run_marker.py b/olmocr/bench/runners/run_marker.py index 594a03d..6d15643 100644 --- a/olmocr/bench/runners/run_marker.py +++ b/olmocr/bench/runners/run_marker.py @@ -18,7 +18,7 @@ def run_marker(pdf_path: str, page_num: int = 1) -> str: if _marker_converter is None: # Create a configuration dictionary with the necessary settings config = { - "format_lines": True, # This enables conversion of inline math to LaTeX + "force_ocr": True, # This enables conversion of inline math to LaTeX "use_llm": google_key_exists, # Activate LLM mode if google key is specified "disable_tqdm": True, # Disable tqdm for cleaner output "recognition_batch_size": 256, diff --git a/scripts/pareto_plot.py b/scripts/pareto_plot.py index a7a2d03..9c4bbc7 100644 --- a/scripts/pareto_plot.py +++ b/scripts/pareto_plot.py @@ -69,7 +69,7 @@ data = { "Qwen 2 VL", "Qwen 2.5 VL", ], - COST_COLUMN_NAME: [12480, 6240, 1000, 596, 499, 249, 235, 178, 178, 178], # Same cost as Ours # Same cost as Ours + COST_COLUMN_NAME: [12480, 6240, 1000, 596, 499, 249, 75, 178, 178, 178], # Same cost as Ours # Same cost as Ours PERF_COLUMN_NAME: [ 69.9, # GPT-4o (Anchored) 69.9, # Same performance for batch @@ -77,7 +77,7 @@ data = { 61.5, # MinerU 63.8, # Gemini Flash 2 (Anchored) 63.8, # Same performance for batch - 74.3, # marker v1.7.4 hybrid + 70.0, # marker v1.7.4 base 77.4, # Ours (performance is the same across hardware) 31.5, # Qwen2VL 65.5, # Qwen2.5VL From 02574447202277675c9e85e00e44da86dded115d Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Fri, 6 Jun 2025 18:52:01 +0000 Subject: [PATCH 04/18] Ok, cleaner retry pattern for model downloading --- olmocr/pipeline.py | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index f804a5b..4f7d8f5 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -708,19 +708,31 @@ async def sglang_server_ready(): raise Exception("sglang server did not become ready after waiting.") -async def download_model(model_name_or_path: str): - if model_name_or_path.startswith("s3://") or model_name_or_path.startswith("gs://") or model_name_or_path.startswith("weka://"): - logger.info(f"Downloading model directory from '{model_name_or_path}'") - model_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "olmocr", "model") - download_directory([model_name_or_path], model_cache_dir) - return model_cache_dir - elif os.path.isabs(model_name_or_path) and os.path.isdir(model_name_or_path): - logger.info(f"Using local model path at '{model_name_or_path}'") - return model_name_or_path - else: - logger.info(f"Downloading model with hugging face '{model_name_or_path}'") - snapshot_download(repo_id=model_name_or_path) - return model_name_or_path +async def download_model(model_name_or_path: str, max_retries: int=5): + for retry in range(max_retries): + try: + if model_name_or_path.startswith("s3://") or model_name_or_path.startswith("gs://") or model_name_or_path.startswith("weka://"): + logger.info(f"Downloading model directory from '{model_name_or_path}'") + model_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "olmocr", "model") + # Delete existing model cache directory if it exists + if os.path.exists(model_cache_dir): + shutil.rmtree(model_cache_dir) + download_directory([model_name_or_path], model_cache_dir) + return model_cache_dir + elif os.path.isabs(model_name_or_path) and os.path.isdir(model_name_or_path): + logger.info(f"Using local model path at '{model_name_or_path}'") + return model_name_or_path + else: + logger.info(f"Downloading model with hugging face '{model_name_or_path}'") + snapshot_download(repo_id=model_name_or_path) + return model_name_or_path + except Exception: + if retry == max_retries - 1: + raise # Raise on final attempt and fail the job + + sleep_time = random.randrange(2, 20) * 2**retry + logger.exception(f"Could not download model, sleeping for {sleep_time} seconds to retry ({retry + 1}/{max_retries})") + await asyncio.sleep(random.randrange(10, 30) * 2**retry) async def metrics_reporter(work_queue): @@ -1037,8 +1049,8 @@ async def main(): # Wait a little bit so that not all beaker jobs in a task start at the same time and download the model at the same time replica_count = int(os.environ.get("BEAKER_REPLICA_COUNT", "1")) - interval = 10 if (replica_count - 1) * 10 <= 240 else 240 / max(1, replica_count - 1) - sleep_time = int(int(os.environ.get("BEAKER_REPLICA_RANK", "0")) * interval) + interval = 10 if (replica_count - 1) * 10 <= 30 else 30 / max(1, replica_count - 1) + sleep_time = int(os.environ.get("BEAKER_REPLICA_RANK", "0")) * interval logger.info(f"Beaker job sleeping for {sleep_time} seconds to stagger model downloads") await asyncio.sleep(sleep_time) From cbc4580b72bb4a1613d80aa0d5f3aa6075ad414e Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 12 Jun 2025 17:21:21 +0000 Subject: [PATCH 05/18] Fixing #240 --- olmocr/pipeline.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 4f7d8f5..7ad2b3d 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -329,7 +329,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: async def process_pdf(args, worker_id: int, pdf_orig_path: str): - with tempfile.NamedTemporaryFile("wb+", suffix=".pdf") as tf: + with tempfile.NamedTemporaryFile("wb+", suffix=".pdf", delete=False) as tf: try: data = await asyncio.to_thread(lambda: get_s3_bytes_with_backoff(pdf_s3, pdf_orig_path)) tf.write(data) @@ -347,6 +347,7 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str): tf.write(convert_image_to_pdf_bytes(tf.name)) tf.flush() + try: try: reader = PdfReader(tf.name) num_pages = reader.get_num_pages() @@ -398,7 +399,9 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str): # You can't build a dolma doc with even 1 failed page, so just get out of here # However, you don't want to propagate an exception higher up and cancel the entire work_group return None - + finally: + if os.path.exists(tf.name): + os.unlink(tf.name) def build_dolma_document(pdf_orig_path, page_results): # Build the document text and page spans From af7aaef60520e2e2a4241d09f839ee100cae8695 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 12 Jun 2025 20:07:17 +0000 Subject: [PATCH 06/18] Run marker script --- olmocr/bench/runners/run_marker.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/olmocr/bench/runners/run_marker.py b/olmocr/bench/runners/run_marker.py index 6d15643..d444408 100644 --- a/olmocr/bench/runners/run_marker.py +++ b/olmocr/bench/runners/run_marker.py @@ -13,13 +13,11 @@ _marker_converter = None def run_marker(pdf_path: str, page_num: int = 1) -> str: global _marker_converter - google_key_exists = os.getenv("GOOGLE_API_KEY") is not None - if _marker_converter is None: # Create a configuration dictionary with the necessary settings config = { "force_ocr": True, # This enables conversion of inline math to LaTeX - "use_llm": google_key_exists, # Activate LLM mode if google key is specified + "use_llm": False, # We would prefer to run just plain marker for reporting bench results, not hybrid mode "disable_tqdm": True, # Disable tqdm for cleaner output "recognition_batch_size": 256, "layout_batch_size": 48, @@ -33,7 +31,6 @@ def run_marker(pdf_path: str, page_num: int = 1) -> str: _marker_converter = PdfConverter( artifact_dict=create_model_dict(), config=config_parser.generate_config_dict(), - llm_service=config_parser.get_llm_service(), ) # Extract the specific page from the PDF From 9787d007b99c6413a522b9ed9c0e961951b39dc8 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 12 Jun 2025 21:02:46 +0000 Subject: [PATCH 07/18] Pulling in bigger benchmark script from vllm branch to main --- scripts/run_benchmark.sh | 190 ++++++++++++++++++++++++++++++++------- 1 file changed, 156 insertions(+), 34 deletions(-) diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index e7667b0..4d00b70 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -1,7 +1,39 @@ #!/bin/bash +# Runs an olmocr-bench run using the full pipeline (no fallback) +# Without model parameter (default behavior):, uses the default image from hugging face +# ./scripts/run_benchmark.sh +# With model parameter: for testing custom models +# ./scripts/run_benchmark.sh --model your-model-name + set -e +# Parse command line arguments +MODEL="" +while [[ $# -gt 0 ]]; do + case $1 in + --model) + MODEL="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + echo "Usage: $0 [--model MODEL_NAME]" + exit 1 + ;; + esac +done + +# Check for uncommitted changes +if ! git diff-index --quiet HEAD --; then + echo "Error: There are uncommitted changes in the repository." + echo "Please commit or stash your changes before running the benchmark." + echo "" + echo "Uncommitted changes:" + git status --short + exit 1 +fi + # Use conda environment Python if available, otherwise use system Python if [ -n "$CONDA_PREFIX" ]; then PYTHON="$CONDA_PREFIX/bin/python" @@ -36,63 +68,153 @@ BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name') echo "Beaker user: $BEAKER_USER" # Push image to beaker -echo "Pushing image to Beaker..." -beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG +echo "Trying to push image to Beaker..." +if ! beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG 2>/dev/null; then + echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image." +fi # Create Python script to run beaker experiment cat << 'EOF' > /tmp/run_benchmark_experiment.py import sys -from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints +from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar -# Get image tag, beaker user, git branch, and git hash from command line +# Get image tag, beaker user, git branch, git hash, and optional model from command line image_tag = sys.argv[1] beaker_user = sys.argv[2] git_branch = sys.argv[3] git_hash = sys.argv[4] +model = sys.argv[5] if len(sys.argv) > 5 else None # Initialize Beaker client b = Beaker.from_env(default_workspace="ai2/olmocr") -# Create experiment spec +# Build the pipeline command with optional model parameter +pipeline_cmd = "python -m olmocr.pipeline ./localworkspace --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/**/*.pdf" +if model: + pipeline_cmd += f" --model {model}" + +# Check if AWS credentials secret exists +aws_creds_secret = f"{beaker_user}-AWS_CREDENTIALS_FILE" +try: + # Try to get the secret to see if it exists + b.secret.get(aws_creds_secret, workspace="ai2/olmocr") + has_aws_creds = True + print(f"Found AWS credentials secret: {aws_creds_secret}") +except: + has_aws_creds = False + print(f"AWS credentials secret not found: {aws_creds_secret}") + +# First experiment: Original benchmark job +commands = [] +if has_aws_creds: + commands.extend([ + "mkdir -p ~/.aws", + 'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials' + ]) +commands.extend([ + "git clone https://huggingface.co/datasets/allenai/olmOCR-bench", + "cd olmOCR-bench && git lfs pull && cd ..", + pipeline_cmd, + "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/", + "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data" +]) + +# Build task spec with optional env vars +task_spec_args = { + "name": "olmocr-benchmark", + "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"), + "command": [ + "bash", "-c", + " && ".join(commands) + ], + "context": TaskContext( + priority=Priority.normal, + preemptible=True, + ), + "resources": TaskResources(gpu_count=1), + "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), + "result": ResultSpec(path="/noop-results"), +} + +# Add env vars if AWS credentials exist +if has_aws_creds: + task_spec_args["env_vars"] = [ + EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret) + ] + +# Create first experiment spec experiment_spec = ExperimentSpec( description=f"OlmOCR Benchmark Run - Branch: {git_branch}, Commit: {git_hash}", budget="ai2/oe-data", - tasks=[ - TaskSpec( - name="olmocr-benchmark", - image=ImageSource(beaker=f"{beaker_user}/{image_tag}"), - command=[ - "bash", "-c", - " && ".join([ - "git clone https://huggingface.co/datasets/allenai/olmOCR-bench", - "cd olmOCR-bench && git lfs pull && cd ..", - "python -m olmocr.pipeline ./localworkspace --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/**/*.pdf", - "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/", - "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data" - ]) - ], - context=TaskContext( - priority=Priority.normal, - preemptible=True, - ), - resources=TaskResources(gpu_count=1), - constraints=Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), - result=ResultSpec(path="/noop-results"), - ) - ], + tasks=[TaskSpec(**task_spec_args)], ) -# Create the experiment +# Create the first experiment experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr") -print(f"Created experiment: {experiment.id}") +print(f"Created benchmark experiment: {experiment.id}") print(f"View at: https://beaker.org/ex/{experiment.id}") +print("-------") +print("") + +# Second experiment: Performance test job +perf_pipeline_cmd = "python -m olmocr.pipeline ./localworkspace --markdown --pdfs s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/*.pdf" +if model: + perf_pipeline_cmd += f" --model {model}" + +perf_commands = [] +if has_aws_creds: + perf_commands.extend([ + "mkdir -p ~/.aws", + 'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials' + ]) +perf_commands.append(perf_pipeline_cmd) + +# Build performance task spec +perf_task_spec_args = { + "name": "olmocr-performance", + "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"), + "command": [ + "bash", "-c", + " && ".join(perf_commands) + ], + "context": TaskContext( + priority=Priority.normal, + preemptible=True, + ), + "resources": TaskResources(gpu_count=1), + "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), + "result": ResultSpec(path="/noop-results"), +} + +# Add env vars if AWS credentials exist +if has_aws_creds: + perf_task_spec_args["env_vars"] = [ + EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret) + ] + +# Create performance experiment spec +perf_experiment_spec = ExperimentSpec( + description=f"OlmOCR Performance Test - Branch: {git_branch}, Commit: {git_hash}", + budget="ai2/oe-data", + tasks=[TaskSpec(**perf_task_spec_args)], +) + +# Create the performance experiment +perf_experiment = b.experiment.create(spec=perf_experiment_spec, workspace="ai2/olmocr") +print(f"Created performance experiment: {perf_experiment.id}") +print(f"View at: https://beaker.org/ex/{perf_experiment.id}") EOF -# Run the Python script to create the experiment -echo "Creating Beaker experiment..." -$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH +# Run the Python script to create the experiments +echo "Creating Beaker experiments..." +if [ -n "$MODEL" ]; then + echo "Using model: $MODEL" + $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH "$MODEL" +else + $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH +fi # Clean up temporary file rm /tmp/run_benchmark_experiment.py -echo "Benchmark experiment submitted successfully!" \ No newline at end of file +echo "Benchmark experiments submitted successfully!" \ No newline at end of file From 044874a634a5a65942d67aa182789175cc240bc9 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 12 Jun 2025 21:12:58 +0000 Subject: [PATCH 08/18] Adding marker benchmark --- scripts/run_marker_benchmark.sh | 197 ++++++++++++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 scripts/run_marker_benchmark.sh diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh new file mode 100644 index 0000000..a21d4a9 --- /dev/null +++ b/scripts/run_marker_benchmark.sh @@ -0,0 +1,197 @@ +#!/bin/bash + +# Runs marker benchmark, measuring both olmOCR-bench performance and per document processing performance +# ./scripts/run_marker_benchmark.sh +# ./scripts/run_marker_benchmark.sh 1.7.5 + +set -e + +# Parse command line arguments +MARKER_VERSION="${1:-1.7.5}" +echo "Using marker version: $MARKER_VERSION" + +# Check for uncommitted changes +if ! git diff-index --quiet HEAD --; then + echo "Error: There are uncommitted changes in the repository." + echo "Please commit or stash your changes before running the benchmark." + echo "" + echo "Uncommitted changes:" + git status --short + exit 1 +fi + +# Use conda environment Python if available, otherwise use system Python +if [ -n "$CONDA_PREFIX" ]; then + PYTHON="$CONDA_PREFIX/bin/python" + echo "Using conda Python from: $CONDA_PREFIX" +else + PYTHON="python" + echo "Warning: No conda environment detected, using system Python" +fi + +# Get version from version.py +VERSION=$($PYTHON -c 'import olmocr.version; print(olmocr.version.VERSION)') +echo "OlmOCR version: $VERSION" + +# Get first 10 characters of git hash +GIT_HASH=$(git rev-parse HEAD | cut -c1-10) +echo "Git hash: $GIT_HASH" + +# Get current git branch name +GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD) +echo "Git branch: $GIT_BRANCH" + +# Create full image tag +IMAGE_TAG="olmocr-benchmark-${VERSION}-${GIT_HASH}" +echo "Building Docker image with tag: $IMAGE_TAG" + +# Build the Docker image +echo "Building Docker image..." +docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG . + +# Get Beaker username +BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name') +echo "Beaker user: $BEAKER_USER" + +# Push image to beaker +echo "Trying to push image to Beaker..." +if ! beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG 2>/dev/null; then + echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image." +fi + +# Create Python script to run beaker experiment +cat << 'EOF' > /tmp/run_benchmark_experiment.py +import sys +from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar + +# Get image tag, beaker user, git branch, git hash, and marker version from command line +image_tag = sys.argv[1] +beaker_user = sys.argv[2] +git_branch = sys.argv[3] +git_hash = sys.argv[4] +marker_version = sys.argv[5] + +# Initialize Beaker client +b = Beaker.from_env(default_workspace="ai2/olmocr") + + +# Check if AWS credentials secret exists +aws_creds_secret = f"{beaker_user}-AWS_CREDENTIALS_FILE" +try: + # Try to get the secret to see if it exists + b.secret.get(aws_creds_secret, workspace="ai2/olmocr") + has_aws_creds = True + print(f"Found AWS credentials secret: {aws_creds_secret}") +except: + has_aws_creds = False + print(f"AWS credentials secret not found: {aws_creds_secret}") + +# First experiment: Original benchmark job +commands = [] +if has_aws_creds: + commands.extend([ + "mkdir -p ~/.aws", + 'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials' + ]) +commands.extend([ + "git clone https://huggingface.co/datasets/allenai/olmOCR-bench", + "cd olmOCR-bench && git lfs pull && cd ..", + f"pip install marker=={marker_version}", + "python -m olmocr.bench.convert marker --dir ./olmOCR-bench/bench_data", + "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data" +]) + +# Build task spec with optional env vars +task_spec_args = { + "name": "marker-benchmark", + "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"), + "command": [ + "bash", "-c", + " && ".join(commands) + ], + "context": TaskContext( + priority=Priority.normal, + preemptible=True, + ), + "resources": TaskResources(gpu_count=1), + "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), + "result": ResultSpec(path="/noop-results"), +} + +# Add env vars if AWS credentials exist +if has_aws_creds: + task_spec_args["env_vars"] = [ + EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret) + ] + +# Create first experiment spec +experiment_spec = ExperimentSpec( + description=f"Marker {marker_version} Benchmark Run - Branch: {git_branch}, Commit: {git_hash}", + budget="ai2/oe-data", + tasks=[TaskSpec(**task_spec_args)], +) + +# Create the first experiment +experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr") +print(f"Created benchmark experiment: {experiment.id}") +print(f"View at: https://beaker.org/ex/{experiment.id}") +print("-------") +print("") + + +perf_commands = [] +if has_aws_creds: + perf_commands.extend([ + "mkdir -p ~/.aws", + 'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials' + ]) +perf_commands.extend([ + f"pip install marker=={marker_version}", + "s5cmd cp s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/* /root/olmOCR-mix-0225_benchmark_set/", + "marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8" +]) + +# Build performance task spec +perf_task_spec_args = { + "name": "marker-performance", + "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"), + "command": [ + "bash", "-c", + " && ".join(perf_commands) + ], + "context": TaskContext( + priority=Priority.normal, + preemptible=True, + ), + "resources": TaskResources(gpu_count=1), + "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), + "result": ResultSpec(path="/noop-results"), +} + +# Add env vars if AWS credentials exist +if has_aws_creds: + perf_task_spec_args["env_vars"] = [ + EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret) + ] + +# Create performance experiment spec +perf_experiment_spec = ExperimentSpec( + description=f"Marker {marker_version} Performance Test - Branch: {git_branch}, Commit: {git_hash}", + budget="ai2/oe-data", + tasks=[TaskSpec(**perf_task_spec_args)], +) + +# Create the performance experiment +perf_experiment = b.experiment.create(spec=perf_experiment_spec, workspace="ai2/olmocr") +print(f"Created performance experiment: {perf_experiment.id}") +print(f"View at: https://beaker.org/ex/{perf_experiment.id}") +EOF + +# Run the Python script to create the experiments +echo "Creating Beaker experiments..." +$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH $MARKER_VERSION + +# Clean up temporary file +rm /tmp/run_benchmark_experiment.py + +echo "Benchmark experiments submitted successfully!" \ No newline at end of file From f8dfd857652017d3eee75d6e22936989b4599d93 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 12 Jun 2025 21:13:31 +0000 Subject: [PATCH 09/18] Script --- scripts/run_marker_benchmark.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/run_marker_benchmark.sh diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh old mode 100644 new mode 100755 From 548187902b7fbde45553266dbedd52703e4c0304 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 12 Jun 2025 21:14:00 +0000 Subject: [PATCH 10/18] Ignore --- .gitignore | 1 + olmocr/pipeline.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 2a1c30a..cf93ea3 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ olmOCR-bench/* table_data*/ /synth*/ dolma_samples/* +old_train/ /*.html scoreelo.csv debug.log diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 7ad2b3d..e869316 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -403,6 +403,7 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str): if os.path.exists(tf.name): os.unlink(tf.name) + def build_dolma_document(pdf_orig_path, page_results): # Build the document text and page spans document_text = "" @@ -711,7 +712,7 @@ async def sglang_server_ready(): raise Exception("sglang server did not become ready after waiting.") -async def download_model(model_name_or_path: str, max_retries: int=5): +async def download_model(model_name_or_path: str, max_retries: int = 5): for retry in range(max_retries): try: if model_name_or_path.startswith("s3://") or model_name_or_path.startswith("gs://") or model_name_or_path.startswith("weka://"): @@ -731,7 +732,7 @@ async def download_model(model_name_or_path: str, max_retries: int=5): return model_name_or_path except Exception: if retry == max_retries - 1: - raise # Raise on final attempt and fail the job + raise # Raise on final attempt and fail the job sleep_time = random.randrange(2, 20) * 2**retry logger.exception(f"Could not download model, sleeping for {sleep_time} seconds to retry ({retry + 1}/{max_retries})") From 4bfcfce7672ba2a2fb973574b66a4a67048abb04 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 12 Jun 2025 21:18:58 +0000 Subject: [PATCH 11/18] Actually install the right thing --- scripts/run_marker_benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh index a21d4a9..9c559d3 100755 --- a/scripts/run_marker_benchmark.sh +++ b/scripts/run_marker_benchmark.sh @@ -96,7 +96,7 @@ if has_aws_creds: commands.extend([ "git clone https://huggingface.co/datasets/allenai/olmOCR-bench", "cd olmOCR-bench && git lfs pull && cd ..", - f"pip install marker=={marker_version}", + f"pip install marker-pdf=={marker_version}", "python -m olmocr.bench.convert marker --dir ./olmOCR-bench/bench_data", "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data" ]) @@ -146,7 +146,7 @@ if has_aws_creds: 'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials' ]) perf_commands.extend([ - f"pip install marker=={marker_version}", + f"pip install marker-pdf=={marker_version}", "s5cmd cp s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/* /root/olmOCR-mix-0225_benchmark_set/", "marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8" ]) From 0f3b45c1a38f0cfec596d31610f6df377a77cae5 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 12 Jun 2025 21:19:17 +0000 Subject: [PATCH 12/18] Add time --- scripts/run_marker_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh index 9c559d3..e8178a5 100755 --- a/scripts/run_marker_benchmark.sh +++ b/scripts/run_marker_benchmark.sh @@ -148,7 +148,7 @@ if has_aws_creds: perf_commands.extend([ f"pip install marker-pdf=={marker_version}", "s5cmd cp s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/* /root/olmOCR-mix-0225_benchmark_set/", - "marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8" + "time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8" ]) # Build performance task spec From 59e0a1ccb0bae5efa45e23d295ee5fa477743958 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 12 Jun 2025 21:23:53 +0000 Subject: [PATCH 13/18] Marker wants newer torchvision --- scripts/run_marker_benchmark.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh index e8178a5..3a84b0e 100755 --- a/scripts/run_marker_benchmark.sh +++ b/scripts/run_marker_benchmark.sh @@ -97,6 +97,7 @@ commands.extend([ "git clone https://huggingface.co/datasets/allenai/olmOCR-bench", "cd olmOCR-bench && git lfs pull && cd ..", f"pip install marker-pdf=={marker_version}", + "pip install --upgrade torchvision", "python -m olmocr.bench.convert marker --dir ./olmOCR-bench/bench_data", "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data" ]) @@ -147,6 +148,7 @@ if has_aws_creds: ]) perf_commands.extend([ f"pip install marker-pdf=={marker_version}", + "pip install --upgrade torchvision", "s5cmd cp s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/* /root/olmOCR-mix-0225_benchmark_set/", "time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8" ]) From fc06797bec8c1a238e8f0fe81e3530f3dbe6c9f3 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 12 Jun 2025 21:29:39 +0000 Subject: [PATCH 14/18] aws cli --- scripts/run_marker_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh index 3a84b0e..ad56e66 100755 --- a/scripts/run_marker_benchmark.sh +++ b/scripts/run_marker_benchmark.sh @@ -149,7 +149,7 @@ if has_aws_creds: perf_commands.extend([ f"pip install marker-pdf=={marker_version}", "pip install --upgrade torchvision", - "s5cmd cp s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/* /root/olmOCR-mix-0225_benchmark_set/", + "aws s3 cp --recursive s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/ /root/olmOCR-mix-0225_benchmark_set/", "time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8" ]) From fcd8bbec92182caea6bc434b3471fe31e130ed63 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 12 Jun 2025 21:38:28 +0000 Subject: [PATCH 15/18] Install aws cli --- scripts/run_marker_benchmark.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh index ad56e66..05d1179 100755 --- a/scripts/run_marker_benchmark.sh +++ b/scripts/run_marker_benchmark.sh @@ -149,6 +149,7 @@ if has_aws_creds: perf_commands.extend([ f"pip install marker-pdf=={marker_version}", "pip install --upgrade torchvision", + "pip install awscli", "aws s3 cp --recursive s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/ /root/olmOCR-mix-0225_benchmark_set/", "time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8" ]) From 3da6e2d58799a1eb2b24ed1fba24d2becb4fe811 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 12 Jun 2025 22:23:41 +0000 Subject: [PATCH 16/18] Pareto plot update, keep cost the same for now --- scripts/pareto_plot.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/pareto_plot.py b/scripts/pareto_plot.py index 9c4bbc7..7c04a70 100644 --- a/scripts/pareto_plot.py +++ b/scripts/pareto_plot.py @@ -64,12 +64,12 @@ data = { "MinerU", "Gemini Flash 2", "Gemini Flash 2 (Batch)", - "Marker v1.7.4", + "Marker v1.7.5", "Ours", "Qwen 2 VL", "Qwen 2.5 VL", ], - COST_COLUMN_NAME: [12480, 6240, 1000, 596, 499, 249, 75, 178, 178, 178], # Same cost as Ours # Same cost as Ours + COST_COLUMN_NAME: [12480, 6240, 1000, 596, 499, 249, 235, 178, 178, 178], # Same cost as Ours # Same cost as Ours PERF_COLUMN_NAME: [ 69.9, # GPT-4o (Anchored) 69.9, # Same performance for batch @@ -77,7 +77,7 @@ data = { 61.5, # MinerU 63.8, # Gemini Flash 2 (Anchored) 63.8, # Same performance for batch - 70.0, # marker v1.7.4 base + 70.1, # marker v1.7.5 base 77.4, # Ours (performance is the same across hardware) 31.5, # Qwen2VL 65.5, # Qwen2.5VL @@ -94,7 +94,7 @@ model_categories = { "MinerU": "Open Source Tool", "Gemini Flash 2": "Commercial VLM", "Gemini Flash 2 (Batch)": "Commercial VLM", - "Marker v1.7.4": "Open Source Tool", + "Marker v1.7.5": "Open Source Tool", "Ours": "Ours", "Qwen 2 VL": "Open VLM", "Qwen 2.5 VL": "Open VLM", @@ -132,7 +132,7 @@ model_label_offsets = { "MinerU": [-15, -20], "Gemini Flash 2": [-10, 10], "Gemini Flash 2 (Batch)": [-50, -15], - "Marker v1.7.4": [-35, -20], + "Marker v1.7.5": [-20, 15], "Ours": [-20, 10], "Qwen 2 VL": [-35, 10], "Qwen 2.5 VL": [-35, 10], From f273de6e6ec36c1c75eed6ec9935bc3df09987f2 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 12 Jun 2025 15:32:09 -0700 Subject: [PATCH 17/18] Update README.md Updating to v.1.7.5 marker that I ran locally with base only for now --- olmocr/bench/README.md | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/olmocr/bench/README.md b/olmocr/bench/README.md index 8ab5f26..65c29bb 100644 --- a/olmocr/bench/README.md +++ b/olmocr/bench/README.md @@ -37,7 +37,7 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o GOT OCR 52.7 52.0 - 0.2 + 0.20 22.1 93.6 42.0 @@ -46,28 +46,16 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o 48.3 ± 1.1 - Marker v1.7.4 (base) - 77.7 - 59.6 + Marker v1.7.5 (base) + 76.0 57.9 + 57.6 27.8 - 85.3 - 73.5 - 78.7 + 84.9 + 72.9 + 84.6 99.1 - 70.0 ± 1.1 - - - Marker v1.7.4 (hybrid) - 77.7 - 71.2 - 78.1 - 32.3 - 83.4 - 73.8 - 79.0 - 99.2 - 74.3 ± 1.1 + 70.1 ± 1.1 MinerU v1.3.10 @@ -83,7 +71,7 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o Mistral OCR API - 77.2 + 77.2 67.5 60.6 29.3 @@ -169,7 +157,7 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o olmOCR v0.1.68 (No Anchor) 72.1 74.7 - 71.5 + 71.5 43.7 91.6 78.5 @@ -300,6 +288,3 @@ We have an internal data annotation tool that can be used to review the question ```bash python -m olmocr.bench.review_app --port 5000 --debug ./olmOCR-bench/bench_data/multi_column.jsonl --force ``` - - - From 37090e2801e489c57b0f445acde3ce795352344f Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 12 Jun 2025 22:35:08 +0000 Subject: [PATCH 18/18] Go back to workers 1 in marker test script --- scripts/run_marker_benchmark.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh index 05d1179..332a3f0 100755 --- a/scripts/run_marker_benchmark.sh +++ b/scripts/run_marker_benchmark.sh @@ -151,7 +151,9 @@ perf_commands.extend([ "pip install --upgrade torchvision", "pip install awscli", "aws s3 cp --recursive s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/ /root/olmOCR-mix-0225_benchmark_set/", - "time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8" + # Tried with workers 8, but it was taking a really huge amount of time + #"time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8" + "time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker" ]) # Build performance task spec