diff --git a/.gitignore b/.gitignore index 2a1c30a..cf93ea3 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ olmOCR-bench/* table_data*/ /synth*/ dolma_samples/* +old_train/ /*.html scoreelo.csv debug.log diff --git a/README.md b/README.md index 6e95c54..ee0040b 100644 --- a/README.md +++ b/README.md @@ -61,18 +61,6 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro - - Marker v1.6.2 - 24.3 - 22.1 - 69.8 - 24.3 - 87.1 - 71.0 - 76.9 - 99.5 - 59.4 ± 1.1 - MinerU v1.3.10 75.4 @@ -87,7 +75,7 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro Mistral OCR API - 77.2 + 77.2 67.5 60.6 29.3 @@ -97,6 +85,18 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro 99.4 72.0 ± 1.1 + + Marker v1.7.4 (hybrid) + 77.7 + 71.2 + 78.1 + 32.3 + 83.4 + 73.8 + 79.0 + 99.2 + 74.3 ± 1.1 + olmOCR v0.1.68 (pipeline.py) 75.6 diff --git a/olmocr/bench/README.md b/olmocr/bench/README.md index 8cd0f72..65c29bb 100644 --- a/olmocr/bench/README.md +++ b/olmocr/bench/README.md @@ -37,7 +37,7 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o GOT OCR 52.7 52.0 - 0.2 + 0.20 22.1 93.6 42.0 @@ -46,16 +46,16 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o 48.3 ± 1.1 - Marker v1.6.2 - 24.3 - 22.1 - 69.8 - 24.3 - 87.1 - 71.0 - 76.9 - 99.5 - 59.4 ± 1.1 + Marker v1.7.5 (base) + 76.0 + 57.9 + 57.6 + 27.8 + 84.9 + 72.9 + 84.6 + 99.1 + 70.1 ± 1.1 MinerU v1.3.10 @@ -78,7 +78,7 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o 93.6 71.3 77.1 - 99.4 + 99.4 72.0 ± 1.1 @@ -121,7 +121,7 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o Gemini Flash 2 (Anchored) 54.5 56.1 - 72.1 + 72.1 34.2 64.7 61.5 @@ -157,7 +157,7 @@ to run it against your own OCR tools. 
Your tool just needs to support Markdown o olmOCR v0.1.68 (No Anchor) 72.1 74.7 - 71.5 + 71.5 43.7 91.6 78.5 @@ -288,6 +288,3 @@ We have an internal data annotation tool that can be used to review the question ```bash python -m olmocr.bench.review_app --port 5000 --debug ./olmOCR-bench/bench_data/multi_column.jsonl --force ``` - - - diff --git a/olmocr/bench/runners/run_marker.py b/olmocr/bench/runners/run_marker.py index 58733cd..d444408 100644 --- a/olmocr/bench/runners/run_marker.py +++ b/olmocr/bench/runners/run_marker.py @@ -4,6 +4,7 @@ import tempfile from marker.converters.pdf import PdfConverter from marker.models import create_model_dict from marker.output import text_from_rendered +from marker.config.parser import ConfigParser from pypdf import PdfReader, PdfWriter _marker_converter = None @@ -15,10 +16,22 @@ def run_marker(pdf_path: str, page_num: int = 1) -> str: if _marker_converter is None: # Create a configuration dictionary with the necessary settings config = { - "texify_inline_spans": True, # This enables conversion of inline math to LaTeX + "force_ocr": True, # Force OCR on every page rather than relying on embedded PDF text + "use_llm": False, # Report bench results for plain marker, not the hybrid LLM mode + "disable_tqdm": True, # Disable tqdm for cleaner output + "recognition_batch_size": 256, + "layout_batch_size": 48, + "detection_batch_size": 48, + "equation_batch_size": 64, + "table_rec_batch_size": 48, + "ocr_error_batch_size": 64, } + config_parser = ConfigParser(config) - _marker_converter = PdfConverter(artifact_dict=create_model_dict(), config=config) + _marker_converter = PdfConverter( + artifact_dict=create_model_dict(), + config=config_parser.generate_config_dict(), + ) # Extract the specific page from the PDF pdf_to_process = pdf_path diff --git a/olmocr/bench/tests.py b/olmocr/bench/tests.py index ec87313..320d31a 100644 --- a/olmocr/bench/tests.py +++ b/olmocr/bench/tests.py @@ -123,6 +123,8 @@ def normalize_text(md_content: str) -> str: # Remove markdown bold formatting (** or __ for bold) md_content = re.sub(r"\*\*(.*?)\*\*", r"\1", md_content) md_content = re.sub(r"__(.*?)__", r"\1", md_content) + md_content = re.sub(r"", "", md_content) # Remove tags if they exist + md_content = re.sub(r"", "", md_content) # Remove tags if they exist # Remove markdown italics formatting (* or _ for italics) md_content = re.sub(r"\*(.*?)\*", r"\1", md_content) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 0899d77..410389f 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -329,7 +329,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: async def process_pdf(args, worker_id: int, pdf_orig_path: str): - with tempfile.NamedTemporaryFile("wb+", suffix=".pdf") as tf: + with tempfile.NamedTemporaryFile("wb+", suffix=".pdf", delete=False) as tf: try: data = await asyncio.to_thread(lambda: get_s3_bytes_with_backoff(pdf_s3, pdf_orig_path)) tf.write(data) @@ -347,6 +347,7 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str): tf.write(convert_image_to_pdf_bytes(tf.name)) tf.flush() + try: try: reader = PdfReader(tf.name) num_pages = reader.get_num_pages() @@ -398,6 +399,9 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str): # You can't build a dolma doc with even 1 failed page, so just get out of here # However, you don't want to propagate an exception higher up and cancel the entire work_group return None + finally: + if os.path.exists(tf.name): + os.unlink(tf.name) def
build_dolma_document(pdf_orig_path, page_results): @@ -698,19 +702,31 @@ async def vllm_server_ready(): raise Exception("vllm server did not become ready after waiting.") -async def download_model(model_name_or_path: str): - if model_name_or_path.startswith("s3://") or model_name_or_path.startswith("gs://") or model_name_or_path.startswith("weka://"): - logger.info(f"Downloading model directory from '{model_name_or_path}'") - model_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "olmocr", "model") - download_directory([model_name_or_path], model_cache_dir) - return model_cache_dir - elif os.path.isabs(model_name_or_path) and os.path.isdir(model_name_or_path): - logger.info(f"Using local model path at '{model_name_or_path}'") - return model_name_or_path - else: - logger.info(f"Downloading model with hugging face '{model_name_or_path}'") - snapshot_download(repo_id=model_name_or_path) - return model_name_or_path +async def download_model(model_name_or_path: str, max_retries: int = 5): + for retry in range(max_retries): + try: + if model_name_or_path.startswith("s3://") or model_name_or_path.startswith("gs://") or model_name_or_path.startswith("weka://"): + logger.info(f"Downloading model directory from '{model_name_or_path}'") + model_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "olmocr", "model") + # Delete existing model cache directory if it exists + if os.path.exists(model_cache_dir): + shutil.rmtree(model_cache_dir) + download_directory([model_name_or_path], model_cache_dir) + return model_cache_dir + elif os.path.isabs(model_name_or_path) and os.path.isdir(model_name_or_path): + logger.info(f"Using local model path at '{model_name_or_path}'") + return model_name_or_path + else: + logger.info(f"Downloading model with hugging face '{model_name_or_path}'") + snapshot_download(repo_id=model_name_or_path) + return model_name_or_path + except Exception: + if retry == max_retries - 1: + raise # Raise on final attempt and fail the job + + sleep_time = random.randrange(2, 20) * 2**retry + logger.exception(f"Could not download model, sleeping for {sleep_time} seconds to retry ({retry + 1}/{max_retries})") + await asyncio.sleep(sleep_time) async def metrics_reporter(work_queue): @@ -899,6 +915,7 @@ def print_stats(args, root_work_queue): logger.warning(f"Error processing {s3_path}: {e}") return 0, 0, 0, 0, 0, set(), 0, 0 + print(f"\nCompleted work items {completed_items:,} out of {total_items:,}: {completed_items/total_items*100:.2f}%") print("\nProcessing output files...") docs_total = 0 input_tokens_total = 0 @@ -1026,8 +1043,8 @@ async def main(): # Wait a little bit so that not all beaker jobs in a task start at the same time and download the model at the same time replica_count = int(os.environ.get("BEAKER_REPLICA_COUNT", "1")) - interval = 10 if (replica_count - 1) * 10 <= 240 else 240 / max(1, replica_count - 1) - sleep_time = int(int(os.environ.get("BEAKER_REPLICA_RANK", "0")) * interval) + interval = 10 if (replica_count - 1) * 10 <= 30 else 30 / max(1, replica_count - 1) + sleep_time = int(os.environ.get("BEAKER_REPLICA_RANK", "0")) * interval logger.info(f"Beaker job sleeping for {sleep_time} seconds to stagger model downloads") await asyncio.sleep(sleep_time) diff --git a/scripts/pareto_plot.py b/scripts/pareto_plot.py index d3806df..7c04a70 100644 --- a/scripts/pareto_plot.py +++ b/scripts/pareto_plot.py @@ -64,7 +64,7 @@ data = { "MinerU", "Gemini Flash 2", "Gemini Flash 2 (Batch)", - "Marker v1.6.2", + "Marker v1.7.5", "Ours",
"Qwen 2 VL", "Qwen 2.5 VL", @@ -77,7 +77,7 @@ data = { 61.5, # MinerU 63.8, # Gemini Flash 2 (Anchored) 63.8, # Same performance for batch - 59.4, # marker v1.6.2 + 70.1, # marker v1.7.5 base 77.4, # Ours (performance is the same across hardware) 31.5, # Qwen2VL 65.5, # Qwen2.5VL @@ -94,7 +94,7 @@ model_categories = { "MinerU": "Open Source Tool", "Gemini Flash 2": "Commercial VLM", "Gemini Flash 2 (Batch)": "Commercial VLM", - "Marker v1.6.2": "Open Source Tool", + "Marker v1.7.5": "Open Source Tool", "Ours": "Ours", "Qwen 2 VL": "Open VLM", "Qwen 2.5 VL": "Open VLM", @@ -132,7 +132,7 @@ model_label_offsets = { "MinerU": [-15, -20], "Gemini Flash 2": [-10, 10], "Gemini Flash 2 (Batch)": [-50, -15], - "Marker v1.6.2": [-35, -20], + "Marker v1.7.5": [-20, 15], "Ours": [-20, 10], "Qwen 2 VL": [-35, 10], "Qwen 2.5 VL": [-35, 10], diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index b6b0526..4d00b70 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -104,7 +104,7 @@ except: has_aws_creds = False print(f"AWS credentials secret not found: {aws_creds_secret}") -# Build commands list +# First experiment: Original benchmark job commands = [] if has_aws_creds: commands.extend([ @@ -142,21 +142,71 @@ if has_aws_creds: EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret) ] -# Create experiment spec +# Create first experiment spec experiment_spec = ExperimentSpec( description=f"OlmOCR Benchmark Run - Branch: {git_branch}, Commit: {git_hash}", budget="ai2/oe-data", tasks=[TaskSpec(**task_spec_args)], ) -# Create the experiment +# Create the first experiment experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr") -print(f"Created experiment: {experiment.id}") +print(f"Created benchmark experiment: {experiment.id}") print(f"View at: https://beaker.org/ex/{experiment.id}") +print("-------") +print("") + +# Second experiment: Performance test job +perf_pipeline_cmd = "python -m olmocr.pipeline ./localworkspace --markdown --pdfs s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/*.pdf" +if model: + perf_pipeline_cmd += f" --model {model}" + +perf_commands = [] +if has_aws_creds: + perf_commands.extend([ + "mkdir -p ~/.aws", + 'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials' + ]) +perf_commands.append(perf_pipeline_cmd) + +# Build performance task spec +perf_task_spec_args = { + "name": "olmocr-performance", + "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"), + "command": [ + "bash", "-c", + " && ".join(perf_commands) + ], + "context": TaskContext( + priority=Priority.normal, + preemptible=True, + ), + "resources": TaskResources(gpu_count=1), + "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), + "result": ResultSpec(path="/noop-results"), +} + +# Add env vars if AWS credentials exist +if has_aws_creds: + perf_task_spec_args["env_vars"] = [ + EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret) + ] + +# Create performance experiment spec +perf_experiment_spec = ExperimentSpec( + description=f"OlmOCR Performance Test - Branch: {git_branch}, Commit: {git_hash}", + budget="ai2/oe-data", + tasks=[TaskSpec(**perf_task_spec_args)], +) + +# Create the performance experiment +perf_experiment = b.experiment.create(spec=perf_experiment_spec, workspace="ai2/olmocr") +print(f"Created performance experiment: {perf_experiment.id}") +print(f"View at: https://beaker.org/ex/{perf_experiment.id}") EOF -# Run the Python script to create the experiment -echo "Creating Beaker experiment..." 
+# Run the Python script to create the experiments +echo "Creating Beaker experiments..." if [ -n "$MODEL" ]; then echo "Using model: $MODEL" $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH "$MODEL" @@ -167,4 +217,4 @@ fi # Clean up temporary file rm /tmp/run_benchmark_experiment.py -echo "Benchmark experiment submitted successfully!" \ No newline at end of file +echo "Benchmark experiments submitted successfully!" \ No newline at end of file diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh new file mode 100755 index 0000000..332a3f0 --- /dev/null +++ b/scripts/run_marker_benchmark.sh @@ -0,0 +1,202 @@ +#!/bin/bash + +# Runs marker benchmark, measuring both olmOCR-bench performance and per document processing performance +# ./scripts/run_marker_benchmark.sh +# ./scripts/run_marker_benchmark.sh 1.7.5 + +set -e + +# Parse command line arguments +MARKER_VERSION="${1:-1.7.5}" +echo "Using marker version: $MARKER_VERSION" + +# Check for uncommitted changes +if ! git diff-index --quiet HEAD --; then + echo "Error: There are uncommitted changes in the repository." + echo "Please commit or stash your changes before running the benchmark." + echo "" + echo "Uncommitted changes:" + git status --short + exit 1 +fi + +# Use conda environment Python if available, otherwise use system Python +if [ -n "$CONDA_PREFIX" ]; then + PYTHON="$CONDA_PREFIX/bin/python" + echo "Using conda Python from: $CONDA_PREFIX" +else + PYTHON="python" + echo "Warning: No conda environment detected, using system Python" +fi + +# Get version from version.py +VERSION=$($PYTHON -c 'import olmocr.version; print(olmocr.version.VERSION)') +echo "OlmOCR version: $VERSION" + +# Get first 10 characters of git hash +GIT_HASH=$(git rev-parse HEAD | cut -c1-10) +echo "Git hash: $GIT_HASH" + +# Get current git branch name +GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD) +echo "Git branch: $GIT_BRANCH" + +# Create full image tag +IMAGE_TAG="olmocr-benchmark-${VERSION}-${GIT_HASH}" +echo "Building Docker image with tag: $IMAGE_TAG" + +# Build the Docker image +echo "Building Docker image..." +docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG . + +# Get Beaker username +BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name') +echo "Beaker user: $BEAKER_USER" + +# Push image to beaker +echo "Trying to push image to Beaker..." +if ! beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG 2>/dev/null; then + echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image." 
+fi + +# Create Python script to run beaker experiment +cat << 'EOF' > /tmp/run_benchmark_experiment.py +import sys +from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar + +# Get image tag, beaker user, git branch, git hash, and marker version from command line +image_tag = sys.argv[1] +beaker_user = sys.argv[2] +git_branch = sys.argv[3] +git_hash = sys.argv[4] +marker_version = sys.argv[5] + +# Initialize Beaker client +b = Beaker.from_env(default_workspace="ai2/olmocr") + + +# Check if AWS credentials secret exists +aws_creds_secret = f"{beaker_user}-AWS_CREDENTIALS_FILE" +try: + # Try to get the secret to see if it exists + b.secret.get(aws_creds_secret, workspace="ai2/olmocr") + has_aws_creds = True + print(f"Found AWS credentials secret: {aws_creds_secret}") +except: + has_aws_creds = False + print(f"AWS credentials secret not found: {aws_creds_secret}") + +# First experiment: Original benchmark job +commands = [] +if has_aws_creds: + commands.extend([ + "mkdir -p ~/.aws", + 'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials' + ]) +commands.extend([ + "git clone https://huggingface.co/datasets/allenai/olmOCR-bench", + "cd olmOCR-bench && git lfs pull && cd ..", + f"pip install marker-pdf=={marker_version}", + "pip install --upgrade torchvision", + "python -m olmocr.bench.convert marker --dir ./olmOCR-bench/bench_data", + "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data" +]) + +# Build task spec with optional env vars +task_spec_args = { + "name": "marker-benchmark", + "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"), + "command": [ + "bash", "-c", + " && ".join(commands) + ], + "context": TaskContext( + priority=Priority.normal, + preemptible=True, + ), + "resources": TaskResources(gpu_count=1), + "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), + "result": ResultSpec(path="/noop-results"), +} + +# Add env vars if AWS credentials exist +if has_aws_creds: + task_spec_args["env_vars"] = [ + EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret) + ] + +# Create first experiment spec +experiment_spec = ExperimentSpec( + description=f"Marker {marker_version} Benchmark Run - Branch: {git_branch}, Commit: {git_hash}", + budget="ai2/oe-data", + tasks=[TaskSpec(**task_spec_args)], +) + +# Create the first experiment +experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr") +print(f"Created benchmark experiment: {experiment.id}") +print(f"View at: https://beaker.org/ex/{experiment.id}") +print("-------") +print("") + + +perf_commands = [] +if has_aws_creds: + perf_commands.extend([ + "mkdir -p ~/.aws", + 'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials' + ]) +perf_commands.extend([ + f"pip install marker-pdf=={marker_version}", + "pip install --upgrade torchvision", + "pip install awscli", + "aws s3 cp --recursive s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/ /root/olmOCR-mix-0225_benchmark_set/", + # Tried with workers 8, but it was taking a really huge amount of time + #"time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8" + "time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker" +]) + +# Build performance task spec +perf_task_spec_args = { + "name": "marker-performance", + "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"), + "command": [ + "bash", "-c", + 
" && ".join(perf_commands) + ], + "context": TaskContext( + priority=Priority.normal, + preemptible=True, + ), + "resources": TaskResources(gpu_count=1), + "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), + "result": ResultSpec(path="/noop-results"), +} + +# Add env vars if AWS credentials exist +if has_aws_creds: + perf_task_spec_args["env_vars"] = [ + EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret) + ] + +# Create performance experiment spec +perf_experiment_spec = ExperimentSpec( + description=f"Marker {marker_version} Performance Test - Branch: {git_branch}, Commit: {git_hash}", + budget="ai2/oe-data", + tasks=[TaskSpec(**perf_task_spec_args)], +) + +# Create the performance experiment +perf_experiment = b.experiment.create(spec=perf_experiment_spec, workspace="ai2/olmocr") +print(f"Created performance experiment: {perf_experiment.id}") +print(f"View at: https://beaker.org/ex/{perf_experiment.id}") +EOF + +# Run the Python script to create the experiments +echo "Creating Beaker experiments..." +$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH $MARKER_VERSION + +# Clean up temporary file +rm /tmp/run_benchmark_experiment.py + +echo "Benchmark experiments submitted successfully!" \ No newline at end of file