diff --git a/gantry-requirements.txt b/gantry-requirements.txt
deleted file mode 100644
index 6128533..0000000
--- a/gantry-requirements.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-torchvision
-cached-path
-smart_open
-pypdf
-pypdfium2
-lingua-language-detector
-Pillow
-ruff
-mypy>=1.0,<1.5
-black>=23.0,<24.0
-isort>=5.12,<5.13
-pytest
-pytest-sphinx
-pytest-cov
-twine>=1.11.0
-build
-setuptools
-wheel
-Sphinx>=4.3.0,<7.1.0
-furo==2023.7.26
-myst-parser>=1.0,<2.1
-sphinx-copybutton==0.5.2
-sphinx-autobuild==2021.3.14
-sphinx-autodoc-typehints==1.23.3
-packaging
-necessary
-accelerate>=0.34.2
-datasets==3.0.0
-peft
-wandb
-omegaconf
-s3fs
-transformers>=4.45.1
-bitsandbytes
-ftfy
diff --git a/gantry-train-requirements.txt b/gantry-train-requirements.txt
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/beaker/Dockerfile-inference b/scripts/beaker/Dockerfile-inference
deleted file mode 100644
index c455001..0000000
--- a/scripts/beaker/Dockerfile-inference
+++ /dev/null
@@ -1,49 +0,0 @@
-FROM --platform=linux/amd64 nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
-
-RUN apt-get update -y && apt-get install -y software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa \
-    && apt-get -y update
-
-# Install requirements specific to pdfs
-RUN apt-get update && apt-get -y install python3-apt
-RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
-RUN apt-get update -y && apt-get install -y poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
-
-RUN apt-get update -y && apt-get install -y --no-install-recommends \
-    git \
-    python3.11 \
-    python3.11-dev \
-    python3.11-distutils \
-    ca-certificates \
-    build-essential \
-    curl \
-    unzip
-
-RUN rm -rf /var/lib/apt/lists/* \
-    && unlink /usr/bin/python3 \
-    && ln -s /usr/bin/python3.11 /usr/bin/python3 \
-    && ln -s /usr/bin/python3 /usr/bin/python \
-    && curl -sS https://bootstrap.pypa.io/get-pip.py | python \
-    && pip3 install -U pip
-
-RUN apt-get update && apt-get -y install python3.11-venv
-ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
-RUN /install.sh && rm /install.sh
-
-ENV PYTHONUNBUFFERED=1
-WORKDIR /root
-COPY pyproject.toml pyproject.toml
-COPY olmocr/version.py olmocr/version.py
-
-RUN /root/.local/bin/uv pip install --system --no-cache -e .
-
-RUN /root/.local/bin/uv pip install --system --no-cache sgl-kernel==0.0.3.post1 --force-reinstall --no-deps
-RUN /root/.local/bin/uv pip install --system --no-cache "sglang[all]==0.4.2" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
-
-COPY olmocr olmocr
-
-WORKDIR /root
-COPY olmocr olmocr
-
-RUN python3 -m sglang.launch_server --help
-RUN python3 -m olmocr.pipeline --help
\ No newline at end of file
diff --git a/scripts/beaker/Dockerfile-tagging b/scripts/beaker/Dockerfile-tagging
deleted file mode 100644
index 16c2381..0000000
--- a/scripts/beaker/Dockerfile-tagging
+++ /dev/null
@@ -1,49 +0,0 @@
-FROM --platform=linux/amd64 nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
-
-RUN apt-get update -y && apt-get install -y software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa \
-    && apt-get -y update
-
-# Install requirements specific to pdfs
-RUN apt-get update && apt-get -y install python3-apt
-RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
-RUN apt-get update -y && apt-get install -y poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
-
-RUN apt-get update -y && apt-get install -y --no-install-recommends \
-    git \
-    python3.11 \
-    python3.11-dev \
-    python3.11-distutils \
-    ca-certificates \
-    build-essential \
-    curl \
-    unzip
-
-RUN rm -rf /var/lib/apt/lists/* \
-    && unlink /usr/bin/python3 \
-    && ln -s /usr/bin/python3.11 /usr/bin/python3 \
-    && ln -s /usr/bin/python3 /usr/bin/python \
-    && curl -sS https://bootstrap.pypa.io/get-pip.py | python \
-    && pip3 install -U pip
-
-RUN apt-get update && apt-get -y install python3.11-venv
-ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
-RUN /install.sh && rm /install.sh
-
-ENV PYTHONUNBUFFERED=1
-WORKDIR /root
-COPY pyproject.toml pyproject.toml
-COPY olmocr/version.py olmocr/version.py
-
-RUN /root/.local/bin/uv pip install --system --no-cache -e .
-
-RUN /root/.local/bin/uv pip install --system --no-cache vllm==0.8.2
-
-
-WORKDIR /root
-COPY olmocr olmocr
-COPY scripts scripts
-
-RUN vllm --help
-RUN python3 -m olmocr.pipeline --help
-RUN python scripts/tagging_pipeline.py --help
\ No newline at end of file
diff --git a/scripts/beaker/Dockerfile-train b/scripts/beaker/Dockerfile-train
deleted file mode 100644
index ba6dd11..0000000
--- a/scripts/beaker/Dockerfile-train
+++ /dev/null
@@ -1,26 +0,0 @@
-FROM gcr.io/ai2-beaker-core/public/cqgl31u2ba5vrtuc91og:latest
-
-# Update the package list and install libaio-dev and gnupg2
-RUN apt update && apt-get install -y libaio-dev gnupg2
-
-# Add NVIDIA package repository keys
-RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
-    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub \
-    && apt-get -y update
-
-# Set up the NVIDIA CUDA repository
-RUN apt-get install -y software-properties-common \
-    && add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" \
-    && apt-get update
-
-# Install CUDA toolkit and nvcc 12.1
-RUN apt-get install -y cuda-nvcc-12-1
-
-# Get flash attention setup
-RUN pip install flash-attn --no-build-isolation
-
-# Install PDF utilities
-RUN apt-get install -y poppler-utils
-RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
-RUN apt-get install -y ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
-
diff --git a/scripts/birr/config/qwen2-vl-7b-pdf-weka.yaml b/scripts/birr/config/qwen2-vl-7b-pdf-weka.yaml
deleted file mode 100644
index 17a7aa6..0000000
--- a/scripts/birr/config/qwen2-vl-7b-pdf-weka.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-model:
-  # full fine tune
-  name_or_path: weka://oe-data-default/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/best_bf16/
-  #name_or_path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/
-  vlm: true
-
-  # necessary to prevent random crashes, until vllm fixes some bugs
-  num_scheduler_steps: 1
-
-format:
-  add_generation_prompt: true
-
-generate:
-  # The model's max context length is 8096, but around 1500 tokens are reserved for the image itself
-  max_context_length: 6500
-  temperature: 0.8
-  top_p: 1.0
-  drop_long_outputs: false
-
-
-pipeline:
-  sqs_queue_name: jake-pdf
-  num_workers: 3
-  generation_batch_size: 256
-  tokenization_batch_size: 64
-  output_serializer: default
-  target_bucket: ai2-oe-data
-  target_object_prefix: [your username]/pdfworkspaces/s2orc_3200k_v2/inference_outputs
-  allowed_restarts_per_predictor: 10
-
-task:
-  budget: ai2/oe-data
-  workspace: ai2/oe-data-model-based-cleanup
-  name: qwen2vl-schedsteps-bg
-  replicas: 128
-  priority: LOW
-  gpu_count: 1
-  cluster:
-    - ai2/jupiter-cirrascale-2
-    - ai2/saturn-cirrascale
-
diff --git a/scripts/train/newtrainer-gantry.sh b/scripts/train/newtrainer-gantry.sh
new file mode 100755
index 0000000..71211d4
--- /dev/null
+++ b/scripts/train/newtrainer-gantry.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+#
+# Build the olmOCR Docker image, register it with Beaker, and launch an 8-GPU
+# training job through gantry.
+#
+# Usage: scripts/train/newtrainer-gantry.sh [RUN_NAME]
+#
+# RUN_NAME may also be supplied through the `run_name` environment variable;
+# when neither is given a name is derived from the package version and commit.
+
+set -euo pipefail
+
+# The Dockerfile and gantry-train-requirements.txt are referenced relative to
+# the repository root, so run from there regardless of the caller's directory.
+cd "$(git rev-parse --show-toplevel)"
+
+# Use conda environment Python if available, otherwise use system Python
+if [ -n "${CONDA_PREFIX:-}" ]; then
+    PYTHON="$CONDA_PREFIX/bin/python"
+    echo "Using conda Python from: $CONDA_PREFIX"
+else
+    PYTHON="python"
+    echo "Warning: No conda environment detected, using system Python"
+fi
+
+# Get version from version.py
+VERSION=$("$PYTHON" -c 'import olmocr.version; print(olmocr.version.VERSION)')
+echo "OlmOCR version: $VERSION"
+
+# Get first 10 characters of git hash
+GIT_HASH=$(git rev-parse HEAD | cut -c1-10)
+echo "Git hash: $GIT_HASH"
+
+# Get current git branch name
+GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
+echo "Git branch: $GIT_BRANCH"
+
+# Create full image tag
+IMAGE_TAG="olmocr-benchmark-${VERSION}-${GIT_HASH}"
+echo "Building Docker image with tag: $IMAGE_TAG"
+
+# Name shown in Beaker for this run: first argument, then an exported
+# `run_name`, then a default derived from the version and commit.
+run_name="${1:-${run_name:-olmocr-train-${VERSION}-${GIT_HASH}}}"
+echo "Run name: $run_name"
+
+# Build the Docker image
+echo "Building Docker image..."
+docker build --platform linux/amd64 -f ./Dockerfile -t "$IMAGE_TAG" .
+
+# Get Beaker username
+BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
+echo "Beaker user: $BEAKER_USER"
+
+# Push image to beaker. stderr is left visible so that a failure other than
+# "image already exists" (authentication, network, quota) can be diagnosed.
+echo "Trying to push image to Beaker..."
+if ! beaker image create --workspace ai2/oe-data-pdf --name "$IMAGE_TAG" "$IMAGE_TAG"; then
+    echo "Warning: Could not create Beaker image $IMAGE_TAG (it most likely already exists). Using existing image."
+fi
+
+gantry run \
+    --description "${run_name}" \
+    --task-name "${run_name}" \
+    --allow-dirty \
+    --host-networking \
+    --workspace ai2/olmocr \
+    --beaker-image "$IMAGE_TAG" \
+    --pip gantry-train-requirements.txt \
+    --priority normal \
+    --gpus 8 \
+    --preemptible \
+    --cluster "ai2/jupiter-cirrascale-2" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env BEAKER_USER_ID="$BEAKER_USER" \
+    --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --yes \
+    -- /bin/bash -c "source scripts/beaker/jupiter-ib.sh && python -m olmocr.train.train --config olmocr/train/configs/example_config.yaml"
diff --git a/scripts/qwen25vl-7b-gantry.sh b/scripts/train/qwen25vl-7b-gantry.sh
similarity index 100%
rename from scripts/qwen25vl-7b-gantry.sh
rename to scripts/train/qwen25vl-7b-gantry.sh
diff --git a/scripts/qwen2vl-2b-gantry.sh b/scripts/train/qwen2vl-2b-gantry.sh
similarity index 100%
rename from scripts/qwen2vl-2b-gantry.sh
rename to scripts/train/qwen2vl-2b-gantry.sh
diff --git a/scripts/qwen2vl-7b-gantry.sh b/scripts/train/qwen2vl-7b-gantry.sh
similarity index 100%
rename from scripts/qwen2vl-7b-gantry.sh
rename to scripts/train/qwen2vl-7b-gantry.sh
diff --git a/scripts/qwen2vl-7b-lora-gantry.sh b/scripts/train/qwen2vl-7b-lora-gantry.sh
similarity index 100%
rename from scripts/qwen2vl-7b-lora-gantry.sh
rename to scripts/train/qwen2vl-7b-lora-gantry.sh