mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-02 11:04:25 +00:00
New trainer launch script cleanups
This commit is contained in:
parent
91e7b5ce3f
commit
d7e5037192
@ -1,35 +0,0 @@
|
||||
# Requirements installed into the gantry training job.
# Order matches the original list; the blank lines and headings are only for readability.

# Runtime libraries
torchvision
cached-path
smart_open
pypdf
pypdfium2
lingua-language-detector
Pillow

# Lint, type-check, and format
ruff
mypy>=1.0,<1.5
black>=23.0,<24.0
isort>=5.12,<5.13

# Tests
pytest
pytest-sphinx
pytest-cov

# Build and publish
twine>=1.11.0
build
setuptools
wheel

# Docs (Sphinx, theme, and extensions)
Sphinx>=4.3.0,<7.1.0
furo==2023.7.26
myst-parser>=1.0,<2.1
sphinx-copybutton==0.5.2
sphinx-autobuild==2021.3.14
sphinx-autodoc-typehints==1.23.3

# Training-time dependencies
packaging
necessary
accelerate>=0.34.2
datasets==3.0.0
peft
wandb
omegaconf
s3fs
transformers>=4.45.1
bitsandbytes
ftfy
|
||||
0
gantry-train-requirements.txt
Normal file
0
gantry-train-requirements.txt
Normal file
@ -1,49 +0,0 @@
|
||||
# Image that installs olmocr together with sglang on the CUDA 11.8 runtime (Ubuntu 20.04).
FROM --platform=linux/amd64 nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04

# Make piped RUN commands (e.g. `curl ... | python`) fail when any stage of the pipe fails.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Register the deadsnakes PPA; python3.11 is installed through apt further down.
# Every apt layer runs its own `apt-get update` and removes the lists in the same
# layer, so no layer depends on (or ships) a stale package index.
RUN apt-get update -y \
    && apt-get install -y software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && rm -rf /var/lib/apt/lists/*

# Install requirements specific to pdfs.
# The debconf preseed accepts the mscorefonts EULA so the font installer does not prompt.
RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y \
        python3-apt \
        poppler-utils \
        ttf-mscorefonts-installer \
        msttcorefonts \
        fonts-crosextra-caladea \
        fonts-crosextra-carlito \
        gsfonts \
        lcdf-typetools \
    && rm -rf /var/lib/apt/lists/*

# Python 3.11 and build tooling. python3.11-venv is installed with its recommended
# packages, exactly as before, hence the separate install command.
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        git \
        python3.11 \
        python3.11-dev \
        python3.11-distutils \
        unzip \
    && apt-get install -y python3.11-venv \
    && rm -rf /var/lib/apt/lists/*

# Point python3/python at 3.11 and bootstrap pip for it.
# `curl -f` turns HTTP errors into a failed step instead of piping an error page to python.
RUN unlink /usr/bin/python3 \
    && ln -s /usr/bin/python3.11 /usr/bin/python3 \
    && ln -s /usr/bin/python3 /usr/bin/python \
    && curl -fsS https://bootstrap.pypa.io/get-pip.py | python \
    && pip3 install --no-cache-dir -U pip

# Install uv.
# NOTE(review): the installer is fetched unpinned and without `--checksum`; pin it if
# reproducible builds matter.
ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
RUN /install.sh && rm /install.sh

ENV PYTHONUNBUFFERED=1
WORKDIR /root

# Copy only the packaging metadata first so the dependency layers below stay cached
# when the rest of the source tree changes.
COPY pyproject.toml pyproject.toml
COPY olmocr/version.py olmocr/version.py

RUN /root/.local/bin/uv pip install --system --no-cache -e .

RUN /root/.local/bin/uv pip install --system --no-cache sgl-kernel==0.0.3.post1 --force-reinstall --no-deps
RUN /root/.local/bin/uv pip install --system --no-cache "sglang[all]==0.4.2" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/

# Full package source (previously copied twice; once is enough).
COPY olmocr olmocr

# Smoke tests: fail the build if either entry point cannot be imported and started.
RUN python3 -m sglang.launch_server --help
RUN python3 -m olmocr.pipeline --help
|
||||
@ -1,49 +0,0 @@
|
||||
# Image that installs olmocr together with vllm on the CUDA 11.8 runtime (Ubuntu 20.04).
FROM --platform=linux/amd64 nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04

# Make piped RUN commands (e.g. `curl ... | python`) fail when any stage of the pipe fails.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Register the deadsnakes PPA; python3.11 is installed through apt further down.
# Every apt layer runs its own `apt-get update` and removes the lists in the same
# layer, so no layer depends on (or ships) a stale package index.
RUN apt-get update -y \
    && apt-get install -y software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && rm -rf /var/lib/apt/lists/*

# Install requirements specific to pdfs.
# The debconf preseed accepts the mscorefonts EULA so the font installer does not prompt.
RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y \
        python3-apt \
        poppler-utils \
        ttf-mscorefonts-installer \
        msttcorefonts \
        fonts-crosextra-caladea \
        fonts-crosextra-carlito \
        gsfonts \
        lcdf-typetools \
    && rm -rf /var/lib/apt/lists/*

# Python 3.11 and build tooling. python3.11-venv is installed with its recommended
# packages, exactly as before, hence the separate install command.
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        git \
        python3.11 \
        python3.11-dev \
        python3.11-distutils \
        unzip \
    && apt-get install -y python3.11-venv \
    && rm -rf /var/lib/apt/lists/*

# Point python3/python at 3.11 and bootstrap pip for it.
# `curl -f` turns HTTP errors into a failed step instead of piping an error page to python.
RUN unlink /usr/bin/python3 \
    && ln -s /usr/bin/python3.11 /usr/bin/python3 \
    && ln -s /usr/bin/python3 /usr/bin/python \
    && curl -fsS https://bootstrap.pypa.io/get-pip.py | python \
    && pip3 install --no-cache-dir -U pip

# Install uv.
# NOTE(review): the installer is fetched unpinned and without `--checksum`; pin it if
# reproducible builds matter.
ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
RUN /install.sh && rm /install.sh

ENV PYTHONUNBUFFERED=1
WORKDIR /root

# Copy only the packaging metadata first so the dependency layers below stay cached
# when the rest of the source tree changes.
COPY pyproject.toml pyproject.toml
COPY olmocr/version.py olmocr/version.py

RUN /root/.local/bin/uv pip install --system --no-cache -e .

RUN /root/.local/bin/uv pip install --system --no-cache vllm==0.8.2

# Application source and helper scripts (WORKDIR is still /root from above).
COPY olmocr olmocr
COPY scripts scripts

# Smoke tests: fail the build if any entry point cannot be imported and started.
RUN vllm --help
RUN python3 -m olmocr.pipeline --help
RUN python scripts/tagging_pipeline.py --help
|
||||
@ -1,26 +0,0 @@
|
||||
# Extends a Beaker base image with nvcc 12.1, flash-attn, and PDF/font utilities.
# NOTE(review): the base is referenced by the mutable `:latest` tag; pin a digest if
# reproducible builds matter.
FROM gcr.io/ai2-beaker-core/public/cqgl31u2ba5vrtuc91og:latest

# Update the package list and install libaio-dev and gnupg2, plus
# software-properties-common, which provides add-apt-repository used below.
# Every apt layer runs its own `apt-get update` and cleans the lists in the same
# layer, so no install depends on an index cached by an earlier layer.
RUN apt-get update \
    && apt-get install -y libaio-dev gnupg2 software-properties-common \
    && rm -rf /var/lib/apt/lists/*

# Add NVIDIA package repository keys, set up the NVIDIA CUDA repository,
# and install CUDA toolkit and nvcc 12.1
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub \
    && add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" \
    && apt-get update \
    && apt-get install -y cuda-nvcc-12-1 \
    && rm -rf /var/lib/apt/lists/*

# Get flash attention setup (kept after the nvcc install, as in the original ordering)
RUN pip install --no-cache-dir flash-attn --no-build-isolation

# Install PDF utilities.
# The debconf preseed accepts the mscorefonts EULA so the font installer does not prompt.
RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections \
    && apt-get update \
    && apt-get install -y \
        poppler-utils \
        ttf-mscorefonts-installer \
        msttcorefonts \
        fonts-crosextra-caladea \
        fonts-crosextra-carlito \
        gsfonts \
        lcdf-typetools \
    && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
@ -1,41 +0,0 @@
|
||||
# Batch inference job configuration: model, prompt format, generation settings,
# pipeline I/O, and the cluster task that runs it.
model:
  # full fine tune
  name_or_path: weka://oe-data-default/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/best_bf16/
  #name_or_path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/
  vlm: true

  # necessary to prevent random crashes, until vllm fixes some bugs
  # NOTE(review): this key is placed under `model` based on its position in the file;
  # the nesting was reconstructed after indentation was lost — confirm.
  num_scheduler_steps: 1

format:
  add_generation_prompt: true

generate:
  # The model's max context length is 8096, but around 1500 tokens are reserved for the image itself
  max_context_length: 6500
  temperature: 0.8
  top_p: 1.0
  drop_long_outputs: false


pipeline:
  sqs_queue_name: jake-pdf
  num_workers: 3
  generation_batch_size: 256
  tokenization_batch_size: 64
  output_serializer: default
  target_bucket: ai2-oe-data
  # Replace the bracketed placeholder with your username. The value is quoted because
  # an unquoted leading "[" would be parsed as the start of a YAML flow sequence.
  target_object_prefix: "[your username]/pdfworkspaces/s2orc_3200k_v2/inference_outputs"
  allowed_restarts_per_predictor: 10

task:
  budget: ai2/oe-data
  workspace: ai2/oe-data-model-based-cleanup
  name: qwen2vl-schedsteps-bg
  replicas: 128
  priority: LOW
  gpu_count: 1
  cluster:
    - ai2/jupiter-cirrascale-2
    - ai2/saturn-cirrascale
|
||||
|
||||
65
scripts/train/newtrainer-gantry.sh
Executable file
65
scripts/train/newtrainer-gantry.sh
Executable file
@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env bash
#
# Resolves the Python interpreter, the olmocr version, and the current git revision,
# and derives the Docker image tag used by the rest of this script.

# Abort on the first failing command; pipefail makes a failing `git ... | cut`
# stop the script instead of silently producing an empty value.
set -eo pipefail

# Use conda environment Python if available, otherwise use system Python
if [ -n "$CONDA_PREFIX" ]; then
    PYTHON="$CONDA_PREFIX/bin/python"
    echo "Using conda Python from: $CONDA_PREFIX"
else
    PYTHON="python"
    echo "Warning: No conda environment detected, using system Python"
fi

# Get version from version.py
# "$PYTHON" is quoted so a conda prefix containing spaces still resolves to one path.
VERSION=$("$PYTHON" -c 'import olmocr.version; print(olmocr.version.VERSION)')
echo "OlmOCR version: $VERSION"

# Get first 10 characters of git hash
GIT_HASH=$(git rev-parse HEAD | cut -c1-10)
echo "Git hash: $GIT_HASH"

# Get current git branch name
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
echo "Git branch: $GIT_BRANCH"

# Create full image tag
IMAGE_TAG="olmocr-benchmark-${VERSION}-${GIT_HASH}"
echo "Building Docker image with tag: $IMAGE_TAG"
|
||||
|
||||
# Build the Docker image
echo "Building Docker image..."
docker build --platform linux/amd64 -f ./Dockerfile -t "$IMAGE_TAG" .

# Get Beaker username
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
echo "Beaker user: $BEAKER_USER"

# Push image to beaker
# Best effort: any failure of `beaker image create` is reported as "already exists"
# and the script carries on (stderr from beaker is discarded).
echo "Trying to push image to Beaker..."
if ! beaker image create --workspace ai2/oe-data-pdf --name "$IMAGE_TAG" "$IMAGE_TAG" 2>/dev/null; then
    echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image."
fi
|
||||
|
||||
# Name used for both the Beaker description and the task name. This script never
# assigns run_name itself, so honour a value exported by the caller and otherwise
# fall back to a name built from the version and git hash computed earlier,
# rather than submitting with empty strings.
run_name="${run_name:-olmocr-train-${VERSION}-${GIT_HASH}}"

# Submit the training job: 8 GPUs, preemptible, on ai2/jupiter-cirrascale-2, using
# the image built above. AWS and W&B credentials are injected from Beaker secrets.
gantry run \
    --description "${run_name}" \
    --task-name "${run_name}" \
    --allow-dirty \
    --host-networking \
    --workspace ai2/olmocr \
    --beaker-image "$IMAGE_TAG" \
    --pip gantry-train-requirements.txt \
    --priority normal \
    --gpus 8 \
    --preemptible \
    --cluster "ai2/jupiter-cirrascale-2" \
    --budget ai2/oe-data \
    --env LOG_FILTER_TYPE=local_rank0_only \
    --env OMP_NUM_THREADS=8 \
    --env BEAKER_USER_ID="$(beaker account whoami --format json | jq '.[0].name' -cr)" \
    --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
    --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
    --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
    --shared-memory 10GiB \
    --yes \
    -- /bin/bash -c "source scripts/beaker/jupiter-ib.sh && python -m olmocr.train.train --config olmocr/train/configs/example_config.yaml"
|
||||
Loading…
x
Reference in New Issue
Block a user