Trying to get new CI to work

This commit is contained in:
Jake Poznanski 2025-03-14 02:43:55 +00:00
parent 1db1b3406b
commit f5d92bdb14
6 changed files with 35 additions and 15 deletions

View File

@ -112,7 +112,7 @@ jobs:
needs: [checks]
env:
BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}
BEAKER_IMAGE: chrisw/olmocr-gpu-ci
BEAKER_IMAGE: jakep/olmocr-gpu-ci
BEAKER_BUDGET: ai2/oe-data
BEAKER_WORKSPACE: ai2/olmocr
steps:

View File

@ -9,7 +9,7 @@ git clone https://github.com/allenai/olmocr.git olmocr \
.[gpu] \
pytest \
--find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ \
&& bash tests/gnarly_pdfs/test_gnarly_pdfs.sh
&& bash scripts/run_integration_test.sh

View File

@ -0,0 +1,6 @@
#!/usr/bin/bash
# Integration test driver.
#
# 1. Runs the olmocr pipeline over three sample PDFs, writing its output
#    into ./localworkspace.
# 2. Runs tests/test_integration.py, which inspects what the pipeline wrote.
#
# -e: abort on the first failing command; -x: echo each command before it runs
# so the CI log shows exactly what was executed.
set -ex

# pytest is chained with && so it only runs when the pipeline itself succeeded.
python -m olmocr.pipeline ./localworkspace --pdfs tests/gnarly_pdfs/ambiguous.pdf tests/gnarly_pdfs/edgar.pdf tests/gnarly_pdfs/dolma-page-1.pdf \
&& pytest tests/test_integration.py

View File

@ -1,7 +0,0 @@
import unittest
class TestGnarlyPdfs(unittest.TestCase):
    """Placeholder suite containing a single test that always passes."""

    def test_nothing_in_particular(self) -> None:
        """Noop pending jake's impl"""
        # Deliberately trivial: asserts a constant so the suite is never empty.
        always_true = True
        self.assertTrue(always_true)

View File

@ -1,6 +0,0 @@
#!/usr/bin/bash
# Runs the olmocr pipeline over one sample PDF, writing its output into
# ./localworkspace, then runs the gnarly-PDF pytest module.
#
# -e: abort on the first failing command; -x: echo each command before it runs.
set -ex

# pytest is chained with && so it only runs when the pipeline itself succeeded.
python -m olmocr.pipeline ./localworkspace --pdfs tests/gnarly_pdfs/ambiguous.pdf \
&& pytest tests/gnarly_pdfs/test_gnarly_pdfs.py

27
tests/test_integration.py Normal file
View File

@ -0,0 +1,27 @@
import glob
import json
import os
import unittest
import pytest
@pytest.mark.nonci
class TestPipelineIntegration(unittest.TestCase):
    """Check the records found in the pipeline's results directory.

    Every test reads all ``*.jsonl`` files under ``localworkspace/results``
    (relative to the current working directory) and looks for a known phrase
    in the ``"text"`` field of the parsed records. The files are expected to
    exist before the tests start; nothing here runs the pipeline itself.
    """

    # Glob pattern for the JSONL result files, relative to the working directory.
    RESULTS_GLOB = os.path.join("localworkspace", "results", "*.jsonl")

    def setUp(self):
        """Load every non-blank JSONL line from the results files into ``self.data``."""
        self.data = []
        for path in glob.glob(self.RESULTS_GLOB):
            # JSON text is UTF-8 by specification; decode explicitly instead of
            # relying on the locale default, which can differ between machines.
            with open(path, "r", encoding="utf-8") as jf:
                # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
                self.data.extend(json.loads(line) for line in jf if line.strip())

    def _assert_text_found(self, phrase):
        """Fail unless ``phrase`` occurs in the ``"text"`` of at least one record.

        The failure message reports the phrase and how many records were
        loaded, so an empty results directory is distinguishable from a
        genuine content mismatch.
        """
        self.assertTrue(
            any(phrase in doc["text"] for doc in self.data),
            f"{phrase!r} not found in the text of {len(self.data)} record(s) "
            f"loaded from {self.RESULTS_GLOB}",
        )

    def test_edgar(self) -> None:
        """A record must contain the phrase 'King of England'."""
        self._assert_text_found("King of England")

    def test_ambig(self) -> None:
        """A record must contain the phrase 'Apples and Bananas'."""
        self._assert_text_found("Apples and Bananas")

    def test_dolma(self) -> None:
        """A record must contain the phrase 'We extensively document Dolma'."""
        self._assert_text_found("We extensively document Dolma")