diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3cc2434..f49b48f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -112,7 +112,7 @@ jobs: needs: [checks] env: BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }} - BEAKER_IMAGE: chrisw/olmocr-gpu-ci + BEAKER_IMAGE: jakep/olmocr-gpu-ci BEAKER_BUDGET: ai2/oe-data BEAKER_WORKSPACE: ai2/olmocr steps: diff --git a/scripts/beaker/gpu-ci-script.sh b/scripts/beaker/gpu-ci-script.sh index bc216fb..22f574c 100755 --- a/scripts/beaker/gpu-ci-script.sh +++ b/scripts/beaker/gpu-ci-script.sh @@ -9,7 +9,7 @@ git clone https://github.com/allenai/olmocr.git olmocr \ .[gpu] \ pytest \ --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ \ - && bash tests/gnarly_pdfs/test_gnarly_pdfs.sh + && bash scripts/run_integration_test.sh diff --git a/scripts/run_integration_test.sh b/scripts/run_integration_test.sh new file mode 100644 index 0000000..be3332d --- /dev/null +++ b/scripts/run_integration_test.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +set -ex + +python -m olmocr.pipeline ./localworkspace --pdfs tests/gnarly_pdfs/ambiguous.pdf tests/gnarly_pdfs/edgar.pdf tests/gnarly_pdfs/dolma-page-1.pdf \ + && pytest tests/test_integration.py diff --git a/tests/gnarly_pdfs/test_gnarly_pdfs.py b/tests/gnarly_pdfs/test_gnarly_pdfs.py deleted file mode 100644 index 38aad28..0000000 --- a/tests/gnarly_pdfs/test_gnarly_pdfs.py +++ /dev/null @@ -1,7 +0,0 @@ -import unittest - - -class TestGnarlyPdfs(unittest.TestCase): - def test_nothing_in_particular(self) -> None: - """Noop pending jake's impl""" - self.assertTrue(True) diff --git a/tests/gnarly_pdfs/test_gnarly_pdfs.sh b/tests/gnarly_pdfs/test_gnarly_pdfs.sh deleted file mode 100644 index 13f18ac..0000000 --- a/tests/gnarly_pdfs/test_gnarly_pdfs.sh +++ /dev/null @@ -1,6 +0,0 @@ -#/usr/bin/bash - -set -ex - -python -m olmocr.pipeline ./localworkspace --pdfs tests/gnarly_pdfs/ambiguous.pdf \ - && pytest tests/gnarly_pdfs/test_gnarly_pdfs.py diff --git 
a/tests/test_integration.py b/tests/test_integration.py
new file mode 100644
index 0000000..fe8ceeb
--- /dev/null
+++ b/tests/test_integration.py
@@ -0,0 +1,53 @@
+"""Integration tests over the JSONL results in ``localworkspace/results``.
+
+These tests only read existing output; ``scripts/run_integration_test.sh``
+runs the pipeline into ``./localworkspace`` before invoking pytest on them.
+"""
+
+import glob
+import json
+import os
+import unittest
+
+import pytest
+
+RESULTS_GLOB = os.path.join("localworkspace", "results", "*.jsonl")
+
+
+@pytest.mark.nonci
+class TestPipelineIntegration(unittest.TestCase):
+    """Check that expected phrases appear in the "text" of the result records.
+
+    NOTE(review): each phrase presumably comes from one of the PDFs passed to
+    the pipeline (edgar, ambiguous, dolma-page-1) -- confirm against fixtures.
+    """
+
+    def setUp(self):
+        """Load every non-blank JSONL record from the results directory."""
+        self.data = []
+
+        for path in glob.glob(RESULTS_GLOB):
+            # JSON is UTF-8 by specification; do not rely on the locale's
+            # default encoding when decoding record text.
+            with open(path, "r", encoding="utf-8") as jf:
+                self.data.extend(json.loads(row) for row in jf if row.strip())
+
+        # Fail loudly here rather than with three opaque "False is not true"
+        # failures when no results were produced.
+        self.assertTrue(self.data, f"no records found in {RESULTS_GLOB}")
+
+    def _assert_text_found(self, phrase: str) -> None:
+        """Assert that some record's ``"text"`` field contains *phrase*."""
+        self.assertTrue(
+            any(phrase in record["text"] for record in self.data),
+            f"{phrase!r} not found in any of {len(self.data)} records",
+        )
+
+    def test_edgar(self) -> None:
+        self._assert_text_found("King of England")
+
+    def test_ambig(self) -> None:
+        self._assert_text_found("Apples and Bananas")
+
+    def test_dolma(self) -> None:
+        self._assert_text_found("We extensively document Dolma")