Hopefully CI runs now

This commit is contained in:
Jake Poznanski 2025-02-14 20:42:19 +00:00
parent 15f9b8b9dc
commit c05e01532c
7 changed files with 15 additions and 63 deletions

View File

@ -31,7 +31,7 @@ jobs:
task:
- name: Test
run: |
pytest -v --color=yes tests/
pytest -v --color=yes -m "not nonci" tests/
include:
- python: "3.11"
@ -39,7 +39,7 @@ jobs:
name: Lint
run: ruff check .
# Removing mypy for now, as it isn't handling async things correctly
# Removing mypy for now, as it isn't handling async things correctly and crashing
# - python: "3.11"
# task:
# name: Type check

View File

@ -158,3 +158,6 @@ python_classes = [
]
log_format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
log_level = "DEBUG"
markers = [
"nonci: mark test as not intended for CI runs"
]

View File

@ -1,59 +0,0 @@
import html
import multiprocessing
import os
import time
import unittest
from olmocr.filter.coherency import get_document_coherency
from olmocr.prompts.anchor import get_anchor_text
class TestCoherencyScores(unittest.TestCase):
def testBadOcr1(self):
good_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "instructions_and_schematics.pdf"), 1, pdf_engine="pdftotext")
ocr1_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "handwriting_bad_ocr.pdf"), 1, pdf_engine="pdftotext")
ocr2_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf"), 1, pdf_engine="pdftotext")
print("Good", get_document_coherency(good_text))
print("Bad1", get_document_coherency(ocr1_text))
print("Bad2", get_document_coherency(ocr2_text))
@unittest.skip("This test is not necessary, it's just a helpful benchmark")
def testHugeBookCoherencySpeed(self):
base_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ti89_guidebook.pdf"), 1, pdf_engine="pdftotext")
print(f"ti89 book length: {len(base_text):,}")
warmup = get_document_coherency(base_text[:1000])
base_text = base_text[:40000]
start = time.perf_counter()
score = get_document_coherency(base_text)
end = time.perf_counter()
char_per_sec = len(base_text) / (end - start)
char_per_sec = char_per_sec / multiprocessing.cpu_count()
print(f"ti89 book score {score:.2f}")
print(f"{char_per_sec:.2f} chars per second per core")
def testTwoColumnMisparse(self):
pdftotext_text = get_anchor_text(
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
page=2,
pdf_engine="pdftotext",
)
pdfium_text = get_anchor_text(
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
page=2,
pdf_engine="pdfium",
)
print("pdftotext_text", pdftotext_score := get_document_coherency(pdftotext_text))
print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text))
self.assertLess(pdfium_score, pdftotext_score)
anchor_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), 2, pdf_engine="topcoherency")
self.assertEqual(anchor_text, pdfium_text)

View File

@ -1,6 +1,7 @@
import unittest
from functools import partial
import pytest
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoProcessor
@ -14,6 +15,7 @@ from olmocr.train.dataloader import (
from olmocr.train.dataprep import batch_prepare_data_for_qwen2_training
@pytest.mark.nonci
class TestBatchQueryResponseDataset(unittest.TestCase):
def testLoadS3(self):
ds = load_jsonl_into_ds("s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl", first_n_files=3)

View File

@ -7,6 +7,7 @@ from io import BytesIO
from unittest.mock import patch
import numpy as np
import pytest
import requests
import torch
from PIL import Image
@ -27,6 +28,7 @@ from olmocr.train.dataprep import (
from olmocr.train.utils import make_dataset
@pytest.mark.nonci
class TestDataprep(unittest.TestCase):
def testFullDataloader(self):
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

View File

@ -1,5 +1,6 @@
import unittest
import pytest
import requests
from PIL import Image
from transformers import (
@ -10,6 +11,7 @@ from transformers import (
)
@pytest.mark.nonci
class MolmoProcessorTest(unittest.TestCase):
def test_molmo_demo(self):
# load the processor

View File

@ -8,13 +8,13 @@ import base64
import json
import math
import os
import tempfile
import unittest
from io import BytesIO
from pathlib import Path
from unittest.mock import AsyncMock, patch
import numpy as np
import pytest
import torch
import torch.nn.functional as F
from httpx import AsyncClient
@ -36,7 +36,7 @@ MODEL_FINETUNED_PATH = (
)
@unittest.skip("Skip these tests when running CI, they are mostly for experimentation")
@pytest.mark.nonci
class TestSglangServer(unittest.IsolatedAsyncioTestCase):
async def asyncSetUp(self):
# Mock arguments
@ -110,6 +110,7 @@ class TestSglangServer(unittest.IsolatedAsyncioTestCase):
# os.rmdir(self.args.workspace)
@pytest.mark.nonci
class TestHuggingFaceModel(unittest.IsolatedAsyncioTestCase):
async def asyncSetUp(self):
# Set up the Hugging Face model and tokenizer
@ -248,6 +249,7 @@ class TestHuggingFaceModel(unittest.IsolatedAsyncioTestCase):
torch.cuda.empty_cache()
@pytest.mark.nonci
class RawSGLangTest(unittest.IsolatedAsyncioTestCase):
def setUp(self):
# Set up the Hugging Face model and tokenizer