mirror of
https://github.com/allenai/olmocr.git
synced 2025-08-15 12:21:44 +00:00
Hopefully CI runs now
This commit is contained in:
parent
15f9b8b9dc
commit
c05e01532c
4
.github/workflows/main.yml
vendored
4
.github/workflows/main.yml
vendored
@@ -31,7 +31,7 @@ jobs:
|
||||
task:
|
||||
- name: Test
|
||||
run: |
|
||||
pytest -v --color=yes tests/
|
||||
pytest -v --color=yes -m "not nonci" tests/
|
||||
|
||||
include:
|
||||
- python: "3.11"
|
||||
@@ -39,7 +39,7 @@ jobs:
|
||||
name: Lint
|
||||
run: ruff check .
|
||||
|
||||
# Removing mypy for now, as it isn't handling async things correctly
|
||||
# Removing mypy for now, as it isn't handling async things correctly and crashing
|
||||
# - python: "3.11"
|
||||
# task:
|
||||
# name: Type check
|
||||
|
@@ -158,3 +158,6 @@ python_classes = [
|
||||
]
|
||||
log_format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
|
||||
log_level = "DEBUG"
|
||||
markers = [
|
||||
"nonci: mark test as not intended for CI runs"
|
||||
]
|
@@ -1,59 +0,0 @@
|
||||
import html
|
||||
import multiprocessing
|
||||
import os
|
||||
import time
|
||||
import unittest
|
||||
|
||||
from olmocr.filter.coherency import get_document_coherency
|
||||
from olmocr.prompts.anchor import get_anchor_text
|
||||
|
||||
|
||||
class TestCoherencyScores(unittest.TestCase):
|
||||
def testBadOcr1(self):
|
||||
good_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "instructions_and_schematics.pdf"), 1, pdf_engine="pdftotext")
|
||||
ocr1_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "handwriting_bad_ocr.pdf"), 1, pdf_engine="pdftotext")
|
||||
ocr2_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf"), 1, pdf_engine="pdftotext")
|
||||
|
||||
print("Good", get_document_coherency(good_text))
|
||||
print("Bad1", get_document_coherency(ocr1_text))
|
||||
print("Bad2", get_document_coherency(ocr2_text))
|
||||
|
||||
@unittest.skip("This test is not necessary, it's just a helpful benchmark")
|
||||
def testHugeBookCoherencySpeed(self):
|
||||
base_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ti89_guidebook.pdf"), 1, pdf_engine="pdftotext")
|
||||
print(f"ti89 book length: {len(base_text):,}")
|
||||
|
||||
warmup = get_document_coherency(base_text[:1000])
|
||||
|
||||
base_text = base_text[:40000]
|
||||
|
||||
start = time.perf_counter()
|
||||
score = get_document_coherency(base_text)
|
||||
end = time.perf_counter()
|
||||
|
||||
char_per_sec = len(base_text) / (end - start)
|
||||
char_per_sec = char_per_sec / multiprocessing.cpu_count()
|
||||
|
||||
print(f"ti89 book score {score:.2f}")
|
||||
print(f"{char_per_sec:.2f} chars per second per core")
|
||||
|
||||
def testTwoColumnMisparse(self):
|
||||
pdftotext_text = get_anchor_text(
|
||||
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
|
||||
page=2,
|
||||
pdf_engine="pdftotext",
|
||||
)
|
||||
pdfium_text = get_anchor_text(
|
||||
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
|
||||
page=2,
|
||||
pdf_engine="pdfium",
|
||||
)
|
||||
|
||||
print("pdftotext_text", pdftotext_score := get_document_coherency(pdftotext_text))
|
||||
print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text))
|
||||
|
||||
self.assertLess(pdfium_score, pdftotext_score)
|
||||
|
||||
anchor_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), 2, pdf_engine="topcoherency")
|
||||
|
||||
self.assertEqual(anchor_text, pdfium_text)
|
@@ -1,6 +1,7 @@
|
||||
import unittest
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm import tqdm
|
||||
from transformers import AutoProcessor
|
||||
@@ -14,6 +15,7 @@ from olmocr.train.dataloader import (
|
||||
from olmocr.train.dataprep import batch_prepare_data_for_qwen2_training
|
||||
|
||||
|
||||
@pytest.mark.nonci
|
||||
class TestBatchQueryResponseDataset(unittest.TestCase):
|
||||
def testLoadS3(self):
|
||||
ds = load_jsonl_into_ds("s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl", first_n_files=3)
|
||||
|
@@ -7,6 +7,7 @@ from io import BytesIO
|
||||
from unittest.mock import patch
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import requests
|
||||
import torch
|
||||
from PIL import Image
|
||||
@@ -27,6 +28,7 @@ from olmocr.train.dataprep import (
|
||||
from olmocr.train.utils import make_dataset
|
||||
|
||||
|
||||
@pytest.mark.nonci
|
||||
class TestDataprep(unittest.TestCase):
|
||||
def testFullDataloader(self):
|
||||
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
|
||||
|
@@ -1,5 +1,6 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
from PIL import Image
|
||||
from transformers import (
|
||||
@@ -10,6 +11,7 @@ from transformers import (
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.nonci
|
||||
class MolmoProcessorTest(unittest.TestCase):
|
||||
def test_molmo_demo(self):
|
||||
# load the processor
|
||||
|
@@ -8,13 +8,13 @@ import base64
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from httpx import AsyncClient
|
||||
@@ -36,7 +36,7 @@ MODEL_FINETUNED_PATH = (
|
||||
)
|
||||
|
||||
|
||||
@unittest.skip("Skip these tests when running CI, they are mostly for experimentation")
|
||||
@pytest.mark.nonci
|
||||
class TestSglangServer(unittest.IsolatedAsyncioTestCase):
|
||||
async def asyncSetUp(self):
|
||||
# Mock arguments
|
||||
@@ -110,6 +110,7 @@ class TestSglangServer(unittest.IsolatedAsyncioTestCase):
|
||||
# os.rmdir(self.args.workspace)
|
||||
|
||||
|
||||
@pytest.mark.nonci
|
||||
class TestHuggingFaceModel(unittest.IsolatedAsyncioTestCase):
|
||||
async def asyncSetUp(self):
|
||||
# Set up the Hugging Face model and tokenizer
|
||||
@@ -248,6 +249,7 @@ class TestHuggingFaceModel(unittest.IsolatedAsyncioTestCase):
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
|
||||
@pytest.mark.nonci
|
||||
class RawSGLangTest(unittest.IsolatedAsyncioTestCase):
|
||||
def setUp(self):
|
||||
# Set up the Hugging Face model and tokenizer
|
||||
|
Loading…
x
Reference in New Issue
Block a user