mirror of
https://github.com/allenai/olmocr.git
synced 2025-08-16 04:42:39 +00:00
Hopefully CI runs now
This commit is contained in:
parent
15f9b8b9dc
commit
c05e01532c
4
.github/workflows/main.yml
vendored
4
.github/workflows/main.yml
vendored
@ -31,7 +31,7 @@ jobs:
|
|||||||
task:
|
task:
|
||||||
- name: Test
|
- name: Test
|
||||||
run: |
|
run: |
|
||||||
pytest -v --color=yes tests/
|
pytest -v --color=yes -m "not nonci" tests/
|
||||||
|
|
||||||
include:
|
include:
|
||||||
- python: "3.11"
|
- python: "3.11"
|
||||||
@ -39,7 +39,7 @@ jobs:
|
|||||||
name: Lint
|
name: Lint
|
||||||
run: ruff check .
|
run: ruff check .
|
||||||
|
|
||||||
# Removing mypy for now, as it isn't handling async things correctly
|
# Removing mypy for now, as it isn't handling async things correctly and crashing
|
||||||
# - python: "3.11"
|
# - python: "3.11"
|
||||||
# task:
|
# task:
|
||||||
# name: Type check
|
# name: Type check
|
||||||
|
@ -158,3 +158,6 @@ python_classes = [
|
|||||||
]
|
]
|
||||||
log_format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
|
log_format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
|
||||||
log_level = "DEBUG"
|
log_level = "DEBUG"
|
||||||
|
markers = [
|
||||||
|
"nonci: mark test as not intended for CI runs"
|
||||||
|
]
|
@ -1,59 +0,0 @@
|
|||||||
import html
|
|
||||||
import multiprocessing
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
import unittest
|
|
||||||
|
|
||||||
from olmocr.filter.coherency import get_document_coherency
|
|
||||||
from olmocr.prompts.anchor import get_anchor_text
|
|
||||||
|
|
||||||
|
|
||||||
class TestCoherencyScores(unittest.TestCase):
|
|
||||||
def testBadOcr1(self):
|
|
||||||
good_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "instructions_and_schematics.pdf"), 1, pdf_engine="pdftotext")
|
|
||||||
ocr1_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "handwriting_bad_ocr.pdf"), 1, pdf_engine="pdftotext")
|
|
||||||
ocr2_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf"), 1, pdf_engine="pdftotext")
|
|
||||||
|
|
||||||
print("Good", get_document_coherency(good_text))
|
|
||||||
print("Bad1", get_document_coherency(ocr1_text))
|
|
||||||
print("Bad2", get_document_coherency(ocr2_text))
|
|
||||||
|
|
||||||
@unittest.skip("This test is not necessary, it's just a helpful benchmark")
|
|
||||||
def testHugeBookCoherencySpeed(self):
|
|
||||||
base_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ti89_guidebook.pdf"), 1, pdf_engine="pdftotext")
|
|
||||||
print(f"ti89 book length: {len(base_text):,}")
|
|
||||||
|
|
||||||
warmup = get_document_coherency(base_text[:1000])
|
|
||||||
|
|
||||||
base_text = base_text[:40000]
|
|
||||||
|
|
||||||
start = time.perf_counter()
|
|
||||||
score = get_document_coherency(base_text)
|
|
||||||
end = time.perf_counter()
|
|
||||||
|
|
||||||
char_per_sec = len(base_text) / (end - start)
|
|
||||||
char_per_sec = char_per_sec / multiprocessing.cpu_count()
|
|
||||||
|
|
||||||
print(f"ti89 book score {score:.2f}")
|
|
||||||
print(f"{char_per_sec:.2f} chars per second per core")
|
|
||||||
|
|
||||||
def testTwoColumnMisparse(self):
|
|
||||||
pdftotext_text = get_anchor_text(
|
|
||||||
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
|
|
||||||
page=2,
|
|
||||||
pdf_engine="pdftotext",
|
|
||||||
)
|
|
||||||
pdfium_text = get_anchor_text(
|
|
||||||
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
|
|
||||||
page=2,
|
|
||||||
pdf_engine="pdfium",
|
|
||||||
)
|
|
||||||
|
|
||||||
print("pdftotext_text", pdftotext_score := get_document_coherency(pdftotext_text))
|
|
||||||
print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text))
|
|
||||||
|
|
||||||
self.assertLess(pdfium_score, pdftotext_score)
|
|
||||||
|
|
||||||
anchor_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), 2, pdf_engine="topcoherency")
|
|
||||||
|
|
||||||
self.assertEqual(anchor_text, pdfium_text)
|
|
@ -1,6 +1,7 @@
|
|||||||
import unittest
|
import unittest
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
|
import pytest
|
||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from transformers import AutoProcessor
|
from transformers import AutoProcessor
|
||||||
@ -14,6 +15,7 @@ from olmocr.train.dataloader import (
|
|||||||
from olmocr.train.dataprep import batch_prepare_data_for_qwen2_training
|
from olmocr.train.dataprep import batch_prepare_data_for_qwen2_training
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.nonci
|
||||||
class TestBatchQueryResponseDataset(unittest.TestCase):
|
class TestBatchQueryResponseDataset(unittest.TestCase):
|
||||||
def testLoadS3(self):
|
def testLoadS3(self):
|
||||||
ds = load_jsonl_into_ds("s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl", first_n_files=3)
|
ds = load_jsonl_into_ds("s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl", first_n_files=3)
|
||||||
|
@ -7,6 +7,7 @@ from io import BytesIO
|
|||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pytest
|
||||||
import requests
|
import requests
|
||||||
import torch
|
import torch
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
@ -27,6 +28,7 @@ from olmocr.train.dataprep import (
|
|||||||
from olmocr.train.utils import make_dataset
|
from olmocr.train.utils import make_dataset
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.nonci
|
||||||
class TestDataprep(unittest.TestCase):
|
class TestDataprep(unittest.TestCase):
|
||||||
def testFullDataloader(self):
|
def testFullDataloader(self):
|
||||||
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
|
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
|
import pytest
|
||||||
import requests
|
import requests
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from transformers import (
|
from transformers import (
|
||||||
@ -10,6 +11,7 @@ from transformers import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.nonci
|
||||||
class MolmoProcessorTest(unittest.TestCase):
|
class MolmoProcessorTest(unittest.TestCase):
|
||||||
def test_molmo_demo(self):
|
def test_molmo_demo(self):
|
||||||
# load the processor
|
# load the processor
|
||||||
|
@ -8,13 +8,13 @@ import base64
|
|||||||
import json
|
import json
|
||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
import tempfile
|
|
||||||
import unittest
|
import unittest
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest.mock import AsyncMock, patch
|
from unittest.mock import AsyncMock, patch
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from httpx import AsyncClient
|
from httpx import AsyncClient
|
||||||
@ -36,7 +36,7 @@ MODEL_FINETUNED_PATH = (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@unittest.skip("Skip these tests when running CI, they are mostly for experimentation")
|
@pytest.mark.nonci
|
||||||
class TestSglangServer(unittest.IsolatedAsyncioTestCase):
|
class TestSglangServer(unittest.IsolatedAsyncioTestCase):
|
||||||
async def asyncSetUp(self):
|
async def asyncSetUp(self):
|
||||||
# Mock arguments
|
# Mock arguments
|
||||||
@ -110,6 +110,7 @@ class TestSglangServer(unittest.IsolatedAsyncioTestCase):
|
|||||||
# os.rmdir(self.args.workspace)
|
# os.rmdir(self.args.workspace)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.nonci
|
||||||
class TestHuggingFaceModel(unittest.IsolatedAsyncioTestCase):
|
class TestHuggingFaceModel(unittest.IsolatedAsyncioTestCase):
|
||||||
async def asyncSetUp(self):
|
async def asyncSetUp(self):
|
||||||
# Set up the Hugging Face model and tokenizer
|
# Set up the Hugging Face model and tokenizer
|
||||||
@ -248,6 +249,7 @@ class TestHuggingFaceModel(unittest.IsolatedAsyncioTestCase):
|
|||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.nonci
|
||||||
class RawSGLangTest(unittest.IsolatedAsyncioTestCase):
|
class RawSGLangTest(unittest.IsolatedAsyncioTestCase):
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
# Set up the Hugging Face model and tokenizer
|
# Set up the Hugging Face model and tokenizer
|
||||||
|
Loading…
x
Reference in New Issue
Block a user