olmocr/tests/test_integration.py

29 lines
872 B
Python
Raw Normal View History

2025-03-14 02:43:55 +00:00
import glob
import json
import os
import unittest
import pytest
@pytest.mark.nonci
class TestPipelineIntegration(unittest.TestCase):
    """Integration checks over the JSONL output of a local pipeline run.

    Every test asserts that a known phrase appears in the ``"text"`` field
    of at least one record loaded from ``localworkspace/results/*.jsonl``.
    The results must already exist on disk; this class only reads them.

    NOTE(review): the ``nonci`` marker presumably keeps these tests out of
    the default CI run -- confirm against the project's pytest configuration.
    """

    def setUp(self) -> None:
        """Load every non-blank JSONL line from the results directory.

        Fails the test with an explicit message when no records are found,
        rather than crashing on an empty list when echoing the last record.
        """
        self.data = []
        pattern = os.path.join("localworkspace", "results", "*.jsonl")
        for path in glob.glob(pattern):
            # JSON text is UTF-8 by specification; do not rely on the locale default.
            with open(path, "r", encoding="utf-8") as jf:
                for raw_line in jf:
                    if raw_line.strip():
                        self.data.append(json.loads(raw_line))

        if not self.data:
            self.fail(f"no records found under {pattern}; run the pipeline first")

        # Echo the last loaded record to help debugging when a test fails.
        print(self.data[-1])

    def _assert_text_contains(self, phrase: str) -> None:
        """Assert that ``phrase`` occurs in the ``"text"`` of at least one record."""
        self.assertTrue(
            any(phrase in record["text"] for record in self.data),
            f"no record's text contains {phrase!r}",
        )

    def test_edgar(self) -> None:
        """Some record's text must contain "King of the English"."""
        self._assert_text_contains("King of the English")

    def test_ambig(self) -> None:
        """Some record's text must contain "Apples and Bananas"."""
        self._assert_text_contains("Apples and Bananas")

    def test_dolma(self) -> None:
        """Some record's text must contain "We extensively document Dolma"."""
        self._assert_text_contains("We extensively document Dolma")