olmocr/tests/test_coherency.py

import html
import multiprocessing
import os
import time
import unittest

from olmocr.filter.coherency import get_document_coherency
from olmocr.prompts.anchor import get_anchor_text


class TestCoherencyScores(unittest.TestCase):
    """Smoke tests and a benchmark for ``get_document_coherency`` over the PDFs in ``gnarly_pdfs``."""

    @staticmethod
    def _gnarly_pdf(filename):
        """Return the path of ``filename`` inside the ``gnarly_pdfs`` directory next to this file."""
        return os.path.join(os.path.dirname(__file__), "gnarly_pdfs", filename)

    def testBadOcr1(self):
        """Print the coherency score for three sample PDFs; no assertions are made."""
        labelled_pdfs = (
            ("Good", "instructions_and_schematics.pdf"),
            ("Bad1", "handwriting_bad_ocr.pdf"),
            ("Bad2", "some_ocr1.pdf"),
        )

        # Extract all three texts before scoring any of them, so extraction
        # and scoring happen in two separate passes.
        extracted = [(label, get_anchor_text(self._gnarly_pdf(filename), 1, pdf_engine="pdftotext")) for label, filename in labelled_pdfs]

        for label, page_text in extracted:
            print(label, get_document_coherency(page_text))

    @unittest.skip("This test is not necessary, it's just a helpful benchmark")
    def testHugeBookCoherencySpeed(self):
        """Benchmark (skipped by default): time scoring of the first 40,000 characters of ``ti89_guidebook.pdf``.

        Prints the score and the throughput in characters per second divided by
        the CPU count reported by ``multiprocessing.cpu_count()``.
        """
        full_text = get_anchor_text(self._gnarly_pdf("ti89_guidebook.pdf"), 1, pdf_engine="pdftotext")
        print(f"ti89 book length: {len(full_text):,}")

        # Untimed call on a short prefix before the measured run; its result is discarded.
        get_document_coherency(full_text[:1000])

        sample = full_text[:40000]

        t0 = time.perf_counter()
        score = get_document_coherency(sample)
        elapsed = time.perf_counter() - t0

        per_core_rate = len(sample) / elapsed / multiprocessing.cpu_count()

        print(f"ti89 book score {score:.2f}")
        print(f"{per_core_rate:.2f} chars per second per core")

    def testTwoColumnMisparse(self):
        """Compare the pdftotext and pdfium extractions of ``pdftotext_two_column_issue.pdf``.

        Asserts that the pdfium text scores strictly lower than the pdftotext
        text, and that the ``topcoherency`` engine returns exactly the pdfium text.
        """
        pdf_path = self._gnarly_pdf("pdftotext_two_column_issue.pdf")

        pdftotext_text = get_anchor_text(pdf_path, page=2, pdf_engine="pdftotext")
        pdfium_text = get_anchor_text(pdf_path, page=2, pdf_engine="pdfium")

        pdftotext_score = get_document_coherency(pdftotext_text)
        print("pdftotext_text", pdftotext_score)
        pdfium_score = get_document_coherency(pdfium_text)
        print("pdfium_text", pdfium_score)

        self.assertLess(pdfium_score, pdftotext_score)

        anchor_text = get_anchor_text(pdf_path, 2, pdf_engine="topcoherency")
        self.assertEqual(anchor_text, pdfium_text)
Formatting 2024-09-18 22:52:42 +00:00			`import html`
			`import multiprocessing`
Moving a whole bunch of code over, still broken 2024-09-17 16:26:55 +00:00			`import os`
450tok/sec/core with smollm that appears to work well 2024-09-17 19:59:02 +00:00			`import time`
Moving a whole bunch of code over, still broken 2024-09-17 16:26:55 +00:00			`import unittest`

Massive refactor from pdelfin to olmocr 2025-01-27 18:30:41 +00:00			`from olmocr.filter.coherency import get_document_coherency`
			`from olmocr.prompts.anchor import get_anchor_text`
Moving a whole bunch of code over, still broken 2024-09-17 16:26:55 +00:00
isort 2025-01-29 15:25:10 -08:00
Moving a whole bunch of code over, still broken 2024-09-17 16:26:55 +00:00			`class TestCoherencyScores(unittest.TestCase):`
			`def testBadOcr1(self):`
Black formatting 2025-01-29 15:30:39 -08:00			`good_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "instructions_and_schematics.pdf"), 1, pdf_engine="pdftotext")`
Fix up some tests but I don't see why this isn't working 2024-10-10 16:41:19 +00:00			`ocr1_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "handwriting_bad_ocr.pdf"), 1, pdf_engine="pdftotext")`
			`ocr2_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf"), 1, pdf_engine="pdftotext")`
Moving a whole bunch of code over, still broken 2024-09-17 16:26:55 +00:00
			`print("Good", get_document_coherency(good_text))`
Testing coherence with distilgpt2, but it doesn't work great 2024-09-17 16:58:45 +00:00			`print("Bad1", get_document_coherency(ocr1_text))`
			`print("Bad2", get_document_coherency(ocr2_text))`

Isort and black update 2025-01-29 15:42:34 -08:00			`@unittest.skip("This test is not necessary, it's just a helpful benchmark")`
450tok/sec/core with smollm that appears to work well 2024-09-17 19:59:02 +00:00			`def testHugeBookCoherencySpeed(self):`
Fix up some tests but I don't see why this isn't working 2024-10-10 16:41:19 +00:00			`base_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ti89_guidebook.pdf"), 1, pdf_engine="pdftotext")`
450tok/sec/core with smollm that appears to work well 2024-09-17 19:59:02 +00:00			`print(f"ti89 book length: {len(base_text):,}")`

			`warmup = get_document_coherency(base_text[:1000])`

			`base_text = base_text[:40000]`

			`start = time.perf_counter()`
			`score = get_document_coherency(base_text)`
			`end = time.perf_counter()`

			`char_per_sec = len(base_text) / (end - start)`
			`char_per_sec = char_per_sec / multiprocessing.cpu_count()`

			`print(f"ti89 book score {score:.2f}")`
			`print(f"{char_per_sec:.2f} chars per second per core")`
Formatting 2024-09-18 22:52:42 +00:00
Testing coherence with distilgpt2, but it doesn't work great 2024-09-17 16:58:45 +00:00			`def testTwoColumnMisparse(self):`
Fix up some tests but I don't see why this isn't working 2024-10-10 16:41:19 +00:00			`pdftotext_text = get_anchor_text(`
Formatting 2024-09-18 22:52:42 +00:00			`os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),`
Fix up some tests but I don't see why this isn't working 2024-10-10 16:41:19 +00:00			`page=2,`
Formatting 2024-09-18 22:52:42 +00:00			`pdf_engine="pdftotext",`
			`)`
Fix up some tests but I don't see why this isn't working 2024-10-10 16:41:19 +00:00			`pdfium_text = get_anchor_text(`
Formatting 2024-09-18 22:52:42 +00:00			`os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),`
Fix up some tests but I don't see why this isn't working 2024-10-10 16:41:19 +00:00			`page=2,`
Formatting 2024-09-18 22:52:42 +00:00			`pdf_engine="pdfium",`
			`)`

450tok/sec/core with smollm that appears to work well 2024-09-17 19:59:02 +00:00			`print("pdftotext_text", pdftotext_score := get_document_coherency(pdftotext_text))`
			`print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text))`

Removing pymupdf 2025-01-30 15:51:54 -08:00			`self.assertLess(pdfium_score, pdftotext_score)`
coherency based anchor text 2024-10-01 20:19:03 +00:00
			`anchor_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), 2, pdf_engine="topcoherency")`

Removing pymupdf 2025-01-30 15:51:54 -08:00			`self.assertEqual(anchor_text, pdfium_text)`