2024-09-18 22:52:42 +00:00
|
|
|
import html
|
|
|
|
import multiprocessing
|
2024-09-17 16:26:55 +00:00
|
|
|
import os
|
2024-09-17 19:59:02 +00:00
|
|
|
import time
|
2024-09-17 16:26:55 +00:00
|
|
|
import unittest
|
|
|
|
|
2025-01-27 18:30:41 +00:00
|
|
|
from olmocr.filter.coherency import get_document_coherency
|
|
|
|
from olmocr.prompts.anchor import get_anchor_text
|
2024-09-17 16:26:55 +00:00
|
|
|
|
2025-01-29 15:25:10 -08:00
|
|
|
|
2024-09-17 16:26:55 +00:00
|
|
|
class TestCoherencyScores(unittest.TestCase):
    """Run ``get_document_coherency`` on text extracted from sample PDFs.

    All PDF fixtures are looked up in the ``gnarly_pdfs`` directory located
    next to this test module.
    """

    @staticmethod
    def _fixture_path(filename):
        """Return the path to ``filename`` inside the ``gnarly_pdfs`` directory."""
        return os.path.join(os.path.dirname(__file__), "gnarly_pdfs", filename)

    def testBadOcr1(self):
        """Print coherency scores for three fixtures labelled Good/Bad1/Bad2.

        NOTE(review): this test only prints the scores and asserts nothing
        about their ordering -- confirm whether an assertion is intended.
        """
        good_text = get_anchor_text(self._fixture_path("instructions_and_schematics.pdf"), 1, pdf_engine="pdftotext")
        ocr1_text = get_anchor_text(self._fixture_path("handwriting_bad_ocr.pdf"), 1, pdf_engine="pdftotext")
        ocr2_text = get_anchor_text(self._fixture_path("some_ocr1.pdf"), 1, pdf_engine="pdftotext")

        print("Good", get_document_coherency(good_text))
        print("Bad1", get_document_coherency(ocr1_text))
        print("Bad2", get_document_coherency(ocr2_text))

    @unittest.skip("This test is not necessary, it's just a helpful benchmark")
    def testHugeBookCoherencySpeed(self):
        """Benchmark ``get_document_coherency`` on text from the ti89 guidebook.

        Times a single scoring call over the first 40,000 characters of the
        extracted text and prints the score and the throughput divided by
        ``multiprocessing.cpu_count()``.
        """
        base_text = get_anchor_text(self._fixture_path("ti89_guidebook.pdf"), 1, pdf_engine="pdftotext")
        print(f"ti89 book length: {len(base_text):,}")

        # Warm-up call on a short prefix before timing; the result is
        # deliberately discarded (presumably to keep one-time setup cost out
        # of the measurement -- TODO confirm).
        get_document_coherency(base_text[:1000])

        # Only the first 40,000 characters are timed.
        base_text = base_text[:40000]

        start = time.perf_counter()
        score = get_document_coherency(base_text)
        end = time.perf_counter()

        # Throughput is reported per core, i.e. divided by the CPU count.
        char_per_sec = len(base_text) / (end - start)
        char_per_sec = char_per_sec / multiprocessing.cpu_count()

        print(f"ti89 book score {score:.2f}")
        print(f"{char_per_sec:.2f} chars per second per core")

    def testTwoColumnMisparse(self):
        """Compare pdftotext and pdfium extraction of a two-column page.

        Asserts that the pdfium text scores lower than the pdftotext text,
        and that the ``"topcoherency"`` engine returns exactly the pdfium
        text for the same page.
        """
        pdf_path = self._fixture_path("pdftotext_two_column_issue.pdf")

        pdftotext_text = get_anchor_text(
            pdf_path,
            page=2,
            pdf_engine="pdftotext",
        )
        pdfium_text = get_anchor_text(
            pdf_path,
            page=2,
            pdf_engine="pdfium",
        )

        # Compute the scores up front so the assertion below does not depend
        # on assignments hidden inside print() calls.
        pdftotext_score = get_document_coherency(pdftotext_text)
        print("pdftotext_text", pdftotext_score)
        pdfium_score = get_document_coherency(pdfium_text)
        print("pdfium_text", pdfium_score)

        self.assertLess(pdfium_score, pdftotext_score)

        anchor_text = get_anchor_text(pdf_path, 2, pdf_engine="topcoherency")

        self.assertEqual(anchor_text, pdfium_text)
|