mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-11-03 19:45:41 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			66 lines
		
	
	
		
			2.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			66 lines
		
	
	
		
			2.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import html
 | 
						|
import multiprocessing
 | 
						|
import os
 | 
						|
import time
 | 
						|
import unittest
 | 
						|
 | 
						|
from pdelfin.extract_text import get_document_text, get_page_text
 | 
						|
from pdelfin.filter.coherency import get_document_coherency
 | 
						|
 | 
						|
 | 
						|
class TestCoherencyScores(unittest.TestCase):
 | 
						|
    def testBadOcr1(self):
        """Print coherency scores for three fixture PDFs from ``gnarly_pdfs``.

        The documents are labelled "Good", "Bad1" and "Bad2" in the output.
        This test makes no assertions; the printed scores are meant to be
        inspected manually.
        """
        fixture_dir = os.path.join(os.path.dirname(__file__), "gnarly_pdfs")

        # Extract the text of every document first, then score them in order,
        # so that all extraction happens before any score is printed.
        labelled_texts = [
            (label, get_document_text(os.path.join(fixture_dir, filename)))
            for label, filename in (
                ("Good", "instructions_and_schematics.pdf"),
                ("Bad1", "handwriting_bad_ocr.pdf"),
                ("Bad2", "some_ocr1.pdf"),
            )
        ]

        for label, text in labelled_texts:
            print(label, get_document_coherency(text))
 | 
						|
 | 
						|
    def testHugeBookCoherencySpeed(self):
        """Time ``get_document_coherency`` on a prefix of the ti89 guidebook text.

        Prints the length of the full extracted text, the coherency score of
        its first 40,000 characters, and the measured throughput in characters
        per second divided by ``multiprocessing.cpu_count()``. This test makes
        no assertions.
        """
        book_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "ti89_guidebook.pdf")
        full_text = get_document_text(book_path)
        print(f"ti89 book length: {len(full_text):,}")

        # Untimed call on a short prefix before the measured run; presumably
        # this absorbs one-time setup cost -- the result is not used.
        get_document_coherency(full_text[:1000])

        # Only the first 40,000 characters are scored and timed.
        sample = full_text[:40000]

        t0 = time.perf_counter()
        score = get_document_coherency(sample)
        elapsed = time.perf_counter() - t0

        # Normalise the raw throughput by the machine's CPU count.
        per_core_rate = len(sample) / elapsed
        per_core_rate /= multiprocessing.cpu_count()

        print(f"ti89 book score {score:.2f}")
        print(f"{per_core_rate:.2f} chars per second per core")
 | 
						|
 | 
						|
    def testTwoColumnMisparse(self):
 | 
						|
        pdftotext_text = get_page_text(
 | 
						|
            os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
 | 
						|
            page_num=2,
 | 
						|
            pdf_engine="pdftotext",
 | 
						|
        )
 | 
						|
        pymupdf_text = get_page_text(
 | 
						|
            os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
 | 
						|
            page_num=2,
 | 
						|
            pdf_engine="pymupdf",
 | 
						|
        )
 | 
						|
        pdfium_text = get_page_text(
 | 
						|
            os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
 | 
						|
            page_num=2,
 | 
						|
            pdf_engine="pdfium",
 | 
						|
        )
 | 
						|
 | 
						|
        # pdftotext_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pdftotext")
 | 
						|
        # pymupdf_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pymupdf")
 | 
						|
 | 
						|
        print("pdftotext_text", pdftotext_score := get_document_coherency(pdftotext_text))
 | 
						|
        print("pymupdf_text", pymupdf_score := get_document_coherency(pymupdf_text))
 | 
						|
        print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text))
 | 
						|
 | 
						|
        self.assertLess(pdftotext_score, pymupdf_score)
 |