olmocr/tests/test_coherency.py

import os

import unittest

from pdelfin.filter.coherency import get_document_coherency
from pdelfin.extract_text import get_document_text, get_page_text


class TestCoherencyScores(unittest.TestCase):
    """Exploratory checks that print coherency scores for sample PDFs.

    Neither test makes an assertion: each one extracts text from files in the
    ``gnarly_pdfs`` directory located next to this module and prints whatever
    ``get_document_coherency`` returns, so the values can be inspected by hand.
    """

    @staticmethod
    def _gnarly_pdf(filename):
        # Resolve a fixture file name against the gnarly_pdfs folder that sits
        # beside this test module.
        fixtures_dir = os.path.join(os.path.dirname(__file__), "gnarly_pdfs")
        return os.path.join(fixtures_dir, filename)

    def testBadOcr1(self):
        # NOTE: plain comments (not docstrings) are used on the test methods so
        # that unittest's reported test descriptions stay exactly as before.
        #
        # Each entry pairs the label that gets printed with the fixture it
        # describes.
        labelled_pdfs = (
            ("Good", "instructions_and_schematics.pdf"),
            ("Bad1", "handwriting_bad_ocr.pdf"),
            ("Bad2", "some_ocr1.pdf"),
        )

        # Extract the text of every document up front, then score them, so the
        # extraction calls all happen before the first coherency call.
        extracted = [
            (label, get_document_text(self._gnarly_pdf(filename)))
            for label, filename in labelled_pdfs
        ]

        for label, text in extracted:
            print(label, get_document_coherency(text))

    def testTwoColumnMisparse(self):
        # Extract the same page (page_num=2) of one PDF with three different
        # engines and print the coherency score obtained for each extraction.
        pdf_path = self._gnarly_pdf("pdftotext_two_column_issue.pdf")

        # (printed label, pdf_engine value) pairs, in the order they are run.
        labelled_engines = (
            ("pdftotext_text", "pdftotext"),
            ("pymupdf_text", "pymupdf"),
            ("pdfium_text", "pdfium"),
        )

        # NOTE: a whole-document variant of this comparison (get_document_text
        # with pdf_engine="pdftotext" / "pymupdf") was sketched here earlier;
        # only the single-page comparison is active.
        page_texts = [
            (label, get_page_text(pdf_path, page_num=2, pdf_engine=engine))
            for label, engine in labelled_engines
        ]

        for label, text in page_texts:
            print(label, get_document_coherency(text))
Moving a whole bunch of code over, still broken 2024-09-17 16:26:55 +00:00			`import os`

			`import unittest`

			`from pdelfin.filter.coherency import get_document_coherency`
Testing coherence with distilgpt2, but it doesn't work great 2024-09-17 16:58:45 +00:00			`from pdelfin.extract_text import get_document_text, get_page_text`
Moving a whole bunch of code over, still broken 2024-09-17 16:26:55 +00:00

			`class TestCoherencyScores(unittest.TestCase):`
			`def testBadOcr1(self):`
			`good_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "instructions_and_schematics.pdf"))`
Testing coherence with distilgpt2, but it doesn't work great 2024-09-17 16:58:45 +00:00			`ocr1_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "handwriting_bad_ocr.pdf"))`
			`ocr2_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf"))`
Moving a whole bunch of code over, still broken 2024-09-17 16:26:55 +00:00
			`print("Good", get_document_coherency(good_text))`
Testing coherence with distilgpt2, but it doesn't work great 2024-09-17 16:58:45 +00:00			`print("Bad1", get_document_coherency(ocr1_text))`
			`print("Bad2", get_document_coherency(ocr2_text))`

			`def testTwoColumnMisparse(self):`
			`pdftotext_text = get_page_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), page_num=2, pdf_engine="pdftotext")`
			`pymupdf_text = get_page_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), page_num=2, pdf_engine="pymupdf")`
Using SmolLM, seems a lot better and is able to pass some tests 2024-09-17 18:47:27 +00:00			`pdfium_text = get_page_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), page_num=2, pdf_engine="pdfium")`

			`# pdftotext_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pdftotext")`
			`# pymupdf_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pymupdf")`
Testing coherence with distilgpt2, but it doesn't work great 2024-09-17 16:58:45 +00:00
			`print("pdftotext_text", get_document_coherency(pdftotext_text))`
			`print("pymupdf_text", get_document_coherency(pymupdf_text))`
Using SmolLM, seems a lot better and is able to pass some tests 2024-09-17 18:47:27 +00:00			`print("pdfium_text", get_document_coherency(pdfium_text))`
Testing coherence with distilgpt2, but it doesn't work great 2024-09-17 16:58:45 +00:00