From 57e80aacd266d1765a8b60a55efefe42e52cc11e Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Tue, 17 Sep 2024 16:58:45 +0000
Subject: [PATCH] Testing coherence with distilgpt2, but it doesn't work great

---
 pdelfin/filter/coherency.py | 10 ----------
 tests/test_coherency.py     | 17 ++++++++++++++---
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/pdelfin/filter/coherency.py b/pdelfin/filter/coherency.py
index 67d502f..3bd72d9 100644
--- a/pdelfin/filter/coherency.py
+++ b/pdelfin/filter/coherency.py
@@ -4,16 +4,6 @@ import torch
 
 @lru_cache()
 def load_coherency_model(model_name: str = "distilgpt2"):
-    """
-    Loads the tokenizer and model, caching the result to avoid redundant loads.
-
-    Args:
-        model_name (str): The name of the pretrained model to load.
-
-    Returns:
-        tokenizer: The tokenizer associated with the model.
-        model: The pretrained causal language model.
-    """
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(model_name)
     model.eval() # Set the model to evaluation mode
diff --git a/tests/test_coherency.py b/tests/test_coherency.py
index cddfb39..7df5396 100644
--- a/tests/test_coherency.py
+++ b/tests/test_coherency.py
@@ -3,13 +3,24 @@ import os
 import unittest
 
 from pdelfin.filter.coherency import get_document_coherency
-from pdelfin.extract_text import get_document_text
+from pdelfin.extract_text import get_document_text, get_page_text
 
 
 class TestCoherencyScores(unittest.TestCase):
     def testBadOcr1(self):
         good_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "instructions_and_schematics.pdf"))
-        bad_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "handwriting_bad_ocr.pdf"))
+        ocr1_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "handwriting_bad_ocr.pdf"))
+        ocr2_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf"))
 
         print("Good", get_document_coherency(good_text))
-        print("Bad", get_document_coherency(bad_text))
\ No newline at end of file
+        print("Bad1", get_document_coherency(ocr1_text))
+        print("Bad2", get_document_coherency(ocr2_text))
+
+    def testTwoColumnMisparse(self):
+        pdftotext_text = get_page_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), page_num=2, pdf_engine="pdftotext")
+        pymupdf_text = get_page_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), page_num=2, pdf_engine="pymupdf")
+
+        print("pdftotext_text", get_document_coherency(pdftotext_text))
+        print("pymupdf_text", get_document_coherency(pymupdf_text))
+
+    
\ No newline at end of file