Add OCR quality measurement API

2026-01-06 12:11:18 +00:00 · 2020-01-17 03:10:27 -08:00 · 2020-01-17 03:10:27 -08:00 · ce97af5a79
commit ce97af5a79
parent 3831c4cd4d
5 changed files with 98 additions and 2 deletions
--- a/src/ocrmypdf/__init__.py
+++ b/src/ocrmypdf/__init__.py
@ -23,6 +23,7 @@ from .exceptions import (
    DpiError,
    EncryptedPdfError,
    ExitCode,
+    ExitCodeException,
    InputFileError,
    MissingDependencyError,
    OutputFileAccessError,
--- a/src/ocrmypdf/_sync.py
+++ b/src/ocrmypdf/_sync.py
@ -381,6 +381,7 @@ def run_pipeline(options, api=False):
            detailed_page_analysis=options.redo_ocr,
            progbar=options.progress_bar,
        )
+
        context = PDFContext(options, work_folder, origin_pdf, pdfinfo)

        # Validate options are okay for this pdf
--- a/src/ocrmypdf/api.py
+++ b/src/ocrmypdf/api.py
@ -18,11 +18,10 @@
 import logging
 import os
 import sys
-import warnings
 from contextlib import suppress
 from enum import IntEnum
 from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Dict, List

 from tqdm import tqdm

--- a/src/ocrmypdf/quality.py
+++ b/src/ocrmypdf/quality.py
@ -0,0 +1,60 @@
+# © 2020 James R. Barlow: github.com/jbarlow83
+#
+# This file is part of OCRmyPDF.
+#
+# OCRmyPDF is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# OCRmyPDF is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
+
+import re
+from typing import Iterable
+
+"""Utilities to measure OCR quality"""
+
+
+class OcrQualityDictionary:
+    """Manages a dictionary for simple OCR quality checks.
+
+    The accepted words are stored as a set in the ``dictionary`` attribute.
+    """
+
+    def __init__(self, *, wordlist: Iterable[str] = ()):
+        """Construct a dictionary from an iterable of words.
+
+        Words for which capitalization is important should be capitalized in the
+        dictionary. Words that contain spaces or other punctuation will never match.
+
+        Args:
+            wordlist: Words to accept. Duplicates are collapsed. Defaults to an
+                empty tuple, so no mutable default object is shared by calls.
+        """
+        self.dictionary = set(wordlist)
+
+    def measure_words_matched(self, ocr_text: str) -> float:
+        """Return the fraction of unique words in the OCR text that are known.
+
+        Digits, underscores and all non-word characters act as separators, and
+        only words of at least three characters are considered. A word matches
+        when it is in the dictionary exactly as written or in its lowercase
+        form; a dictionary entry containing capital letters therefore matches
+        only text with that same capitalization.
+
+        Args:
+            ocr_text: Text produced by OCR.
+
+        Returns:
+            The number of unique words that match divided by the number of
+            unique words considered, or 0.0 when no words are considered.
+        """
+        # \w also matches digits and underscores, so blank them out first to
+        # keep them from being treated as parts of words.
+        text = re.sub(r"[0-9_]+", ' ', ocr_text)
+        text_words = {w for w in re.split(r'\W+', text) if len(w) >= 3}
+        if not text_words:
+            # Nothing to score; also avoids dividing by zero.
+            return 0.0
+
+        # For an all-lowercase word the second test merely repeats the first,
+        # so no separate "has capitals" check is needed.
+        known = self.dictionary
+        matches = sum(1 for w in text_words if w in known or w.lower() in known)
+        return matches / len(text_words)
--- a/tests/test_quality.py
+++ b/tests/test_quality.py
@ -0,0 +1,35 @@
+# © 2020 James R. Barlow: github.com/jbarlow83
+#
+# This file is part of OCRmyPDF.
+#
+# OCRmyPDF is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# OCRmyPDF is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
+
+import pytest
+
+import ocrmypdf.quality as qual
+
+
+def test_quality_measurement():
+    """Check dictionary construction and the matched-word ratio."""
+    vocabulary = ["words", "words", "quick", "brown", "fox", "dog", "lazy"]
+    oqd = qual.OcrQualityDictionary(wordlist=vocabulary)
+    # "words" is listed twice, so only 6 unique entries are stored
+    assert len(oqd.dictionary) == 6
+
+    # 5 of the 10 unique words are known ("The" and "the" count separately)
+    pangram = "The quick brown fox jumps quickly over the lazy dog"
+    assert oqd.measure_words_matched(pangram) == 0.5
+
+    # Digits, punctuation and the too-short token "f" are ignored
+    noisy = "12345 10% _f  7fox -brown   | words"
+    assert oqd.measure_words_matched(noisy) == 1.0
+
+    # A repeated word is only counted once
+    repeated = "quick quick quick"
+    assert oqd.measure_words_matched(repeated) == 1.0