Add OCR quality measurement API

This commit is contained in:
James R. Barlow 2020-01-17 03:10:27 -08:00
parent 3831c4cd4d
commit ce97af5a79
5 changed files with 98 additions and 2 deletions

View File

@ -23,6 +23,7 @@ from .exceptions import (
DpiError,
EncryptedPdfError,
ExitCode,
ExitCodeException,
InputFileError,
MissingDependencyError,
OutputFileAccessError,

View File

@ -381,6 +381,7 @@ def run_pipeline(options, api=False):
detailed_page_analysis=options.redo_ocr,
progbar=options.progress_bar,
)
context = PDFContext(options, work_folder, origin_pdf, pdfinfo)
# Validate options are okay for this pdf

View File

@ -18,11 +18,10 @@
import logging
import os
import sys
import warnings
from contextlib import suppress
from enum import IntEnum
from pathlib import Path
from typing import Dict, List, Optional
from typing import Dict, List
from tqdm import tqdm

60
src/ocrmypdf/quality.py Normal file
View File

@ -0,0 +1,60 @@
# © 2020 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import re
from typing import Iterable
"""Utilities to measure OCR quality"""
class OcrQualityDictionary:
    """Manages a dictionary for simple OCR quality checks."""

    def __init__(self, *, wordlist: Iterable[str] = ()):
        """Construct a dictionary from a list of words.

        Words for which capitalization is important should be capitalized in the
        dictionary. Words that contain spaces or other punctuation will never match.

        Args:
            wordlist: Words to treat as correctly recognized. Duplicates are
                collapsed. Defaults to an empty collection.
        """
        # The default is an immutable tuple (not a list) so that no mutable
        # object is shared between calls.
        self.dictionary = set(wordlist)

    def measure_words_matched(self, ocr_text: str) -> float:
        """Check how many unique words in the OCR text match a dictionary.

        Digits, underscores and all other non-word characters are treated as
        separators, and only unique words of at least three characters are
        counted.

        Words with mixed capitalization are only considered a match if the test
        word matches that capitalization: a text word matches when it appears
        in the dictionary verbatim, or when its lowercase form does.

        Args:
            ocr_text: The text produced by OCR.

        Returns:
            Number of unique words that match / number of unique words
            considered, in the range 0.0 to 1.0. Returns 0.0 when the text
            contains no words to consider.
        """
        # Digits and underscores count as word characters for ``\W``, so they
        # are blanked out separately before stripping the remaining punctuation.
        text = re.sub(r"[0-9_]+", ' ', ocr_text)
        text = re.sub(r'\W+', ' ', text)
        text_words = {w for w in re.split(r'\s+', text) if len(w) >= 3}
        if not text_words:
            # Nothing to score; also avoids dividing by zero.
            return 0.0

        known = self.dictionary
        # For an all-lowercase word, ``w.lower()`` is ``w`` itself, so the
        # second test only adds matches for words containing capitals.
        matches = sum(1 for w in text_words if w in known or w.lower() in known)
        return matches / len(text_words)

35
tests/test_quality.py Normal file
View File

@ -0,0 +1,35 @@
# © 2020 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import pytest
import ocrmypdf.quality as qual
def test_quality_measurement():
    """Check dictionary de-duplication and the matched-word ratio."""
    vocabulary = ["words", "words", "quick", "brown", "fox", "dog", "lazy"]
    oqd = qual.OcrQualityDictionary(wordlist=vocabulary)
    assert len(oqd.dictionary) == 6  # 6 unique

    # Each case pairs an OCR text with the ratio the dictionary should report.
    cases = [
        # Half of the unique words of three or more letters are in the dictionary.
        ("The quick brown fox jumps quickly over the lazy dog", 0.5),
        # Digits, underscores and punctuation are expected to be ignored.
        ("12345 10% _f 7fox -brown | words", 1.0),
        # Repeated words are expected to be counted once.
        ("quick quick quick", 1.0),
    ]
    for ocr_text, expected_ratio in cases:
        assert oqd.measure_words_matched(ocr_text) == expected_ratio