mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-01-06 12:11:18 +00:00
Add OCR quality measurement API
This commit is contained in:
parent
3831c4cd4d
commit
ce97af5a79
@ -23,6 +23,7 @@ from .exceptions import (
|
||||
DpiError,
|
||||
EncryptedPdfError,
|
||||
ExitCode,
|
||||
ExitCodeException,
|
||||
InputFileError,
|
||||
MissingDependencyError,
|
||||
OutputFileAccessError,
|
||||
|
||||
@ -381,6 +381,7 @@ def run_pipeline(options, api=False):
|
||||
detailed_page_analysis=options.redo_ocr,
|
||||
progbar=options.progress_bar,
|
||||
)
|
||||
|
||||
context = PDFContext(options, work_folder, origin_pdf, pdfinfo)
|
||||
|
||||
# Validate options are okay for this pdf
|
||||
|
||||
@ -18,11 +18,10 @@
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import warnings
|
||||
from contextlib import suppress
|
||||
from enum import IntEnum
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
from typing import Dict, List
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
60
src/ocrmypdf/quality.py
Normal file
60
src/ocrmypdf/quality.py
Normal file
@ -0,0 +1,60 @@
|
||||
# © 2020 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import re
|
||||
from typing import Iterable
|
||||
|
||||
"""Utilities to measure OCR quality"""
|
||||
|
||||
|
||||
class OcrQualityDictionary:
    """Manages a dictionary for simple OCR quality checks."""

    def __init__(self, *, wordlist: Iterable[str] = ()):
        """Construct a dictionary from a list of words.

        Words for which capitalization is important should be capitalized in the
        dictionary. Words that contain spaces or other punctuation will never match.

        Args:
            wordlist: Words to treat as correctly recognized. Duplicates are
                collapsed. Defaults to an empty dictionary.
        """
        # The default is an immutable tuple so no mutable object is shared
        # between calls; the words are copied into a per-instance set.
        self.dictionary = set(wordlist)

    def measure_words_matched(self, ocr_text: str) -> float:
        """Check how many unique words in the OCR text match the dictionary.

        Candidate words are maximal runs of word characters, excluding ASCII
        digits and underscores, that are at least three characters long.
        Each distinct candidate is counted once.

        A word with mixed capitalization in the dictionary is only matched by
        text with that same capitalization. A capitalized word in the text also
        matches if its lowercase form is in the dictionary.

        Args:
            ocr_text: The text produced by OCR.

        Returns:
            The number of unique candidate words found in the dictionary divided
            by the number of unique candidate words, or 0.0 if the text contains
            no candidate words.
        """
        # [^\W0-9_]+ selects runs of characters that are word characters but
        # neither ASCII digits nor underscores; everything else is a separator.
        candidates = {
            word for word in re.findall(r"[^\W0-9_]+", ocr_text) if len(word) >= 3
        }
        if not candidates:
            return 0.0

        matches = sum(
            1
            for word in candidates
            if word in self.dictionary or word.lower() in self.dictionary
        )
        return matches / len(candidates)
|
||||
35
tests/test_quality.py
Normal file
35
tests/test_quality.py
Normal file
@ -0,0 +1,35 @@
|
||||
# © 2020 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import pytest
|
||||
|
||||
import ocrmypdf.quality as qual
|
||||
|
||||
|
||||
def test_quality_measurement():
    """Exercise OcrQualityDictionary construction and match-ratio scoring."""
    vocabulary = ["words", "words", "quick", "brown", "fox", "dog", "lazy"]
    checker = qual.OcrQualityDictionary(wordlist=vocabulary)
    assert len(checker.dictionary) == 6  # 6 unique

    # Each entry pairs an OCR text sample with the match ratio it must yield.
    expectations = (
        ("The quick brown fox jumps quickly over the lazy dog", 0.5),
        ("12345 10% _f 7fox -brown | words", 1.0),
        ("quick quick quick", 1.0),
    )
    for sample, expected_ratio in expectations:
        assert checker.measure_words_matched(sample) == expected_ratio
|
||||
Loading…
x
Reference in New Issue
Block a user