Accept most of ruff's delinting

This commit is contained in:
James R. Barlow 2023-04-14 00:38:34 -07:00
parent b7eb93eb79
commit 9b8d14d16e
No known key found for this signature in database
GPG Key ID: E54A300D567E1260
47 changed files with 121 additions and 172 deletions

View File

@ -6,7 +6,6 @@ from __future__ import annotations
# This script must be edited to meet your needs.
import logging
import os
import sys
from pathlib import Path

View File

@ -1,8 +1,7 @@
# SPDX-FileCopyrightText: 2022 James R Barlow: https://github.com/jbarlow83
# SPDX-License-Identifier: MIT
"""
An example of an OCRmyPDF plugin.
"""An example of an OCRmyPDF plugin.
This plugin adds two new command line arguments
--grayscale-ocr: converts the image to grayscale before performing OCR on it

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: 2017 Enantiomerie
# SPDX-License-Identifier: MIT
"""Example OCRmyPDF for Synology NAS"""
"""Example OCRmyPDF for Synology NAS."""
from __future__ import annotations

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: 2019 James R. Barlow
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is a simple web service/HTTP wrapper for OCRmyPDF
"""This is a simple web service/HTTP wrapper for OCRmyPDF.
This may be more convenient than the command line tool for some Docker users.
Note that OCRmyPDF uses Ghostscript, which is licensed under AGPLv3+. While
@ -15,7 +15,7 @@ from __future__ import annotations
import os
import shlex
from subprocess import PIPE, run
from subprocess import run
from tempfile import TemporaryDirectory
from flask import Flask, Response, request, send_from_directory

View File

@ -184,14 +184,19 @@ module = [
]
ignore_missing_imports = true
[tool.pylint.basic]
good-names = ["i", "j", "k", "ex", "Run", "_", "e", "p", "im", "w", "h", "m", "x", "y", "a", "b", "fp", "n", "f", "s", "v", "q", "dx", "dy"]
logging-format-style = "old"
disable = ["raw-checker-failed", "bad-inline-option", "locally-disabled", "file-ignored", "suppressed-message", "useless-suppression", "deprecated-pragma", "use-symbolic-message-instead", "logging-fstring-interpolation", "missing-function-docstring", "too-few-public-methods"]
[tool.ruff]
src = ["src"]
select = ["E"]
select = [
"D", # pydocstyle
"E", # pycodestyle
"W", # pycodestyle
"F", # pyflakes
"I001", # isort
"UP", # pyupgrade
]
target-version = "py38"
[tool.ruff.isort]
known-first-party = ["ocrmypdf"]
[tool.ruff.pydocstyle]
convention = "google"

View File

@ -11,7 +11,6 @@ import os
import signal
import sys
from contextlib import suppress
from multiprocessing import set_start_method
from ocrmypdf import __version__
from ocrmypdf._plugin_manager import get_parser_options_plugins

View File

@ -51,8 +51,7 @@ class Executor(ABC):
task_arguments: Iterable | None = None,
task_finished: Callable | None = None,
) -> None:
"""
Set up parallel execution and progress reporting.
"""Set up parallel execution and progress reporting.
Args:
use_threads: If ``False``, the workload is the sort that will benefit from
@ -73,7 +72,6 @@ class Executor(ABC):
task. This runs in the parent's context, but the parameters must be
marshallable to the worker.
"""
if not task_arguments:
return # Nothing to do!
if not worker_initializer:

View File

@ -1,6 +1,6 @@
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Manage third party executables"""
"""Manage third party executables."""
from __future__ import annotations

View File

@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Interface to Ghostscript executable"""
"""Interface to Ghostscript executable."""
from __future__ import annotations

View File

@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Interface to jbig2 executable"""
"""Interface to jbig2 executable."""
from __future__ import annotations

View File

@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Interface to pngquant executable"""
"""Interface to pngquant executable."""
from __future__ import annotations

View File

@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Interface to Tesseract executable"""
"""Interface to Tesseract executable."""
from __future__ import annotations
@ -54,7 +54,7 @@ TESSERACT_THRESHOLDING_METHODS: dict[str, int] = {
class TesseractLoggerAdapter(logging.LoggerAdapter):
"Prepend [tesseract] to messages emitted from tesseract"
"Prepend [tesseract] to messages emitted from tesseract."
def process(self, msg, kwargs):
kwargs['extra'] = self.extra
@ -283,7 +283,8 @@ def page_timedout(timeout: float) -> None:
def _generate_null_hocr(output_hocr: Path, output_text: Path, image: Path) -> None:
"""Produce a .hocr file that reports no text detected on a page that is
the same size as the input image."""
the same size as the input image.
"""
with Image.open(image) as im:
w, h = im.size

View File

@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Interface to unpaper executable"""
"""Interface to unpaper executable."""
from __future__ import annotations

View File

@ -37,7 +37,6 @@ def _update_resources(*, obj, font, font_key, procset):
obj can be a page or Form XObject.
"""
resources = _ensure_dictionary(obj, Name.Resources)
fonts = _ensure_dictionary(resources, Name.Font)
if font_key is not None and font_key not in fonts:
@ -167,7 +166,6 @@ class OcrGrafter:
the font to page 1 even if page 1 doesn't use it, so we have a way to get it
back.
"""
page0 = self.pdf_base.pages[0]
_update_resources(
obj=page0, font=self.font, font_key=self.font_key, procset=self.procset
@ -200,8 +198,7 @@ class OcrGrafter:
return self.output_file
def _find_font(self, text):
"""Copy a font from the filename text into pdf_base"""
"""Copy a font from the filename text into pdf_base."""
font, font_key = None, None
possible_font_names = ('/f-0-0', '/F1')
try:
@ -234,8 +231,7 @@ class OcrGrafter:
text_rotation: int,
strip_old_text: bool,
):
"""Insert the text layer from text page 0 on to pdf_base at page_num"""
"""Insert the text layer from text page 0 on to pdf_base at page_num."""
# pylint: disable=invalid-name
log.debug("Grafting")

View File

@ -24,7 +24,7 @@ class PageNumberFilter(logging.Filter):
class TqdmConsole:
"""Wrapper to log messages in a way that is compatible with tqdm progress bar
"""Wrapper to log messages in a way that is compatible with tqdm progress bar.
This routes log messages through tqdm so that it can print them above the
progress bar, and then refresh the progress bar, rather than overwriting

View File

@ -121,7 +121,6 @@ def _pdf_guess_version(input_file: Path, search_window=1024) -> str:
Returns empty string if not found, indicating file is probably not PDF.
"""
with open(input_file, 'rb') as f:
signature = f.read(search_window)
m = re.search(br'%PDF-(\d\.\d)', signature)
@ -222,7 +221,7 @@ def _vector_page_dpi(pageinfo: PageInfo) -> int:
def get_page_dpi(pageinfo: PageInfo, options) -> Resolution:
"Get the DPI when nonsquare DPI is tolerable"
"Get the DPI when nonsquare DPI is tolerable."
xres = max(
pageinfo.dpi.x or VECTOR_PAGE_DPI,
options.oversample or 0.0,
@ -237,7 +236,7 @@ def get_page_dpi(pageinfo: PageInfo, options) -> Resolution:
def get_page_square_dpi(pageinfo: PageInfo, options) -> Resolution:
"Get the DPI when we require xres == yres, scaled to physical units"
"Get the DPI when we require xres == yres, scaled to physical units."
xres = pageinfo.dpi.x or 0.0
yres = pageinfo.dpi.y or 0.0
userunit = float(pageinfo.userunit) or 1.0
@ -253,7 +252,7 @@ def get_page_square_dpi(pageinfo: PageInfo, options) -> Resolution:
def get_canvas_square_dpi(pageinfo: PageInfo, options) -> Resolution:
"""Get the DPI when we require xres == yres, in Postscript units"""
"""Get the DPI when we require xres == yres, in Postscript units."""
units = float(
max(
(pageinfo.dpi.x) or VECTOR_PAGE_DPI,
@ -358,9 +357,7 @@ def rasterize_preview(input_file: Path, page_context: PageContext) -> Path:
def describe_rotation(page_context: PageContext, orient_conf, correction: int) -> str:
"""
Describe the page rotation we are going to perform.
"""
"""Describe the page rotation we are going to perform."""
direction = {0: '', 90: '', 180: '', 270: ''}
turns = {0: ' ', 90: '', 180: '', 270: ''}
@ -401,7 +398,6 @@ def get_orientation_correction(preview: Path, page_context: PageContext) -> int:
which points it (hopefully) upright. _graft.py takes care of orienting
the image and text layers.
"""
orient_conf = page_context.plugin_manager.hook.get_ocr_engine().get_orientation(
preview, page_context.options
)
@ -514,10 +510,11 @@ def preprocess_clean(input_file: Path, page_context: PageContext) -> Path:
def create_ocr_image(image: Path, page_context: PageContext) -> Path:
"""Create the image we send for OCR. May not be the same as the display
image depending on preprocessing. This image will never be shown to the
user."""
"""Create the image we send for OCR.
Might not be the same as the display image depending on preprocessing.
This image will never be shown to the user.
"""
output_file = page_context.get_path('ocr.png')
options = page_context.options
with Image.open(image) as im:

View File

@ -251,8 +251,7 @@ def worker_init(max_pixels: int) -> None:
def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]:
"""Execute the pipeline concurrently"""
"""Execute the pipeline concurrently."""
# Run exec_page_sync on every page context
options = context.options
max_workers = min(len(context.pdfinfo), options.jobs)
@ -316,8 +315,7 @@ def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]:
def configure_debug_logging(
log_filename: Path, prefix: str = ''
) -> logging.FileHandler:
"""
Create a debug log file at a specified location.
"""Create a debug log file at a specified location.
Arguments:
log_filename: Where to put the log file.

View File

@ -86,7 +86,6 @@ def configure_logging(
Returns:
The toplevel logger for ocrmypdf (or the root logger, if we are managing it).
"""
prefix = '' if manage_root_logger else 'ocrmypdf'
log = logging.getLogger(prefix)
@ -277,6 +276,8 @@ def ocr( # pylint: disable=unused-argument
When a stream is used as output, whether via a writable object or
``"-"``, some final validation steps are not performed (we do not read
back the stream after it is written).
Raises:
ocrmypdf.MissingDependencyError: If a required dependency program is missing or
was not found on PATH.

View File

@ -30,7 +30,7 @@ WorkerInit = Callable[[Queue, UserInit, int], None]
def log_listener(q: Queue):
"""Listen to the worker processes and forward the messages to logging
"""Listen to the worker processes and forward the messages to logging.
For simplicity this is a thread rather than a process. Only one process
should actually write to sys.stderr or whatever we're using, so if this is
@ -39,7 +39,6 @@ def log_listener(q: Queue):
See:
https://docs.python.org/3/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
"""
while True:
try:
record = q.get()
@ -59,8 +58,7 @@ def process_sigbus(*args):
def process_init(q: Queue, user_init: UserInit, loglevel) -> None:
"""Initialize a process pool worker"""
"""Initialize a process pool worker."""
# Ignore SIGINT (our parent process will kill us gracefully)
signal.signal(signal.SIGINT, signal.SIG_IGN)

View File

@ -8,7 +8,6 @@ import logging
from ocrmypdf import hookimpl
from ocrmypdf._exec import ghostscript
from ocrmypdf._validation import HOCR_OK_LANGS
from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.subprocess import check_external_program

View File

@ -15,7 +15,7 @@ T = TypeVar('T', int, float)
def numeric(basetype: Callable[[Any], T], min_: T | None = None, max_: T | None = None):
"""Validator for numeric params"""
"""Validator for numeric params."""
min_ = basetype(min_) if min_ is not None else None
max_ = basetype(max_) if max_ is not None else None
@ -46,7 +46,7 @@ def str_to_int(mapping: Mapping[str, int]):
class ArgumentParser(argparse.ArgumentParser):
"""Override parser's default behavior of calling sys.exit()
"""Override parser's default behavior of calling sys.exit().
https://stackoverflow.com/questions/5943249/python-argparse-and-controlling-overriding-the-exit-status-code

View File

@ -73,8 +73,7 @@ class ConnectionLogHandler(logging.handlers.QueueHandler):
def process_loop(
conn: Connection, user_init: Callable[[], None], loglevel, task, task_args
):
"""Initialize a process pool worker"""
"""Initialize a process pool worker."""
# Install SIGBUS handler (so our parent process can abort somewhat gracefully)
with suppress(AttributeError): # Windows and Cygwin do not have SIGBUS
# Windows and Cygwin do not have pthread_sigmask or SIGBUS

View File

@ -108,7 +108,7 @@ class Resolution(Generic[T]):
class NeverRaise(Exception):
"""An exception that is never raised"""
"""An exception that is never raised."""
def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike):
@ -170,7 +170,7 @@ def monotonic(seq: Sequence) -> bool:
def page_number(input_file: os.PathLike) -> int:
"""Get one-based page number implied by filename (000002.pdf -> 2)"""
"""Get one-based page number implied by filename (000002.pdf -> 2)."""
return int(os.path.basename(os.fspath(input_file))[0:6])

View File

@ -99,11 +99,9 @@ class HocrTransformError(Exception):
class HocrTransform:
"""
A class for converting documents from the hOCR format.
"""A class for converting documents from the hOCR format.
For details of the hOCR format, see:
http://kba.cloud/hocr-spec/
http://kba.cloud/hocr-spec/.
"""
box_pattern = re.compile(r'bbox((\s+\d+){4})')
@ -143,9 +141,7 @@ class HocrTransform:
raise HocrTransformError("hocr file is missing page dimensions")
def __str__(self): # pragma: no cover
"""
Return the textual content of the HTML body
"""
"""Return the textual content of the HTML body."""
if self.hocr is None:
return ''
body = self.hocr.find(self._child_xpath('body'))
@ -155,9 +151,7 @@ class HocrTransform:
return ''
def _get_element_text(self, element: Element):
"""
Return the textual content of the element and its children
"""
"""Return the textual content of the element and its children."""
text = ''
if element.text is not None:
text += element.text
@ -169,10 +163,7 @@ class HocrTransform:
@classmethod
def element_coordinates(cls, element: Element) -> Rect:
"""
Returns a tuple containing the coordinates of the bounding box around
an element
"""
"""Get coordinates of the bounding box around an element."""
out = Rect._make(0 for _ in range(4))
if 'title' in element.attrib:
matches = cls.box_pattern.search(element.attrib['title'])
@ -183,9 +174,7 @@ class HocrTransform:
@classmethod
def baseline(cls, element: Element) -> tuple[float, float]:
"""
Returns a tuple containing the baseline slope and intercept.
"""
"""Get baseline's slope and intercept."""
if 'title' in element.attrib:
matches = cls.baseline_pattern.search(element.attrib['title'])
if matches:
@ -193,9 +182,7 @@ class HocrTransform:
return (0.0, 0.0)
def pt_from_pixel(self, pxl) -> Rect:
"""
Returns the quantity in PDF units (pt) given quantity in pixels
"""
"""Returns the quantity in PDF units (pt) given quantity in pixels."""
return Rect._make((c / self.dpi * inch) for c in pxl)
def _child_xpath(self, html_tag: str, html_class: str | None = None) -> str:
@ -206,11 +193,7 @@ class HocrTransform:
@classmethod
def replace_unsupported_chars(cls, s: str) -> str:
"""
Given an input string, returns the corresponding string that:
* is available in the Helvetica facetype
* does not contain any ligature (to allow easy search in the PDF file)
"""
"""Replaces characters with those available in the Helvetica typeface."""
return s.translate(cls.ligatures)
def topdown_position(self, element):
@ -231,8 +214,8 @@ class HocrTransform:
invisible_text: bool = False,
interword_spaces: bool = False,
) -> None:
"""
Creates a PDF file with an image superimposed on top of the text.
"""Creates a PDF file with an image superimposed on top of the text.
Text is positioned according to the bounding box of the lines in
the hOCR file.
The image need not be identical to the image used to create the hOCR

View File

@ -230,7 +230,7 @@ def extract_images(
options,
extract_fn: Callable[..., XrefExt | None],
) -> Iterator[tuple[int, XrefExt]]:
"""Extract image using extract_fn
"""Extract image using extract_fn.
Enumerate images on each page, lookup their xref/ID number in the PDF.
Exclude images that are soft masks (i.e. alpha transparency related).
@ -244,7 +244,6 @@ def extract_images(
it does a tuple should be returned: (xref, ext) where .ext is the file
extension. extract_fn must also extract the file it finds interesting.
"""
include_xrefs: MutableSet[Xref] = set()
exclude_xrefs: MutableSet[Xref] = set()
pageno_for_xref = {}
@ -289,8 +288,7 @@ def extract_images(
def extract_images_generic(
pike: Pdf, root: Path, options
) -> tuple[list[Xref], list[Xref]]:
"""Extract any >=2bpp image we think we can improve"""
"""Extract any >=2bpp image we think we can improve."""
jpegs = []
pngs = []
for _, xref_ext in extract_images(pike, root, options, extract_image_generic):
@ -304,8 +302,7 @@ def extract_images_generic(
def extract_images_jbig2(pike: Pdf, root: Path, options) -> dict[int, list[XrefExt]]:
"""Extract any bitonal image that we think we can improve as JBIG2"""
"""Extract any bitonal image that we think we can improve as JBIG2."""
jbig2_groups = defaultdict(list)
for pageno, xref_ext in extract_images(pike, root, options, extract_image_jbig2):
group = pageno // options.jbig2_page_group_size
@ -318,7 +315,7 @@ def extract_images_jbig2(pike: Pdf, root: Path, options) -> dict[int, list[XrefE
def _produce_jbig2_images(
jbig2_groups: dict[int, list[XrefExt]], root: Path, options, executor: Executor
) -> None:
"""Produce JBIG2 images from their groups"""
"""Produce JBIG2 images from their groups."""
def jbig2_group_args(root: Path, groups: dict[int, list[XrefExt]]):
for group, xref_exts in groups.items():
@ -674,7 +671,7 @@ def main(infile, outfile, level, jobs=1):
from tempfile import TemporaryDirectory # pylint: disable=import-outside-toplevel
class OptimizeOptions:
"""Emulate ocrmypdf's options"""
"""Emulate ocrmypdf's options."""
def __init__(
self, input_file, jobs, optimize_, jpeg_quality, png_quality, jb2lossy

View File

@ -1,9 +1,7 @@
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""
Utilities for PDF/A production and confirmation with Ghostscript.
"""
"""Utilities for PDF/A production and confirmation with Ghostscript."""
from __future__ import annotations
@ -75,7 +73,7 @@ def _make_postscript(icc_name: str, icc_data: bytes, colors: int) -> Iterator[st
def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):
"""Create a Postscript PDFMARK file for Ghostscript PDF/A conversion
"""Create a Postscript PDFMARK file for Ghostscript PDF/A conversion.
pdfmark is an extension to the Postscript language that describes some PDF
features like bookmarks and annotations. It was originally specified Adobe
@ -118,7 +116,6 @@ def file_claims_pdfa(filename: Path):
This only checks if the XMP metadata contains a PDF/A marker. It does not
do full PDF/A validation.
"""
with pikepdf.open(filename) as pdf:
pdfmeta = pdf.open_metadata()
if not pdfmeta.pdfa_status:

View File

@ -164,7 +164,7 @@ class TextMarker:
def _normalize_stack(graphobjs):
"""Convert runs of qQ's in the stack into single graphobjs"""
"""Convert runs of qQ's in the stack into single graphobjs."""
for operands, operator in graphobjs:
operator = str(operator)
if re.match(r'Q*q+$', operator): # Zero or more Q, one or more q
@ -200,7 +200,6 @@ def _interpret_contents(contentstream: Object, initial_shorthand=UNIT_SQUARE):
undefined in the spec, but we just pretend nothing happened and leave the
CTM unchanged.
"""
stack = []
ctm = PdfMatrix(initial_shorthand)
xobject_settings: list[XobjectSettings] = []
@ -307,7 +306,6 @@ def _get_dpi(ctm_shorthand, image_size) -> Resolution:
/MediaBox.
"""
a, b, c, d, _, _ = ctm_shorthand # pylint: disable=invalid-name
# Calculate the width and height of the image in PDF units
@ -451,8 +449,7 @@ class ImageInfo:
def _find_inline_images(contentsinfo: ContentsInfo) -> Iterator[ImageInfo]:
"Find inline images in the contentstream"
"Find inline images in the contentstream."
for n, inline in enumerate(contentsinfo.inline_images):
yield ImageInfo(
name=f'inline-{n:02d}', shorthand=inline.shorthand, inline=inline.iimage
@ -460,7 +457,7 @@ def _find_inline_images(contentsinfo: ContentsInfo) -> Iterator[ImageInfo]:
def _image_xobjects(container) -> Iterator[tuple[Object, str]]:
"""Search for all XObject-based images in the container
"""Search for all XObject-based images in the container.
Usually the container is a page, but it could also be a Form XObject
that contains images. Filter out the Form XObjects which are dealt with
@ -471,7 +468,6 @@ def _image_xobjects(container) -> Iterator[tuple[Object, str]]:
since the object does not know its own name.
"""
if '/Resources' not in container:
return
resources = container['/Resources']
@ -488,14 +484,13 @@ def _image_xobjects(container) -> Iterator[tuple[Object, str]]:
def _find_regular_images(
container: Object, contentsinfo: ContentsInfo
) -> Iterator[ImageInfo]:
"""Find images stored in the container's /Resources /XObject
"""Find images stored in the container's /Resources /XObject.
Usually the container is a page, but it could also be a Form XObject
that contains images.
Generates images with their DPI at time of drawing.
"""
for pdfimage, xobj in _image_xobjects(container):
if xobj not in contentsinfo.name_index:
continue
@ -512,7 +507,7 @@ def _find_regular_images(
def _find_form_xobject_images(pdf: Pdf, container: Object, contentsinfo: ContentsInfo):
"""Find any images that are in Form XObjects in the container
"""Find any images that are in Form XObjects in the container.
The container may be a page, or a parent Form XObject.
@ -546,7 +541,7 @@ def _find_form_xobject_images(pdf: Pdf, container: Object, contentsinfo: Content
def _process_content_streams(
*, pdf: Pdf, container: Object, shorthand=None
) -> Iterator[VectorMarker | TextMarker | ImageInfo]:
"""Find all individual instances of images drawn in the container
"""Find all individual instances of images drawn in the container.
Usually the container is a page, but it may also be a Form XObject.
@ -563,7 +558,6 @@ def _process_content_streams(
downsampling.
"""
if container.get('/Type') == '/Page' and '/Contents' in container:
initial_shorthand = shorthand or UNIT_SQUARE
elif container.get('/Type') == '/XObject' and container['/Subtype'] == '/Form':
@ -595,8 +589,7 @@ def _process_content_streams(
def _page_has_text(text_blocks: Iterable[FloatRect], page_width, page_height) -> bool:
"""Smarter text detection that ignores text in margins"""
"""Smarter text detection that ignores text in margins."""
pw, ph = float(page_width), float(page_height) # pylint: disable=invalid-name
margin_ratio = 0.125
@ -608,10 +601,9 @@ def _page_has_text(text_blocks: Iterable[FloatRect], page_width, page_height) ->
)
def rects_intersect(a: FloatRect, b: FloatRect) -> bool:
"""
Where (a,b) are 4-tuple rects (left-0, top-1, right-2, bottom-3)
"""Where (a,b) are 4-tuple rects (left-0, top-1, right-2, bottom-3)
https://stackoverflow.com/questions/306316/determine-if-two-rectangles-overlap-each-other
Formula assumes all boxes are in first quadrant
Formula assumes all boxes are in first quadrant.
"""
return a[0] < b[2] and a[2] > b[0] and a[1] > b[3] and a[3] < b[1]
@ -624,7 +616,7 @@ def _page_has_text(text_blocks: Iterable[FloatRect], page_width, page_height) ->
def simplify_textboxes(miner, textbox_getter) -> Iterator[TextboxInfo]:
"""Extract only limited content from text boxes
"""Extract only limited content from text boxes.
We do this to save memory and ensure that our objects are pickleable.
"""
@ -910,7 +902,7 @@ DEFAULT_EXECUTOR = SerialExecutor()
class PdfInfo:
"""Get summary information about a PDF"""
"""Get summary information about a PDF."""
def __init__(
self,

View File

@ -63,7 +63,7 @@ def pdftype3font__pscript5_get_ascent(self):
class LTStateAwareChar(LTChar):
"""A subclass of LTChar that tracks text render mode at time of drawing"""
"""A subclass of LTChar that tracks text render mode at time of drawing."""
__slots__ = (
'rendermode',
@ -111,7 +111,7 @@ class LTStateAwareChar(LTChar):
self.rendermode = textstate.render
def is_compatible(self, obj):
"""Check if characters can be combined into a textline
"""Check if characters can be combined into a textline.
We consider characters compatible if:
- the Unicode mapping is known, and both have the same render mode
@ -146,7 +146,7 @@ class LTStateAwareChar(LTChar):
class TextPositionTracker(PDFLayoutAnalyzer):
"""A page layout analyzer that pays attention to text visibility"""
"""A page layout analyzer that pays attention to text visibility."""
def __init__(self, rsrcmgr, pageno=1, laparams=None):
super().__init__(rsrcmgr, pageno, laparams)

View File

@ -99,6 +99,8 @@ def check_options(options: Namespace) -> None:
ocrmypdf.exceptions.ExitCodeException: If options are not acceptable
and the application should terminate gracefully with an informative
message and error code.
Note:
This hook will be called from the main process, and may modify global state
before child worker processes are forked.
@ -127,6 +129,8 @@ def get_executor(progressbar_class) -> Executor:
Note:
This hook will be called from the main process, and may modify global state
before child worker processes are forked.
Note:
This is a :ref:`firstresult hook<firstresult>`.
"""
@ -159,7 +163,6 @@ def get_progressbar_class():
Here is how OCRmyPDF will use the progress bar:
Example:
pbar_class = pm.hook.get_progressbar_class()
with pbar_class(**tqdm_kwargs) as pbar:
...
@ -181,6 +184,8 @@ def validate(pdfinfo: PdfInfo, options: Namespace) -> None:
ocrmypdf.exceptions.ExitCodeException: If options or pdfinfo are not acceptable
and the application should terminate gracefully with an informative
message and error code.
Note:
This hook will be called from the main process, and may modify global state
before child worker processes are forked.
@ -218,6 +223,8 @@ def rasterize_pdf_page(
Note:
This hook will be called from child processes. Modifying global state
will not affect the main process or other child processes.
Note:
This is a :ref:`firstresult hook<firstresult>`.
"""
@ -245,6 +252,8 @@ def filter_ocr_image(page: PageContext, image: Image.Image) -> Image.Image:
Note:
This hook will be called from child processes. Modifying global state
will not affect the main process or other child processes.
Note:
This is a :ref:`firstresult hook<firstresult>`.
"""
@ -281,6 +290,8 @@ def filter_page_image(page: PageContext, image_filename: Path) -> Path:
Note:
This hook will be called from child processes. Modifying global state
will not affect the main process or other child processes.
Note:
This is a :ref:`firstresult hook<firstresult>`.
"""
@ -323,6 +334,8 @@ def filter_pdf_page(page: PageContext, image_filename: Path, output_pdf: Path) -
Note:
This hook will be called from child processes. Modifying global state
will not affect the main process or other child processes.
Note:
This is a :ref:`firstresult hook<firstresult>`.
"""
@ -381,7 +394,8 @@ class OcrEngine(ABC):
"""Returns the set of all languages that are supported by the engine.
Languages are typically given in 3-letter ISO 639-2 codes, but actually
can be any value understood by the OCR engine."""
can be any value understood by the OCR engine.
"""
@staticmethod
@abstractmethod
@ -474,7 +488,7 @@ def generate_pdfa(
Note:
This is a :ref:`firstresult hook<firstresult>`.
See also:
See Also:
https://github.com/tqdm/tqdm
"""

View File

@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Utilities to measure OCR quality"""
"""Utilities to measure OCR quality."""
from __future__ import annotations

View File

@ -1,6 +1,6 @@
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Wrappers to manage subprocess calls"""
"""Wrappers to manage subprocess calls."""
from __future__ import annotations
@ -34,7 +34,7 @@ def run(
check: bool = False,
**kwargs,
) -> CompletedProcess:
"""Wrapper around :py:func:`subprocess.run`
"""Wrapper around :py:func:`subprocess.run`.
The main purpose of this wrapper is to log subprocess output in an orderly
fashion that identifies the responsible subprocess. An additional
@ -141,7 +141,7 @@ def get_version(
regex=r'(\d+(\.\d+)*)',
env: OsEnviron | None = None,
) -> str:
"""Get the version of the specified program
"""Get the version of the specified program.
Arguments:
program: The program to version check.
@ -323,7 +323,6 @@ def check_external_program(
version_parser: A class that should be used to parse and compare version
numbers. Used when version numbers do not follow standard conventions.
"""
try:
found_version = version_checker()
except (CalledProcessError, FileNotFoundError) as e:

View File

@ -169,8 +169,7 @@ SHIMS = [
def fix_windows_args(program: str, args, env):
"""Adjust our desired program and command line arguments for use on Windows"""
"""Adjust our desired program and command line arguments for use on Windows."""
# If we are running a .py on Windows, ensure we call it with this Python
# (to support test suite shims)
if program.lower().endswith('.py'):

View File

@ -3,12 +3,10 @@
from __future__ import annotations
import os
import platform
import sys
from pathlib import Path
from subprocess import PIPE, CompletedProcess, run
from typing import List
from subprocess import CompletedProcess, run
import pytest
@ -71,10 +69,13 @@ def outtxt(tmp_path) -> Path:
@pytest.fixture(scope="function")
def no_outpdf(tmp_path) -> Path:
"""This just documents the fact that a test is not expected to produce
"""Document fact that a test is not expected to produce output.
This just documents the fact that a test is not expected to produce
output. Unfortunately an assertion failure inside a test fixture produces
an error rather than a test failure, so no testing is done. It's up to
the test to confirm that no output file was created."""
the test to confirm that no output file was created.
"""
return tmp_path / 'no_output.pdf'
@ -110,7 +111,6 @@ def run_ocrmypdf_api(input_file: Path, output_file: Path, *args) -> ExitCode:
The return code must always be checked or the test may declare a failure
to be a pass.
"""
api_args = [str(input_file), str(output_file)] + [
str(arg) for arg in args if arg is not None
]
@ -128,7 +128,6 @@ def run_ocrmypdf(
If an exception is thrown this fact will be returned as part of the result
text and return code rather than exception objects.
"""
p_args = (
[sys.executable, '-m', 'ocrmypdf']
+ [str(arg) for arg in args if arg is not None]

View File

@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Tesseract bad utf8
"""Tesseract bad utf8.
In some cases, some versions of Tesseract can output binary gibberish or data
that is not UTF-8 compatible, so we are forced to check that we can convert it

View File

@ -1,6 +1,6 @@
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Cache output of tesseract to speed up test suite
"""Cache output of tesseract to speed up test suite.
The cache is keyed by the input test file. The input arguments are slugged
into a hideous filename that more or less represents them literally. Joined

View File

@ -1,6 +1,6 @@
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Tesseract no-op/fixed rotate plugin
"""Tesseract no-op/fixed rotate plugin.
To quickly run tests where getting OCR output is not necessary and we want to test
the rotation pipeline.

View File

@ -1,6 +1,6 @@
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Tesseract no-op plugin
"""Tesseract no-op plugin.
To quickly run tests where getting OCR output is not necessary.

View File

@ -3,8 +3,6 @@
from __future__ import annotations
import pytest
from ocrmypdf.helpers import check_pdf

View File

@ -4,7 +4,7 @@
from __future__ import annotations
import os
from subprocess import PIPE, run
from subprocess import run
import pytest

View File

@ -6,7 +6,6 @@ from __future__ import annotations
from unittest.mock import patch
import pikepdf
import pytest
import ocrmypdf

View File

@ -5,8 +5,6 @@ from __future__ import annotations
import logging
import pytest
from ocrmypdf._sync import configure_debug_logging

View File

@ -446,7 +446,7 @@ def test_linearized_pdf_and_indirect_object(resources, outpdf):
def test_very_high_dpi(resources, outpdf):
"Checks for a Decimal quantize error with high DPI, etc"
"Checks for a Decimal quantize error with high DPI, etc."
check_ocrmypdf(
resources / '2400dpi.pdf',
outpdf,

View File

@ -231,17 +231,6 @@ def test_xml_metadata_preserved(
'pdf:keywords',
]
acquired_properties = ['dc:format']
might_change_properties = [
'dc:date',
'pdf:pdfversion',
'pdf:Producer',
'xmp:CreateDate',
'xmp:ModifyDate',
'xmp:MetadataDate',
'xmp:CreatorTool',
'xmpMM:DocumentId',
'xmpMM:DnstanceId',
]
# Cleanup messy data structure
# Top level is key-value mapping of namespaces to keys under namespace,

View File

@ -3,8 +3,6 @@
from __future__ import annotations
import pytest
from ocrmypdf import quality as qual

View File

@ -11,12 +11,13 @@ from os import fspath
import img2pdf
import pikepdf
import pytest
from PIL import Image, ImageChops
from reportlab.pdfgen.canvas import Canvas
from ocrmypdf._exec import ghostscript
from ocrmypdf._plugin_manager import get_plugin_manager
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution
from ocrmypdf.pdfinfo import PdfInfo
from PIL import Image, ImageChops
from reportlab.pdfgen.canvas import Canvas
from .conftest import check_ocrmypdf, run_ocrmypdf
@ -152,13 +153,14 @@ def test_autorotate_threshold(threshold, op, comparison_threshold, resources, ou
def test_rotated_skew_timeout(resources, outpdf):
"""This document contains an image that is rotated 90 into place with a
"""Check rotated skew timeout.
This document contains an image that is rotated 90 into place with a
/Rotate tag and intentionally skewed by altering the transformation matrix.
This tests for a bug where the combination of preprocessing and a tesseract
timeout produced a page whose dimensions did not match the original's.
"""
input_file = resources / 'rotated_skew.pdf'
in_pageinfo = PdfInfo(input_file)[0]

View File

@ -4,13 +4,10 @@
from __future__ import annotations
import os
import sys
from pathlib import Path
from subprocess import DEVNULL, PIPE, Popen, run
from subprocess import DEVNULL, PIPE, run
import pytest
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.helpers import check_pdf
from .conftest import run_ocrmypdf

View File

@ -8,14 +8,13 @@ from os import fspath
from unittest.mock import patch
import pytest
from PIL import Image
from ocrmypdf._exec import unpaper
from ocrmypdf._plugin_manager import get_parser_options_plugins
from ocrmypdf._validation import check_options
from ocrmypdf.exceptions import ExitCode, MissingDependencyError
from .conftest import check_ocrmypdf, have_unpaper, ocrmypdf_exec, run_ocrmypdf
from .conftest import check_ocrmypdf, have_unpaper, run_ocrmypdf
# pylint: disable=redefined-outer-name