docs: improve API docs

This commit is contained in:
James R. Barlow 2021-01-26 01:40:40 -08:00
parent ecb0109d79
commit 1084724937
No known key found for this signature in database
GPG Key ID: E54A300D567E1260
6 changed files with 85 additions and 14 deletions

View File

@ -5,6 +5,15 @@ API Reference
This page summarizes the rest of the public API. Generally speaking this
should mainly be of interest to plugin developers.
ocrmypdf
========
.. autoclass:: ocrmypdf.PageContext
:members:
.. autoclass:: ocrmypdf.PdfContext
:members:
ocrmypdf.exceptions
===================
@ -17,6 +26,9 @@ ocrmypdf.helpers
.. automodule:: ocrmypdf.helpers
:members:
:noindex: deprecated
.. autodecorator:: deprecated
ocrmypdf.hocrtransform
======================

View File

@ -65,6 +65,27 @@ similar to ``pytest`` packages such as ``pytest-cov`` (the package) and
``ocrmypdf-`` (for the package name on PyPI) and ``ocrmypdf_`` (for the
module), just like pytest plugins.
Setuptools plugins
==================
You can also create a plugin that OCRmyPDF will always automatically load if both are
installed in the same virtual environment, using a setuptools entry point.
Your package's ``setup.py`` would need to contain the following, for a plugin
named ``ocrmypdf-exampleplugin``:
.. code-block:: python
# sample ./setup.py file
from setuptools import setup
setup(
name="ocrmypdf-exampleplugin",
packages=["exampleplugin"],
# the following makes a plugin available to OCRmyPDF
entry_points={"ocrmypdf": ["exampleplugin = exampleplugin.pluginmodule"]},
)
Plugin requirements
===================

View File

@ -14,11 +14,19 @@ from io import IOBase
from pathlib import Path
from typing import Iterator
from pluggy import PluginManager
from ocrmypdf.pdfinfo import PdfInfo
from ocrmypdf.pdfinfo.info import PageInfo
class PdfContext:
"""Holds our context for a particular run of the pipeline"""
"""Holds the context for a particular run of the pipeline."""
options: Namespace #: The specified options for processing this PDF.
origin: Path #: The filename of the original input file.
pdfinfo: PdfInfo #: Detailed data for this PDF.
plugin_manager: PluginManager #: PluginManager for processing the current PDF.
def __init__(
self,
@ -35,21 +43,33 @@ class PdfContext:
self.plugin_manager = plugin_manager
def get_path(self, name: str) -> Path:
"""Generate a ``Path`` for an intermediate file involved in processing.
The path will be in a temporary folder that is common for all processing
of this particular PDF.
"""
return self.work_folder / name
def get_page_contexts(self) -> Iterator['PageContext']:
"""Get all ``PageContext`` for this PDF."""
npages = len(self.pdfinfo)
for n in range(npages):
yield PageContext(self, n)
class PageContext:
"""Holds our context for a page
"""Holds our context for a page.
Must be picklable, so stores only intrinsic/simple data elements or those
capable of their serializing themselves via __getstate__.
capable of serializing themselves via ``__getstate__``.
"""
options: Namespace #: The specified options for processing this PDF.
origin: Path #: The filename of the original input file.
pageno: int #: This page number (zero-based).
pageinfo: PageInfo #: Information on this page.
plugin_manager: PluginManager #: PluginManager for processing the current PDF.
def __init__(self, pdf_context: PdfContext, pageno):
self.work_folder = pdf_context.work_folder
self.origin = pdf_context.origin
@ -59,6 +79,11 @@ class PageContext:
self.plugin_manager = pdf_context.plugin_manager
def get_path(self, name: str) -> Path:
"""Generate a ``Path`` for a file that is part of processing this page.
The path will be based in a common temporary folder and have a prefix based
on the page number.
"""
return self.work_folder / ("%06d_%s" % (self.pageno + 1, name))
def __getstate__(self):

View File

@ -63,8 +63,13 @@ class NeverRaise(Exception):
def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike):
"""
Helper function: relinks soft symbolic link if necessary
"""Create a symbolic link at ``soft_link_name``, which references ``input_file``.
Think of this as copying ``input_file`` to ``soft_link_name`` with less overhead.
Use symlinks safely. Self-linking loops are prevented. On Windows, file copy is
used since symlinks may require administrator privileges. An existing link at the
destination is removed.
"""
input_file = os.fspath(input_file)
soft_link_name = os.fspath(soft_link_name)
@ -72,8 +77,8 @@ def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike):
# Guard against soft linking to oneself
if input_file == soft_link_name:
log.warning(
"No symbolic link made. You are using "
"the original data directory as the working directory."
"No symbolic link created. You are using the original data directory "
"as the working directory."
)
return
@ -114,7 +119,7 @@ def is_iterable_notstr(thing: Any) -> bool:
def monotonic(L: Sequence) -> bool:
"""Does list increase monotonically?"""
"""Does this sequence increase monotonically?"""
return all(b > a for a, b in zip(L, L[1:]))
@ -173,7 +178,8 @@ def is_file_writable(test_file: os.PathLike) -> bool:
def check_pdf(input_file: Path) -> bool:
"""Check if a PDF complies with the PDF specification.
Checks for proper formatting and proper linearization.
Checks for proper formatting and proper linearization. Uses pikepdf (which in
turn uses QPDF) to perform the checks.
"""
pdf = None
try:
@ -217,7 +223,7 @@ def check_pdf(input_file: Path) -> bool:
def clamp(n, smallest, largest): # mypy doesn't understand types for this
"""Clamps the value of n to between smallest and largest."""
"""Clamps the value of ``n`` to between ``smallest`` and ``largest``."""
return max(smallest, min(n, largest))
@ -235,7 +241,7 @@ def pikepdf_enable_mmap():
def deprecated(func):
"""Warn that function is deprecated"""
"""Warn that function is deprecated."""
@wraps(func)
def new_func(*args, **kwargs):

View File

@ -35,7 +35,7 @@ from collections import namedtuple
from itertools import chain
from math import atan, cos, sin
from pathlib import Path
from typing import Optional, Tuple, Union
from typing import Any, NamedTuple, Optional, Tuple, Union
from xml.etree import ElementTree
from reportlab.lib.colors import black, cyan, magenta, red
@ -44,7 +44,14 @@ from reportlab.pdfgen.canvas import Canvas
Element = ElementTree.Element
Rect = namedtuple('Rect', ['x1', 'y1', 'x2', 'y2'])
class Rect(NamedTuple):
"""A rectangle for managing PDF coordinates."""
x1: Any
y1: Any
x2: Any
y2: Any
class HocrTransformError(Exception):

View File

@ -112,7 +112,7 @@ def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):
def file_claims_pdfa(filename: Path):
"""Determines if the file claims to be PDF/A compliant
"""Determines if the file claims to be PDF/A compliant.
This only checks if the XMP metadata contains a PDF/A marker. It does not
do full PDF/A validation.