docs: improve API docs

2025-12-30 00:31:59 +00:00 · 2021-01-26 01:40:40 -08:00 · 2021-01-26 01:40:40 -08:00 · 1084724937
commit 1084724937
parent ecb0109d79
6 changed files with 85 additions and 14 deletions
--- a/docs/apiref.rst
+++ b/docs/apiref.rst
@ -5,6 +5,15 @@ API Reference
 This page summarizes the rest of the public API. Generally speaking this
 should mainly be of interest to plugin developers.

+ocrmypdf
+========
+
+.. autoclass:: ocrmypdf.PageContext
+    :members:
+
+.. autoclass:: ocrmypdf.PdfContext
+    :members:
+
 ocrmypdf.exceptions
 ===================

@ -17,6 +26,9 @@ ocrmypdf.helpers

 .. automodule:: ocrmypdf.helpers
    :members:
+    :noindex: deprecated
+
+    .. autodecorator:: deprecated

 ocrmypdf.hocrtransform
 ======================
--- a/docs/plugins.rst
+++ b/docs/plugins.rst
@ -65,6 +65,27 @@ similar to ``pytest`` packages such as ``pytest-cov`` (the package) and
    ``ocrmypdf-`` (for the package name on PyPI) and ``ocrmypdf_`` (for the
    module), just like pytest plugins.

+Setuptools plugins
+==================
+
+You can also create a plugin that OCRmyPDF will always automatically load if both are
+installed in the same virtual environment, using a setuptools entrypoint.
+
+Your package's ``setup.py`` would need to contain the following, for a plugin
+named ``ocrmypdf-exampleplugin``:
+
+.. code-block:: python
+
+    # sample ./setup.py file
+    from setuptools import setup
+
+    setup(
+        name="ocrmypdf-exampleplugin",
+        packages=["exampleplugin"],
+        # the following makes the plugin available to OCRmyPDF
+        entry_points={"ocrmypdf": ["exampleplugin = exampleplugin.pluginmodule"]},
+    )
+
 Plugin requirements
 ===================

--- a/src/ocrmypdf/_jobcontext.py
+++ b/src/ocrmypdf/_jobcontext.py
@ -14,11 +14,19 @@ from io import IOBase
 from pathlib import Path
 from typing import Iterator

+from pluggy import PluginManager
+
 from ocrmypdf.pdfinfo import PdfInfo
+from ocrmypdf.pdfinfo.info import PageInfo


 class PdfContext:
-    """Holds our context for a particular run of the pipeline"""
+    """Holds the context for a particular run of the pipeline."""
+
+    options: Namespace  #: The specified options for processing this PDF.
+    origin: Path  #: The filename of the original input file.
+    pdfinfo: PdfInfo  #: Detailed data for this PDF.
+    plugin_manager: PluginManager  #: PluginManager for processing the current PDF.

    def __init__(
        self,
@ -35,21 +43,33 @@ class PdfContext:
        self.plugin_manager = plugin_manager

    def get_path(self, name: str) -> Path:
+        """Generate a ``Path`` for an intermediate file involved in processing.
+
+        The path will be in a temporary folder that is common for all processing
+        of this particular PDF.
+        """
        return self.work_folder / name

    def get_page_contexts(self) -> Iterator['PageContext']:
+        """Get all ``PageContext`` for this PDF."""
        npages = len(self.pdfinfo)
        for n in range(npages):
            yield PageContext(self, n)


 class PageContext:
-    """Holds our context for a page
+    """Holds our context for a page.

    Must be picklable, so stores only intrinsic/simple data elements or those
-    capable of their serializing themselves via __getstate__.
+    capable of serializing themselves via ``__getstate__``.
    """

+    options: Namespace  #: The specified options for processing this PDF.
+    origin: Path  #: The filename of the original input file.
+    pageno: int  #: This page number (zero-based).
+    pageinfo: PageInfo  #: Information on this page.
+    plugin_manager: PluginManager  #: PluginManager for processing the current PDF.
+
    def __init__(self, pdf_context: PdfContext, pageno):
        self.work_folder = pdf_context.work_folder
        self.origin = pdf_context.origin
@ -59,6 +79,11 @@ class PageContext:
        self.plugin_manager = pdf_context.plugin_manager

    def get_path(self, name: str) -> Path:
+        """Generate a ``Path`` for a file that is part of processing this page.
+
+        The path will be based in a common temporary folder and have a prefix based
+        on the page number.
+        """
        return self.work_folder / ("%06d_%s" % (self.pageno + 1, name))

    def __getstate__(self):
--- a/src/ocrmypdf/helpers.py
+++ b/src/ocrmypdf/helpers.py
@ -63,8 +63,13 @@ class NeverRaise(Exception):


 def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike):
-    """
-    Helper function: relinks soft symbolic link if necessary
+    """Create a symbolic link at ``soft_link_name``, which references ``input_file``.
+
+    Think of this as copying ``input_file`` to ``soft_link_name`` with less overhead.
+
+    Use symlinks safely. Self-linking loops are prevented. On Windows, file copy is
+    used since symlinks may require administrator privileges. An existing link at the
+    destination is removed.
    """
    input_file = os.fspath(input_file)
    soft_link_name = os.fspath(soft_link_name)
@ -72,8 +77,8 @@ def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike):
    # Guard against soft linking to oneself
    if input_file == soft_link_name:
        log.warning(
-            "No symbolic link made. You are using "
-            "the original data directory as the working directory."
+            "No symbolic link created. You are using  the original data directory "
+            "as the working directory."
        )
        return

@ -114,7 +119,7 @@ def is_iterable_notstr(thing: Any) -> bool:


 def monotonic(L: Sequence) -> bool:
-    """Does list increase monotonically?"""
+    """Does this sequence increase monotonically?"""
    return all(b > a for a, b in zip(L, L[1:]))


@ -173,7 +178,8 @@ def is_file_writable(test_file: os.PathLike) -> bool:
 def check_pdf(input_file: Path) -> bool:
    """Check if a PDF complies with the PDF specification.

-    Checks for proper formatting and proper linearization.
+    Checks for proper formatting and proper linearization. Uses pikepdf (which, in
+    turn, uses QPDF) to perform the checks.
    """
    pdf = None
    try:
@ -217,7 +223,7 @@ def check_pdf(input_file: Path) -> bool:


 def clamp(n, smallest, largest):  # mypy doesn't understand types for this
-    """Clamps the value of n to between smallest and largest."""
+    """Clamps the value of ``n`` to between ``smallest`` and ``largest``."""
    return max(smallest, min(n, largest))


@ -235,7 +241,7 @@ def pikepdf_enable_mmap():


 def deprecated(func):
-    """Warn that function is deprecated"""
+    """Warn that function is deprecated."""

    @wraps(func)
    def new_func(*args, **kwargs):
--- a/src/ocrmypdf/hocrtransform.py
+++ b/src/ocrmypdf/hocrtransform.py
@ -35,7 +35,7 @@ from collections import namedtuple
 from itertools import chain
 from math import atan, cos, sin
 from pathlib import Path
-from typing import Optional, Tuple, Union
+from typing import Any, NamedTuple, Optional, Tuple, Union
 from xml.etree import ElementTree

 from reportlab.lib.colors import black, cyan, magenta, red
@ -44,7 +44,14 @@ from reportlab.pdfgen.canvas import Canvas

 Element = ElementTree.Element

-Rect = namedtuple('Rect', ['x1', 'y1', 'x2', 'y2'])
+
+class Rect(NamedTuple):
+    """A rectangle for managing PDF coordinates."""
+
+    x1: Any
+    y1: Any
+    x2: Any
+    y2: Any


 class HocrTransformError(Exception):
--- a/src/ocrmypdf/pdfa.py
+++ b/src/ocrmypdf/pdfa.py
@ -112,7 +112,7 @@ def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):


 def file_claims_pdfa(filename: Path):
-    """Determines if the file claims to be PDF/A compliant
+    """Determines if the file claims to be PDF/A compliant.

    This only checks if the XMP metadata contains a PDF/A marker. It does not
    do full PDF/A validation.