mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-12-30 00:31:59 +00:00
docs: improve API docs
This commit is contained in:
parent
ecb0109d79
commit
1084724937
@ -5,6 +5,15 @@ API Reference
|
||||
This page summarizes the rest of the public API. Generally speaking this
|
||||
should mainly be of interest to plugin developers.
|
||||
|
||||
ocrmypdf
|
||||
========
|
||||
|
||||
.. autoclass:: ocrmypdf.PageContext
|
||||
:members:
|
||||
|
||||
.. autoclass:: ocrmypdf.PdfContext
|
||||
:members:
|
||||
|
||||
ocrmypdf.exceptions
|
||||
===================
|
||||
|
||||
@ -17,6 +26,9 @@ ocrmypdf.helpers
|
||||
|
||||
.. automodule:: ocrmypdf.helpers
|
||||
:members:
|
||||
:exclude-members: deprecated
|
||||
|
||||
.. autodecorator:: deprecated
|
||||
|
||||
ocrmypdf.hocrtransform
|
||||
======================
|
||||
|
||||
@ -65,6 +65,27 @@ similar to ``pytest`` packages such as ``pytest-cov`` (the package) and
|
||||
``ocrmypdf-`` (for the package name on PyPI) and ``ocrmypdf_`` (for the
|
||||
module), just like pytest plugins.
|
||||
|
||||
Setuptools plugins
|
||||
==================
|
||||
|
||||
You can also create a plugin that OCRmyPDF will always automatically load if both are
|
||||
installed in the same virtual environment, using a setuptools entry point.
|
||||
|
||||
Your package's ``setup.py`` would need to contain the following, for a plugin
|
||||
named ``ocrmypdf-exampleplugin``:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# sample ./setup.py file
|
||||
from setuptools import setup
|
||||
|
||||
setup(
|
||||
name="ocrmypdf-exampleplugin",
|
||||
packages=["exampleplugin"],
|
||||
# the following makes a plugin available to ocrmypdf
|
||||
entry_points={"ocrmypdf": ["exampleplugin = exampleplugin.pluginmodule"]},
|
||||
)
|
||||
|
||||
Plugin requirements
|
||||
===================
|
||||
|
||||
|
||||
@ -14,11 +14,19 @@ from io import IOBase
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
from pluggy import PluginManager
|
||||
|
||||
from ocrmypdf.pdfinfo import PdfInfo
|
||||
from ocrmypdf.pdfinfo.info import PageInfo
|
||||
|
||||
|
||||
class PdfContext:
|
||||
"""Holds our context for a particular run of the pipeline"""
|
||||
"""Holds the context for a particular run of the pipeline."""
|
||||
|
||||
options: Namespace #: The specified options for processing this PDF.
|
||||
origin: Path #: The filename of the original input file.
|
||||
pdfinfo: PdfInfo #: Detailed data for this PDF.
|
||||
plugin_manager: PluginManager #: PluginManager for processing the current PDF.
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -35,21 +43,33 @@ class PdfContext:
|
||||
self.plugin_manager = plugin_manager
|
||||
|
||||
def get_path(self, name: str) -> Path:
|
||||
"""Generate a ``Path`` for an intermediate file involved in processing.
|
||||
|
||||
The path will be in a temporary folder that is common for all processing
|
||||
of this particular PDF.
|
||||
"""
|
||||
return self.work_folder / name
|
||||
|
||||
def get_page_contexts(self) -> Iterator['PageContext']:
|
||||
"""Get all ``PageContext`` for this PDF."""
|
||||
npages = len(self.pdfinfo)
|
||||
for n in range(npages):
|
||||
yield PageContext(self, n)
|
||||
|
||||
|
||||
class PageContext:
|
||||
"""Holds our context for a page
|
||||
"""Holds our context for a page.
|
||||
|
||||
Must be picklable, so stores only intrinsic/simple data elements or those
|
||||
capable of their serializing themselves via __getstate__.
|
||||
capable of serializing themselves via ``__getstate__``.
|
||||
"""
|
||||
|
||||
options: Namespace #: The specified options for processing this PDF.
|
||||
origin: Path #: The filename of the original input file.
|
||||
pageno: int #: This page number (zero-based).
|
||||
pageinfo: PageInfo #: Information on this page.
|
||||
plugin_manager: PluginManager #: PluginManager for processing the current PDF.
|
||||
|
||||
def __init__(self, pdf_context: PdfContext, pageno):
|
||||
self.work_folder = pdf_context.work_folder
|
||||
self.origin = pdf_context.origin
|
||||
@ -59,6 +79,11 @@ class PageContext:
|
||||
self.plugin_manager = pdf_context.plugin_manager
|
||||
|
||||
def get_path(self, name: str) -> Path:
|
||||
"""Generate a ``Path`` for a file that is part of processing this page.
|
||||
|
||||
The path will be based in a common temporary folder and have a prefix based
|
||||
on the page number.
|
||||
"""
|
||||
return self.work_folder / ("%06d_%s" % (self.pageno + 1, name))
|
||||
|
||||
def __getstate__(self):
|
||||
|
||||
@ -63,8 +63,13 @@ class NeverRaise(Exception):
|
||||
|
||||
|
||||
def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike):
|
||||
"""
|
||||
Helper function: relinks soft symbolic link if necessary
|
||||
"""Create a symbolic link at ``soft_link_name``, which references ``input_file``.
|
||||
|
||||
Think of this as copying ``input_file`` to ``soft_link_name`` with less overhead.
|
||||
|
||||
Use symlinks safely. Self-linking loops are prevented. On Windows, file copy is
|
||||
used since symlinks may require administrator privileges. An existing link at the
|
||||
destination is removed.
|
||||
"""
|
||||
input_file = os.fspath(input_file)
|
||||
soft_link_name = os.fspath(soft_link_name)
|
||||
@ -72,8 +77,8 @@ def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike):
|
||||
# Guard against soft linking to oneself
|
||||
if input_file == soft_link_name:
|
||||
log.warning(
|
||||
"No symbolic link made. You are using "
|
||||
"the original data directory as the working directory."
|
||||
"No symbolic link created. You are using the original data directory "
|
||||
"as the working directory."
|
||||
)
|
||||
return
|
||||
|
||||
@ -114,7 +119,7 @@ def is_iterable_notstr(thing: Any) -> bool:
|
||||
|
||||
|
||||
def monotonic(L: Sequence) -> bool:
|
||||
"""Does list increase monotonically?"""
|
||||
"""Does this sequence increase strictly monotonically?"""
|
||||
return all(b > a for a, b in zip(L, L[1:]))
|
||||
|
||||
|
||||
@ -173,7 +178,8 @@ def is_file_writable(test_file: os.PathLike) -> bool:
|
||||
def check_pdf(input_file: Path) -> bool:
|
||||
"""Check if a PDF complies with the PDF specification.
|
||||
|
||||
Checks for proper formatting and proper linearization.
|
||||
Checks for proper formatting and proper linearization. Uses pikepdf (which, in
|
||||
turn, uses QPDF) to perform the checks.
|
||||
"""
|
||||
pdf = None
|
||||
try:
|
||||
@ -217,7 +223,7 @@ def check_pdf(input_file: Path) -> bool:
|
||||
|
||||
|
||||
def clamp(n, smallest, largest): # mypy doesn't understand types for this
|
||||
"""Clamps the value of n to between smallest and largest."""
|
||||
"""Clamps the value of ``n`` to between ``smallest`` and ``largest``."""
|
||||
return max(smallest, min(n, largest))
|
||||
|
||||
|
||||
@ -235,7 +241,7 @@ def pikepdf_enable_mmap():
|
||||
|
||||
|
||||
def deprecated(func):
|
||||
"""Warn that function is deprecated"""
|
||||
"""Warn that function is deprecated."""
|
||||
|
||||
@wraps(func)
|
||||
def new_func(*args, **kwargs):
|
||||
|
||||
@ -35,7 +35,7 @@ from collections import namedtuple
|
||||
from itertools import chain
|
||||
from math import atan, cos, sin
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple, Union
|
||||
from typing import Any, NamedTuple, Optional, Tuple, Union
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from reportlab.lib.colors import black, cyan, magenta, red
|
||||
@ -44,7 +44,14 @@ from reportlab.pdfgen.canvas import Canvas
|
||||
|
||||
Element = ElementTree.Element
|
||||
|
||||
Rect = namedtuple('Rect', ['x1', 'y1', 'x2', 'y2'])
|
||||
|
||||
class Rect(NamedTuple):
|
||||
"""A rectangle for managing PDF coordinates."""
|
||||
|
||||
x1: Any
|
||||
y1: Any
|
||||
x2: Any
|
||||
y2: Any
|
||||
|
||||
|
||||
class HocrTransformError(Exception):
|
||||
|
||||
@ -112,7 +112,7 @@ def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):
|
||||
|
||||
|
||||
def file_claims_pdfa(filename: Path):
|
||||
"""Determines if the file claims to be PDF/A compliant
|
||||
"""Determines if the file claims to be PDF/A compliant.
|
||||
|
||||
This only checks if the XMP metadata contains a PDF/A marker. It does not
|
||||
do full PDF/A validation.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user