Update README with Fedora installation instructions (#1610)

Added instructions for Fedora users to install Tesseract language packs.
docs: Update release notes
2026-01-08 13:11:17 +00:00 · 2025-12-27 01:15:45 -08:00 · 2025-12-23 15:44:44 -08:00 · 2025-12-23 15:41:34 -08:00 · 2025-12-23 15:06:50 -08:00 · 2025-12-23 15:05:49 -08:00
42 changed files with 2620 additions and 1434 deletions
--- a/.docker/Dockerfile
+++ b/.docker/Dockerfile
@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2024 James R. Barlow
 # SPDX-License-Identifier: MPL-2.0

-FROM ubuntu:24.04 AS base
+FROM ubuntu:25.04 AS base

 ENV LANG=C.UTF-8
 ENV TZ=UTC
@ -40,7 +40,7 @@ RUN \
 WORKDIR /app

 # Copy uv from ghcr
-COPY --from=ghcr.io/astral-sh/uv:0.6.14 /uv /uvx /bin/
+COPY --from=ghcr.io/astral-sh/uv:0.9.8 /uv /uvx /bin/

 ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy

--- a/.docker/Dockerfile.alpine
+++ b/.docker/Dockerfile.alpine
@ -1,13 +1,7 @@
 # SPDX-FileCopyrightText: 2023 James R. Barlow
 # SPDX-License-Identifier: MPL-2.0

-# Note: Alpine 3.20 builds tesseract with --enable-opencl, which is not
-# supported by anyone. OCRmyPDF is not compatible with Alpine 3.20.0
-# through 3.20.3. The issue is fixed in 3.21.
-# Details
-#  https://gitlab.alpinelinux.org/alpine/aports/-/issues/16143
-#  https://github.com/ocrmypdf/OCRmyPDF/issues/1395
-FROM alpine:3.21 AS base
+FROM alpine:3.22 AS base

 ENV LANG=C.UTF-8
 ENV TZ=UTC
@ -28,7 +22,7 @@ RUN apk add --no-cache \

 WORKDIR /app

-COPY --from=ghcr.io/astral-sh/uv:0.6.14 /uv /uvx /bin/
+COPY --from=ghcr.io/astral-sh/uv:0.9.8 /uv /uvx /bin/

 ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy

--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -22,37 +22,35 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-22.04, ubuntu-24.04]
-        python: ["3.10", "3.11", "3.12", "3.13"]
+        python: ["3.10", "3.11", "3.12", "3.13", "3.14"]
        include:
          - os: ubuntu-22.04
            tesseract_ppa: "ppa"
            python: "3.10"
-          - os: ubuntu-24.04
-            python: "pypy3.10"

    env:
      OS: ${{ matrix.os }}
      PYTHON: ${{ matrix.python }}

    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags

      - name: Install uv
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@v7
        with:
-          version: "0.5.x"
+          version: "0.9.x"

      - name: "Set up Python"
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python }}

      - name: Install Tesseract from PPA
        if: matrix.tesseract_ppa == 'ppa'
        run: |
-          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5.3
+          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5

      - name: Install common packages
        run: |
@ -74,14 +72,6 @@ jobs:
            unpaper \
            zlib1g

-      - name: Install Ubuntu packages for PyPy
-        if: startsWith(matrix.python, 'pypy')
-        run: |
-          sudo apt-get install -y --no-install-recommends \
-            libxml2-dev \
-            libxslt1-dev \
-            pypy3-dev
-
      - name: Install Python packages
        run: |
          uv sync --extra test --no-dev
@ -111,15 +101,15 @@ jobs:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        os: [macos-latest, macos-13]  # macos-latest is arm64, macos-13 is x86_64
-        python: ["3.10", "3.11", "3.12", "3.13"]
+        os: [macos-latest]
+        python: ["3.10", "3.11", "3.12", "3.13", "3.14"]

    env:
      OS: ${{ matrix.os }}
      PYTHON: ${{ matrix.python }}

    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags

@ -136,12 +126,12 @@ jobs:
            tesseract

      - name: Install uv
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@v7
        with:
-          version: "0.5.x"
+          version: "0.9.x"

      - name: "Set up Python"
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python }}

@ -174,24 +164,24 @@ jobs:
    strategy:
      matrix:
        os: [windows-latest]
-        python: ["3.10", "3.11", "3.12", "3.13"]
+        python: ["3.10", "3.11", "3.12", "3.13", "3.14"]

    env:
      OS: ${{ matrix.os }}
      PYTHON: ${{ matrix.python }}

    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags

      - name: Install uv
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@v7
        with:
-          version: "0.5.x"
+          version: "0.9.x"

      - name: "Set up Python"
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python }}

@ -220,20 +210,20 @@ jobs:
    name: Build sdist and wheels
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags

      - name: Install uv
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@v7
        with:
-          version: "0.5.x"
+          version: "0.9.x"

      - name: Make wheels and sdist
        run: |
          uv build --sdist --wheel

-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v6
        with:
          name: artifact
          path: |
@ -246,10 +236,10 @@ jobs:
    runs-on: ubuntu-latest
    environment: release
    permissions:
-      id-token: write  # mandatory for PyPI publishing
+      id-token: write # mandatory for PyPI publishing
    if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
    steps:
-      - uses: actions/download-artifact@v4
+      - uses: actions/download-artifact@v7
        with:
          name: artifact
          path: dist
@ -267,13 +257,13 @@ jobs:
      contents: write
      id-token: write
    steps:
-      - uses: actions/download-artifact@v4
+      - uses: actions/download-artifact@v7
        with:
          name: artifact
          path: dist

      - name: Sign the dists with Sigstore
-        uses: sigstore/gh-action-sigstore-python@v3.0.0
+        uses: sigstore/gh-action-sigstore-python@v3.2.0
        with:
          inputs: |
            ./dist/*.tar.gz
@ -318,7 +308,7 @@ jobs:
      - name: Set image name
        run: echo "DOCKER_IMAGE_NAME=ocrmypdf" >> $GITHUB_ENV

-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags

@ -366,7 +356,7 @@ jobs:
      - name: Set image name
        run: echo "DOCKER_IMAGE_NAME=ocrmypdf-alpine" >> $GITHUB_ENV

-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags

--- a/.gitignore
+++ b/.gitignore
@ -46,4 +46,5 @@ docs/_templates/
 docs/Makefile
 src/ocrmypdf/_version.py

-.idea/
+.idea/
+.aider*
--- a/README.md
+++ b/README.md
@ -74,6 +74,7 @@ Linux, Windows, macOS and FreeBSD are supported. Docker images are also availabl
 | macOS (nix)                   | ``nix-env -i ocrmypdf``       |
 | LinuxBrew                     | ``brew install ocrmypdf``     |
 | FreeBSD                       | ``pkg install py-ocrmypdf``   |
+| OpenBSD                       | ``pkg_add ocrmypdf``          |
 | Ubuntu Snap                   | ``snap install ocrmypdf``     |

 For everyone else, [see our documentation](https://ocrmypdf.readthedocs.io/en/latest/installation.html) for installation steps.
@ -83,17 +84,27 @@ For everyone else, [see our documentation](https://ocrmypdf.readthedocs.io/en/la
 OCRmyPDF uses Tesseract for OCR, and relies on its language packs. For Linux users, you can often find packages that provide language packs:

 ```bash
-# Display a list of all Tesseract language packs
-apt-cache search tesseract-ocr

 # Debian/Ubuntu users
+apt-cache search tesseract-ocr # Display a list of all Tesseract language packs
 apt-get install tesseract-ocr-chi-sim  # Example: Install Chinese Simplified language pack

+
 # Arch Linux users
 pacman -S tesseract-data-eng tesseract-data-deu # Example: Install the English and German language packs

+# OpenBSD users
+pkg_info -aQ tesseract  # Display a list of all Tesseract language packs
+pkg_add tesseract-cym  # Example: Install the Welsh language pack
+
 # brew macOS users
 brew install tesseract-lang
+
+# Fedora users
+dnf search tesseract-langpack # Display a list of all Tesseract language packs
+dnf install tesseract-langpack-ita # Example: Install the Italian language pack
+
+
 ```

 You can then pass the `-l LANG` argument to OCRmyPDF to give a hint as to what languages it should search for. Multiple languages can be requested.
--- a/docs/batch.md
+++ b/docs/batch.md
@ -196,6 +196,10 @@ it to a OCRed PDF in `/output/`, and move the processed original to
  - Define environment variable `OCR_DESKEW` to apply deskew to crooked input PDFs
 * - `--env PYTHONBUFFERED=1`
  - This will force `STDOUT` to be unbuffered and allow you to see messages in docker logs
+* - `--env OCR_LOGLEVEL='DEBUG'`
+  - Level of log messages
+* - `--env OCR_JSON_SETTINGS='{"language":"deu+eng", "rotate_pages": true}'`
+  - A JSON string specifying any other arguments for `ocrmypdf.ocr`
 :::

 This service relies on polling to check for changes to the filesystem.
--- a/docs/cookbook.md
+++ b/docs/cookbook.md
@ -221,7 +221,7 @@ works if all you want to is to apply image processing or PDF/A
 conversion.

 ```bash
-ocrmypdf --tesseract-timeout=0 --remove-background input.pdf output.pdf
+ocrmypdf --tesseract-timeout 0 --remove-background input.pdf output.pdf
 ```

 :::{versionchanged} v14.1.0
@ -250,7 +250,7 @@ This command also removes OCR generated by third party tools.
 You can also optimize all images without performing any OCR:

 ```bash
-ocrmypdf --tesseract-timeout=0 --optimize 3 --skip-text input.pdf output.pdf
+ocrmypdf --tesseract-timeout 0 --optimize 3 --skip-text input.pdf output.pdf
 ```

 ### Process only certain pages
--- a/docs/docker.md
+++ b/docs/docker.md
@ -104,23 +104,29 @@ docker_ocrmypdf /data/input.pdf /data/output.pdf

 ## Podman

-Especially if you use [Podman](https://podman.io/) (or have SELinux
-enabled on your system), you may need to add `--userns keep-id` there,
-otherwise you may get access errors, because the user is otherwise not
+If you use [Podman](https://podman.io/) (or Docker in
+rootless mode), you may need to add `--userns keep-id` to the command;
+otherwise you may get access errors, because the user ID is not
 mapped to the same UID as on the host:

 :::{code} bash
-alias podman_ocrmypdf='podman run --rm  -i --user "$(id -u):$(id -g)" --userns keep-id --workdir /data -v "$PWD:/data" ocrmypdf'
+alias podman_ocrmypdf='podman run --rm -i --user "$(id -u):$(id -g)" --userns keep-id --workdir /data -v "$PWD:/data" jbarlow83/ocrmypdf-alpine'
 podman_ocrmypdf /data/input.pdf /data/output.pdf
 :::

-If you use SELinux you may additionally need to add the `:Z` [suffix to
+If you have SELinux enabled, you may additionally need to add the `:Z` [suffix to
 the
 volume](https://docs.podman.io/en/stable/markdown/podman-run.1.html#volume-v-source-volume-host-dir-container-dir-options)
 or disable SELinux for the container using
 `--security-opt label=disable`, which is suggested for system files as
 they should not be re-labelled. Please refer to the „Note" section at
-the end of the linked podman documentation for details.
+the end of the linked podman documentation for details. This results in
+the following full command:
+
+:::{code} bash
+alias podman_ocrmypdf='podman run --rm -i --user "$(id -u):$(id -g)" --userns keep-id --workdir /data -v "$PWD:/data" --security-opt label=disable jbarlow83/ocrmypdf-alpine'
+podman_ocrmypdf /data/input.pdf /data/output.pdf
+:::

 {#docker-lang-packs}
 ## Adding languages to the Docker image
--- a/docs/jbig2.md
+++ b/docs/jbig2.md
@ -35,11 +35,12 @@ cd jbig2enc
 Dependencies include libtoolize and libleptonica, which on Ubuntu
 systems are packaged as libtool and libleptonica-dev. On Fedora (35)
 they are packaged as libtool and leptonica-devel. For this to work,
-please make sure to install `autotools`, `automake`, `libtool` and
-`leptonica` first if not already installed.
+please make sure to install `autotools`, `automake`, `libtool`, `pkg-config`
+and `leptonica` first if not already installed. Other dependencies might
+be required depending on your system.

 :::{code} bash
-[sudo] apt install autotools-dev automake libtool libleptonica-dev
+[sudo] apt install autotools-dev automake libtool libleptonica-dev pkg-config
 :::

 {#jbig2-lossy}
--- a/docs/plugins.md
+++ b/docs/plugins.md
@ -118,7 +118,7 @@ A plugin may provide the following hooks. Hooks must be decorated with
 `ocrmypdf.hookimpl`, for example:

 ```python
-from ocrmpydf import hookimpl
+from ocrmypdf import hookimpl

@hookimpl
 def add_options(parser):
--- a/docs/release_notes.md
+++ b/docs/release_notes.md
@ -25,6 +25,62 @@ about a forthcoming release that has not been tagged yet. A release is only
 official when it's tagged and posted to PyPI.
 :::

+## v16.13.0
+
+- Added detection and repair for Ghostscript 10.6 JPEG corruption. When GS 10.6
+  truncates JPEG data by 1-15 bytes, OCRmyPDF now restores the original image
+  bytes from the input PDF. A warning is issued when GS 10.6+ is detected.
+  {issue}`1603`
+- We continue to force re-optimization of JPEGs, since this catches some issues with corruption for situations where Ghostscript modifies an image. It is likely there are still cases where we cannot mitigate all corruption issues. {issue}`1585`
+- Fixed handling of PDF page boxes (ArtBox, BleedBox) which were not being
+  processed correctly in some cases.
+- Documentation: clarified podman usage instructions.
+
+## v16.12.0
+
+- Disable Ghostscript's subset fonts feature, which was found to corrupt text in certain
+  PDFs. Thanks @mnaegler for identifying this issue. {issue}`1592`
+- Users of Ghostscript 10.6.0+ reported that Ghostscript seems to generate corrupted
+  JPEGs. We force re-optimization of these JPEGs to mitigate the corruption until
+  Ghostscript fixes the issue. {issue}`1585`
+- OCRmyPDF now avoids applying flate compression to large JPEG images, unless maximum
+  optimization is requested, since flate+DCT compression reduces performance in PDF
+  viewers with large images.
+- Updated Dockerfiles to use more recent base operating systems.
+- Updated build and test matrix to include Python 3.14.
+- Minor documentation improvements.
+- pikepdf >= 10.0.0 is now required.
+
+## v16.11.1
+
+- Fixed issue with Tesseract changing an error message related to skew. {issue}`1576`
+- Dropped macOS 13 from build-test matrix since it is no longer supported by Apple.
+
+## v16.11.0
+
+- Deprecated "semfree" plugin in favor of falling back to threads if the platform
+  does not support semaphores. Fixes an issue with Python 3.14.
+- Fixed references to PDF/A compliance levels to be consistent with ISO nomenclature.
+  Thanks @5HT2. {issue}`1557`
+- Fixed an issue around using plugin_manager as an argument. {issue}`1555`
+- Added OpenBSD install steps to README. {issue}`1554`
+- Removed PyPy from test matrix due to declining support in third party libraries.
+- Documentation improvements.
+
+## v16.10.4
+
+- Corrected build errors in Python 3.13.3 and 3.13.4.
+
+## v16.10.3 (not released)
+
+- Blocked optimization of images with pre-blended soft masks. {issue}`1536`
+- Fixed warning from hypothesis on running tests.
+- Release incomplete due to new test failures in Python 3.13.3 and 3.13.4.
+
+## v16.10.2
+
+- Blacklist pikepdf 9.8.0 due to an incompatible change.
+
 ## v16.10.1

 - No changes affecting OCRmyPDF functionality for command line end users.
--- a/misc/flatpak/io.ocrmypdf.ocrmypdf.metainfo.xml
+++ b/misc/flatpak/io.ocrmypdf.ocrmypdf.metainfo.xml
@ -0,0 +1,63 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<component type="console-application">
+  <id>io.ocrmypdf.ocrmypdf</id>
+
+  <name>OCRmyPDF</name>
+  <summary>Adds an OCR text layer to scanned PDF files, allowing them to be searched</summary>
+
+  <developer id="io.ocrmypdf">
+      <name>OCRmyPDF Developers</name>
+  </developer>
+
+  <url type="homepage">https://github.com/ocrmypdf/ocrmypdf</url>
+  <url type="bugtracker">https://github.com/ocrmypdf/OCRmyPDF/issues</url>
+
+  <content_rating type="oars-1.1" />
+
+  <metadata_license>CC0-1.0</metadata_license>
+  <project_license>MPL-2.0</project_license>
+
+  <description>
+    <ul>
+        <li>Generates a searchable PDF/A file from a regular PDF</li>
+        <li>Places OCR text accurately below the image to ease copy / paste</li>
+        <li>Keeps the exact resolution of the original embedded images</li>
+        <li>When possible, inserts OCR information as a lossless operation without disrupting any other content</li>
+        <li>Optimizes PDF images, often producing files smaller than the input file</li>
+        <li>If requested, deskews and/or cleans the image before performing OCR</li>
+        <li>Validates input and output files</li>
+        <li>Distributes work across all available CPU cores</li>
+        <li>Uses Tesseract OCR engine to recognize more than 100 languages</li>
+        <li>Keeps your private data private</li>
+        <li>Scales properly to handle files with thousands of pages</li>
+        <li>Battle-tested on millions of PDFs</li>
+    </ul>
+  </description>
+
+  <provides>
+    <binary>ocrmypdf</binary>
+  </provides>
+
+  <icon type="stock">io.ocrmypdf.ocrmypdf</icon>
+
+  <screenshots>
+    <screenshot type="default">
+      <image>https://raw.githubusercontent.com/ocrmypdf/OCRmyPDF/f7ad5f16bd0340b0b1803dada0c02f9f40542bd8/misc/flatpak/sample_screenshot.png</image>
+      <caption>Sample usage of OCRmyPDF</caption>
+    </screenshot>
+  </screenshots>
+
+  <categories>
+    <category>Office</category>
+    <category>Utility</category>
+  </categories>
+
+  <keywords>
+    <keyword>ocr</keyword>
+    <keyword>pdf</keyword>
+    <keyword>tool</keyword>
+  </keywords>
+
+  <releases>
+    <release version="16.8.0" date="2025-01-05"/>
+  </releases>
+</component>
--- a/misc/flatpak/sample_screenshot.png
+++ b/misc/flatpak/sample_screenshot.png
--- a/misc/screencast/demo.cast
+++ b/misc/screencast/demo.cast
@ -60,6 +60,6 @@
 [8.280789, "o", "\rRecompressing JPEGs: 0image [00:00, ?image/s]\rRecompressing JPEGs: 0image [00:00, ?image/s]\r\n\rDeflating JPEGs:   0%|                                                                                    | 0/4 [00:00<?, ?image/s]\rDeflating JPEGs: 100%|███████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 238.28image/s]\r\n"]
 [8.28149, "o", "\rJBIG2: 0item [00:00, ?item/s]\rJBIG2: 0item [00:00, ?item/s]\r\n"]
 [8.289998, "o", "Image optimization ratio: 1.01 savings: 1.3%\r\nTotal file size ratio: 1.02 savings: 1.6%\r\n"]
-[8.291209, "o", "Output file is a PDF/A-2B (as expected)\r\n"]
+[8.291209, "o", "Output file is a PDF/A-2b (as expected)\r\n"]
 [8.361316, "o", "\u001b[2m⏎\u001b(B\u001b[m                                                                                                                                  \r⏎ \r\u001b[K\u001b[?2004h\u001b]0;fish /home/jb/src/ocrmypdf/tests/resources\u0007\u001b[30m\u001b(B\u001b[m> \u001b[K\r\u001b[C\u001b[C"]
 [8.862206, "o", "\r\n\u001b[30m\u001b(B\u001b[m\u001b[30m\u001b(B\u001b[m\u001b[?2004l"]
--- a/misc/screencast/demo.svg
+++ b/misc/screencast/demo.svg
--- a/misc/watcher.py
+++ b/misc/watcher.py
@ -5,8 +5,7 @@

 """Watch a directory for new PDFs and OCR them."""

-# Do not enable annotations!
-# https://github.com/tiangolo/typer/discussions/598
+from __future__ import annotations

 import json
 import logging
@ -278,11 +277,11 @@ def main(
        f"Output Directory Year & Month: {output_dir_year_month}\n"
        f"Archive Directory: {archive_dir}"
    )
-    log.debug(
+    log.info(
        f"INPUT_DIRECTORY: {input_dir}\n"
        f"OUTPUT_DIRECTORY: {output_dir}\n"
-        f"OUTPUT_DIRECTORY_YEAR_MONTH: {output_dir_year_month}\n"
        f"ARCHIVE_DIRECTORY: {archive_dir}\n"
+        f"OUTPUT_DIRECTORY_YEAR_MONTH: {output_dir_year_month}\n"
        f"ON_SUCCESS_DELETE: {on_success_delete}\n"
        f"ON_SUCCESS_ARCHIVE: {on_success_archive}\n"
        f"DESKEW: {deskew}\n"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -17,7 +17,7 @@ dependencies = [
  "packaging>=20",
  "pdfminer.six>=20220319",
  "pi-heif",                # Heif image format - maintainers: if this is removed, it will NOT break
-  "pikepdf>=8.10.1",
+  "pikepdf>=10",
  "Pillow>=10.0.1",
  "pluggy>=1",
  "rich>=13",
@ -45,15 +45,10 @@ keywords = ["PDF", "OCR", "optical character recognition", "PDF/A", "scanning"]
 Documentation = "https://ocrmypdf.readthedocs.io/"
 Source = "https://github.com/ocrmypdf/OCRmyPDF"
 Tracker = "https://github.com/ocrmypdf/OCRmyPDF/issues"
-Changelog = "https://github.com/ocrmypdf/OCRmyPDF/docs/release_notes.rst"
+Changelog = "https://github.com/ocrmypdf/OCRmyPDF/docs/release_notes.md"

 [project.optional-dependencies]
-docs = [
-    "myst-parser>=4.0.1",
-    "sphinx",
- "sphinx-issues",
- "sphinx-rtd-theme",
-]
+docs = ["myst-parser>=4.0.1", "sphinx", "sphinx-issues", "sphinx-rtd-theme"]
 extended_test = ["PyMuPDF>=1.19.1"]
 test = [
  "coverage[toml]>=6.2",
@ -107,7 +102,6 @@ exclude_lines = [

 [tool.pytest.ini_options]
 minversion = "6.0"
-norecursedirs = ["lib", ".pc", ".git", "venv", "output", "cache", "resources"]
 testpaths = ["tests"]
 addopts = "-n auto"
 markers = ["slow"]
--- a/src/ocrmypdf/main.py
+++ b/src/ocrmypdf/main.py
@ -78,6 +78,7 @@ def run(args=None):

 if __name__ == '__main__':
    multiprocessing.freeze_support()
-    if os.name == 'posix':
-        multiprocessing.set_start_method('forkserver')
+    if sys.platform not in ('win32', 'darwin'):
+        with suppress(RuntimeError):
+            multiprocessing.set_start_method('forkserver')
    sys.exit(run())
--- a/src/ocrmypdf/_exec/ghostscript.py
+++ b/src/ocrmypdf/_exec/ghostscript.py
@ -286,6 +286,7 @@ def generate_pdfa(
        + compression_args
        + [
            "-dJPEGQ=95",
+            "-dSubsetFonts=false",  # Prevents GS from messing up some encodings
            f"-dPDFA={pdfa_part}",
            "-dPDFACompatibilityPolicy=1",
            "-o",
--- a/src/ocrmypdf/_exec/tesseract.py
+++ b/src/ocrmypdf/_exec/tesseract.py
@ -186,6 +186,18 @@ def get_orientation(
    return orient_conf


+def _is_empty_page_error(exc):
+    if b'Empty page!!' in exc.output:  # Tesseract 4.x
+        return True
+
+    return exc.returncode == 1 and (
+        # Tesseract 5.0-5.4 or so
+        exc.output == b''
+        # Tesseract 5.5+
+        or exc.output.startswith(b"Error in boxClipToRectangle: box outside rectangle")
+    )
+
+
 def get_deskew(
    input_file: Path, languages: list[str], engine_mode: int | None, timeout: float
 ) -> float:
@ -204,11 +216,9 @@ def get_deskew(
    except CalledProcessError as e:
        tesseract_log_output(e.stdout)
        tesseract_log_output(e.stderr)
-        if b'Empty page!!' in e.output or (
-            e.output == b'' and e.returncode == 1
-        ):  # Not enough info for a skew angle - Tess 4 and 5 return different errors
+        if _is_empty_page_error(e):
+            # Not enough info for a skew angle
            return 0.0
-
        raise SubprocessOutputError() from e

    parsed = _parse_tesseract_output(p.stdout)
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@ -39,6 +39,7 @@ from ocrmypdf.hocrtransform import DebugRenderOptions, HocrTransform
 from ocrmypdf.hocrtransform._font import Courier
 from ocrmypdf.pdfa import generate_pdfa_ps
 from ocrmypdf.pdfinfo import Colorspace, Encoding, PageInfo, PdfInfo
+from ocrmypdf.pdfinfo.info import FloatRect
 from ocrmypdf.pluginspec import OrientationConfidence

 try:
@ -549,7 +550,9 @@ def rasterize(

    device = colorspaces[device_idx]

-    log.debug(f"Rasterize with {device}, rotation {correction}")
+    log.debug(
+        f"Rasterize with {device}, rotation {correction}, mediabox {pageinfo.mediabox}"
+    )

    canvas_dpi, page_dpi = calculate_raster_dpi(page_context)

@ -830,6 +833,23 @@ def _offset_rect(rect: tuple[float, float, float, float], offset: tuple[float, f
    )


+def _adjust_pagebox(
+    page: pikepdf.Page,
+    media_box: FloatRect,
+    name: pikepdf.Name,
+    target_box: FloatRect,
+    offset: tuple[float, float],
+    swap_axis: bool,
+):
+    if media_box == target_box:
+        return
+    box = _offset_rect(target_box, offset)
+    if swap_axis:
+        box = box[1], box[0], box[3], box[2]
+    page[name] = box
+    log.debug(f"{str(name)} = {target_box}")
+
+
 def fix_pagepdf_boxes(
    infile: Path | BinaryIO,
    out_file: Path,
@ -840,7 +860,7 @@ def fix_pagepdf_boxes(

    The single page PDF is created with a normal MediaBox with its lower left corner
    at (0, 0). infile is the single page PDF. page_context.mediabox has the original
-    file's mediabox, which may have a different origin. We needto adjust the other
+    file's mediabox, which may have a different origin. We need to adjust the other
    boxes in the single page PDF to match the effect they had on the original page.

    When correcting page rotation, we create a single page PDF that is correctly
@ -854,18 +874,25 @@ def fix_pagepdf_boxes(
    """
    with pikepdf.open(infile) as pdf:
        for page in pdf.pages:
-            # page.BleedBox = page_context.pageinfo.bleedbox
-            # page.ArtBox = page_context.pageinfo.artbox
+            log.debug(
+                f"initial mediabox={page.MediaBox} and pageinfo "
+                f"mediabox={page_context.pageinfo.mediabox}"
+            )
            mediabox = page_context.pageinfo.mediabox
-            offset = mediabox[0], mediabox[1]
-            cropbox = _offset_rect(page_context.pageinfo.cropbox, offset)
-            trimbox = _offset_rect(page_context.pageinfo.trimbox, offset)
-
+            offset = -mediabox[0], -mediabox[1]
            if swap_axis:
-                cropbox = cropbox[1], cropbox[0], cropbox[3], cropbox[2]
-                trimbox = trimbox[1], trimbox[0], trimbox[3], trimbox[2]
-            page.CropBox = cropbox
-            page.TrimBox = trimbox
+                mediabox = mediabox[1], mediabox[0], mediabox[3], mediabox[2]
+            boxes = ['CropBox', 'TrimBox', 'ArtBox', 'BleedBox']
+            for box_name in boxes:
+                _adjust_pagebox(
+                    page,
+                    mediabox,
+                    pikepdf.Name(f"/{box_name}"),
+                    getattr(page_context.pageinfo, box_name.lower()),
+                    offset,
+                    swap_axis,
+                )
+
        pdf.save(out_file)
    return out_file

--- a/src/ocrmypdf/_plugin_manager.py
+++ b/src/ocrmypdf/_plugin_manager.py
@ -73,19 +73,10 @@ class OcrmypdfPluginManager(pluggy.PluginManager):
                module = importlib.import_module(name)
                self.register(module)

-        # 2. Install semfree if needed
-        try:
-            # pylint: disable=import-outside-toplevel
-            from multiprocessing.synchronize import SemLock
-
-            del SemLock
-        except ImportError:
-            self.register(importlib.import_module('ocrmypdf.extra_plugins.semfree'))
-
-        # 3. Register setuptools plugins
+        # 2. Register setuptools plugins
        self.load_setuptools_entrypoints('ocrmypdf')

-        # 4. Register plugins specified on command line
+        # 3. Register plugins specified on command line
        for name in self.__plugins:
            if isinstance(name, Path) or name.endswith('.py'):
                # Import by filename
--- a/src/ocrmypdf/api.py
+++ b/src/ocrmypdf/api.py
@ -1,7 +1,41 @@
 # SPDX-FileCopyrightText: 2022 James R. Barlow
 # SPDX-License-Identifier: MPL-2.0

-"""Functions for using ocrmypdf as an API."""
+"""Python API for OCRmyPDF.
+
+This module provides the main Python API for OCRmyPDF, allowing you to perform
+OCR operations programmatically without using the command line interface.
+
+Main Functions:
+    ocr(): The primary function for OCR processing. Takes an input PDF or image
+        file and produces an OCR'd PDF with searchable text.
+
+    configure_logging(): Set up logging to match the command line interface
+        behavior, with support for progress bars and colored output.
+
+Experimental Functions:
+    _pdf_to_hocr(): Extract text from PDF pages and save as hOCR files for
+        manual editing before final PDF generation.
+
+    _hocr_to_ocr_pdf(): Convert hOCR files back to a searchable PDF after
+        manual text corrections.
+
+The API maintains thread safety through internal locking since OCRmyPDF uses
+global state for plugins. Only one OCR operation can run per Python process
+at a time. For parallel processing, use multiple Python processes.
+
+Example:
+    import ocrmypdf
+
+    # Configure logging (optional)
+    ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)
+
+    # Perform OCR
+    ocrmypdf.ocr('input.pdf', 'output.pdf', language='eng')
+
+For detailed parameter documentation, see the ocr() function docstring and
+the equivalent command line parameters in the OCRmyPDF documentation.
+"""

 from __future__ import annotations

@ -357,7 +391,7 @@ def ocr(  # noqa: D417
    create_options_kwargs = {
        k: v
        for k, v in locals().items()
-        if k not in {'input_file', 'output_file', 'kwargs'}
+        if k not in {'input_file', 'output_file', 'kwargs', 'plugin_manager'}
    }
    create_options_kwargs.update(kwargs)

--- a/src/ocrmypdf/builtin_plugins/concurrency.py
+++ b/src/ocrmypdf/builtin_plugins/concurrency.py
@ -96,6 +96,31 @@ def thread_init(q: Queue, user_init: UserInit, loglevel) -> None:
    return


+def setup_executor(use_threads: bool) -> tuple[Queue, Executor, WorkerInit]:
+    if not use_threads:
+        # Some execution environments like AWS Lambda and Termux do not support
+        # semaphores. Check if semaphore support is available, and if not, fall back
+        # to using threads.
+        try:
+            # pylint: disable=import-outside-toplevel
+            from multiprocessing.synchronize import SemLock
+
+            del SemLock
+        except ImportError:
+            use_threads = True
+
+    if use_threads:
+        loq_queue = queue.Queue(-1)
+        executor_class = ThreadPoolExecutor
+        initializer = thread_init
+    else:
+        loq_queue = multiprocessing.Queue(-1)
+        executor_class = ProcessPoolExecutor
+        initializer = process_init
+
+    return loq_queue, executor_class, initializer
+
+
 class StandardExecutor(Executor):
    """Standard OCRmyPDF concurrent task executor."""

@ -110,14 +135,7 @@ class StandardExecutor(Executor):
        task_arguments: Iterable,
        task_finished: Callable,
    ):
-        if use_threads:
-            log_queue: Queue = queue.Queue(-1)
-            executor_class: FuturesExecutorClass = ThreadPoolExecutor
-            initializer: WorkerInit = thread_init
-        else:
-            log_queue = multiprocessing.Queue(-1)
-            executor_class = ProcessPoolExecutor
-            initializer = process_init
+        log_queue, executor_class, initializer = setup_executor(use_threads)

        # Regardless of whether we use_threads for worker processes, the log_listener
        # must be a thread. Make sure we create the listener after the worker pool,
--- a/src/ocrmypdf/builtin_plugins/ghostscript.py
+++ b/src/ocrmypdf/builtin_plugins/ghostscript.py
@ -5,8 +5,10 @@
 from __future__ import annotations

 import logging
+from pathlib import Path

 from packaging.version import Version
+from pikepdf import Name, Pdf, Stream

 from ocrmypdf import hookimpl
 from ocrmypdf._exec import ghostscript
@ -74,6 +76,13 @@ def check_options(options):
            "use --force-ocr to discard existing text."
        )

+    if gs_version >= Version('10.6.0') and options.output_type.startswith('pdfa'):
+        log.warning(
+            "Ghostscript 10.6.x contains JPEG encoding errors that may corrupt "
+            "images. OCRmyPDF will attempt to mitigate, but this version is "
+            "strongly not recommended. Please upgrade to a newer version. "
+            "As of 2025-12, 10.6.0 is the latest version of Ghostscript."
+        )
    if options.output_type == 'pdfa':
        options.output_type = 'pdfa-2'
    if options.color_conversion_strategy not in ghostscript.COLOR_CONVERSION_STRATEGIES:
@ -116,6 +125,144 @@ def rasterize_pdf_page(
    return output_file


+def _collect_dctdecode_images(pdf: Pdf) -> dict[tuple, list[tuple[Stream, bytes]]]:
+    """Collect all DCTDecode (JPEG) images from a PDF.
+
+    Walks the ``/Resources /XObject`` dictionary of every page and descends
+    into Form XObjects, up to a fixed nesting depth. An image stream that is
+    reachable more than once (for example, one shared by several pages) is
+    collected only once.
+
+    Returns a dict mapping image signatures to a list of (stream, raw_bytes) tuples.
+    The signature is (Width, Height, Filter, BitsPerComponent, ColorSpace).
+    """
+    images: dict[tuple, list[tuple[Stream, bytes]]] = {}
+    # (object number, generation) of every image stream already collected, so
+    # that a shared image is not recorded, and later rewritten, more than once.
+    seen: set[tuple[int, int]] = set()
+
+    def get_colorspace_key(obj):
+        """Get a hashable key for the colorspace."""
+        cs = obj.get(Name.ColorSpace)
+        if cs is None:
+            return None
+        if isinstance(cs, Name):
+            return str(cs)
+        # For array colorspaces like [/ICCBased ...], use the first element
+        try:
+            return str(cs[0]) if len(cs) > 0 else str(cs)
+        except (TypeError, KeyError):
+            return str(cs)
+
+    def collect_image(obj, filt):
+        """Record a DCTDecode image stream unless it was already collected."""
+        objgen = obj.objgen
+        if objgen in seen:
+            return
+        seen.add(objgen)
+        sig = (
+            int(obj.get(Name.Width, 0)),
+            int(obj.get(Name.Height, 0)),
+            str(filt),
+            int(obj.get(Name.BitsPerComponent, 0)),
+            get_colorspace_key(obj),
+        )
+        raw_bytes = obj.read_raw_bytes()
+        images.setdefault(sig, []).append((obj, raw_bytes))
+
+    def process_xobject_dict(xobjects, depth=0):
+        """Process an XObject dictionary for DCTDecode images."""
+        if xobjects is None:
+            return
+        if depth > 10:
+            log.warning("Recursion depth exceeded in _collect_dctdecode_images")
+            return
+        for key in xobjects.keys():
+            obj = xobjects[key]
+            if obj is None:
+                continue
+            # Check if it's an image with DCTDecode
+            if obj.get(Name.Subtype) == Name.Image:
+                filt = obj.get(Name.Filter)
+                if filt == Name.DCTDecode:
+                    collect_image(obj, filt)
+            # Recurse into Form XObjects
+            elif obj.get(Name.Subtype) == Name.Form:
+                if Name.Resources in obj:
+                    res = obj[Name.Resources]
+                    if Name.XObject in res:
+                        process_xobject_dict(res[Name.XObject], depth=depth + 1)
+
+    for page in pdf.pages:
+        if Name.Resources not in page:
+            continue
+        resources = page[Name.Resources]
+        if Name.XObject not in resources:
+            continue
+        process_xobject_dict(resources[Name.XObject])
+
+    return images
+
+
+def _repair_gs106_jpeg_corruption(
+    input_pdf_path: Path,
+    output_pdf_path: Path,
+) -> bool:
+    """Restore JPEG streams that Ghostscript 10.6 truncated.
+
+    Ghostscript 10.6 has a bug that truncates JPEG data by 1-15 bytes. Every
+    DCTDecode image in the output PDF is compared against the input PDF's
+    images that share its signature. When an input image is 1-15 bytes longer
+    and begins with exactly the output image's bytes, the output stream is
+    overwritten with the input bytes. The output file is saved in place only
+    if at least one image was replaced.
+
+    Args:
+        input_pdf_path: The PDF that was given to Ghostscript.
+        output_pdf_path: The PDF that Ghostscript produced; modified in place.
+
+    Returns:
+        True if any repairs were made.
+    """
+    num_repaired = 0
+
+    with (
+        Pdf.open(input_pdf_path) as input_pdf,
+        Pdf.open(output_pdf_path, allow_overwriting_input=True) as output_pdf,
+    ):
+        originals = _collect_dctdecode_images(input_pdf)
+        candidates = _collect_dctdecode_images(output_pdf)
+
+        for sig, candidate_list in candidates.items():
+            # Signatures with no counterpart in the input have nothing to match.
+            same_sig_originals = originals.get(sig, ())
+
+            for damaged_stream, damaged_bytes in candidate_list:
+                for _original_stream, original_bytes in same_sig_originals:
+                    # Only a 1-15 byte shortfall fits the known bug.
+                    missing = len(original_bytes) - len(damaged_bytes)
+                    if missing < 1 or missing > 15:
+                        continue
+
+                    # The surviving bytes must be an exact prefix of the original.
+                    if not original_bytes.startswith(damaged_bytes):
+                        continue
+
+                    # Announce the problem once, on the first repair.
+                    if num_repaired == 0:
+                        log.error(
+                            "Ghostscript 10.6 JPEG corruption detected. "
+                            "Repairing damaged images from original PDF."
+                        )
+                    log.warning(
+                        f"Replacing corrupt JPEG image "
+                        f"({sig[0]}x{sig[1]}, {missing} bytes truncated)"
+                    )
+
+                    damaged_stream.write(original_bytes, filter=Name.DCTDecode)
+                    num_repaired += 1
+                    break  # This output image is fixed; go to the next one
+
+        if num_repaired > 0:
+            output_pdf.save(output_pdf_path)
+            log.info(f"Repaired {num_repaired} JPEG image(s) corrupted by Ghostscript")
+
+    return num_repaired > 0
+
+
@hookimpl
 def generate_pdfa(
    pdf_pages,
@ -138,4 +285,11 @@ def generate_pdfa(
        progressbar_class=progressbar_class,
        stop_on_error=stop_on_soft_error,
    )
+
+    # Repair JPEG corruption caused by Ghostscript 10.6.x
+    gs_version = ghostscript.version()
+    if gs_version >= Version('10.6.0') and len(pdf_pages) == 1:
+        input_pdf = Path(pdf_pages[0])
+        _repair_gs106_jpeg_corruption(input_pdf, Path(output_file))
+
    return output_file
--- a/src/ocrmypdf/cli.py
+++ b/src/ocrmypdf/cli.py
@ -196,8 +196,8 @@ Online documentation is located at:
        "for users who want their file altered as little as possible. 'pdfa' "
        "also has problems with full Unicode text. 'pdf' minimizes changes "
        "to the input file. 'pdf-a1' creates a "
-        "PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
-        "PDF/A3-b file. 'none' will produce no output, which may be helpful if "
+        "PDF/A-1b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
+        "PDF/A-3b file. 'none' will produce no output, which may be helpful if "
        "only the --sidecar is desired.",
    )

--- a/src/ocrmypdf/extra_plugins/init.py
+++ b/src/ocrmypdf/extra_plugins/init.py
@ -2,8 +2,4 @@
 #
 # SPDX-License-Identifier: MPL-2.0

-"""Extra plugins. These are not automatically inserted when ocrmypdf is run.
-
-You can use these plugins by specifying them on the command line, e.g.:
-ocrmypdf --plugin ocrmypdf.extra_plugins.semfree ...
-"""
+"""Extra plugins. These are not automatically inserted when ocrmypdf is run."""
--- a/src/ocrmypdf/extra_plugins/semfree.py
+++ b/src/ocrmypdf/extra_plugins/semfree.py
@ -13,6 +13,9 @@ worker communicates only with the main process.
 This is not without drawbacks. If the tasks are not "even" in size, which cannot
 be guaranteed, some workers may end up with too much work while others are idle.
 It is less efficient than the standard implementation, so not the default.
+
+This module is deprecated and will be removed in a future release. The standard
+executor will fall back to threads in these environments.
 """

 from __future__ import annotations
@ -20,6 +23,7 @@ from __future__ import annotations
 import logging
 import logging.handlers
 import signal
+import warnings
 from collections.abc import Callable, Iterable, Iterator
 from contextlib import suppress
 from enum import Enum, auto
@ -32,6 +36,11 @@ from ocrmypdf._concurrent import NullProgressBar
 from ocrmypdf.exceptions import InputFileError
 from ocrmypdf.helpers import remove_all_log_handlers

+warnings.warn(
+    "semfree.py is deprecated and will be removed in a future release.",
+    DeprecationWarning,
+)
+

 class MessageType(Enum):
    """Implement basic IPC messaging."""
--- a/src/ocrmypdf/helpers.py
+++ b/src/ocrmypdf/helpers.py
@ -270,7 +270,7 @@ def check_pdf(input_file: Path) -> bool:
        with pdf:
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', message=r'pikepdf.*JBIG2.*')
-                messages = pdf.check()
+                messages = pdf.check_pdf_syntax()
            success = True
            for msg in messages:
                if 'error' in msg.lower():
--- a/src/ocrmypdf/optimize.py
+++ b/src/ocrmypdf/optimize.py
@ -17,6 +17,7 @@ from typing import Any, NamedTuple, NewType
 from zlib import compress

 import img2pdf
+from packaging.version import Version
 from pikepdf import (
    Dictionary,
    Name,
@ -32,7 +33,7 @@ from pikepdf.models.image import HifiPrintImageNotTranscodableError
 from PIL import Image

 from ocrmypdf._concurrent import Executor, SerialExecutor
-from ocrmypdf._exec import jbig2enc, pngquant
+from ocrmypdf._exec import ghostscript, jbig2enc, pngquant
 from ocrmypdf._jobcontext import PdfContext
 from ocrmypdf._progressbar import ProgressBar
 from ocrmypdf.exceptions import OutputFileAccessError
@ -42,6 +43,7 @@ log = logging.getLogger(__name__)

 DEFAULT_JPEG_QUALITY = 75
 DEFAULT_PNG_QUALITY = 70
+FLATE_JPEG_THRESHOLD = 10000


 Xref = NewType('Xref', int)
@ -126,6 +128,13 @@ def extract_image_filter(
    if Name.Decode in image:
        log.debug(f"xref {xref}: skipping image with Decode table")
        return None  # Don't mess with custom Decode tables
+    if image.get(Name.SMask, Dictionary()).get(Name.Matte, None) is not None:
+        # https://github.com/ocrmypdf/OCRmyPDF/issues/1536
+        # Do not attempt to optimize images that have a SMask with a Matte.
+        # That means alpha channel pre-blending is used, and we're not prepared
+        # to deal with the complexities of that.
+        log.debug(f"xref {xref}: skipping image whose SMask has Matte")
+        return None

    return pim, filtdp

@ -182,6 +191,16 @@ def extract_image_jbig2(
    return None


+def _should_optimize_jpeg(options, filtdp):
+    """Decide whether a JPEG image should be re-optimized.
+
+    Returns True when ``options.optimize`` is 2 or higher. At lower levels it
+    returns True only if the installed Ghostscript is 10.6.0 or newer.
+    ``filtdp`` is accepted but not consulted.
+    """
+    # Ghostscript 10.6.0+ introduced some sort of JPEG encoding issue.
+    # To resolve this, re-optimize the JPEG anyway. The version is only
+    # queried when the optimization level alone does not settle the answer.
+    return options.optimize >= 2 or ghostscript.version() >= Version('10.6.0')
+
+
 def extract_image_generic(
    *, pdf: Pdf, root: Path, image: Stream, xref: Xref, options
 ) -> XrefExt | None:
@ -195,15 +214,7 @@ def extract_image_generic(
    if pim.bits_per_component == 1:
        return None

-    if filtdp[0] == Name.DCTDecode and options.optimize >= 2:
-        # This is a simple heuristic derived from some training data, that has
-        # about a 70% chance of guessing whether the JPEG is high quality,
-        # and possibly recompressible, or not. The number itself doesn't mean
-        # anything.
-        # bytes_per_pixel = int(raw_jpeg.Length) / (w * h)
-        # jpeg_quality_estimate = 117.0 * (bytes_per_pixel ** 0.213)
-        # if jpeg_quality_estimate < 65:
-        #     return None
+    if filtdp[0] == Name.DCTDecode and _should_optimize_jpeg(options, filtdp):
        try:
            imgname = root / f'{xref:08d}'
            with imgname.open('wb') as f:
@ -521,7 +532,19 @@ def _find_deflatable_jpeg(
        return None
    _pim, filtdp = result

-    if filtdp[0] == Name.DCTDecode and not filtdp[1] and options.optimize >= 1:
+    if (
+        filtdp[0] == Name.DCTDecode
+        and not filtdp[1]
+        and (
+            (
+                # Don't flate very large images because it will slow down PDF viewers
+                1 <= options.optimize <= 2
+                and image.get(Name.Width, 0) < FLATE_JPEG_THRESHOLD
+                and image.get(Name.Height, 0) < FLATE_JPEG_THRESHOLD
+            )
+            or options.optimize == 3
+        )
+    ):
        return XrefExt(xref, '.memory')

    return None
--- a/src/ocrmypdf/pdfa.py
+++ b/src/ocrmypdf/pdfa.py
@ -120,10 +120,13 @@ def file_claims_pdfa(filename: Path):
                'output': 'pdf',
                'conformance': 'No PDF/A metadata in XMP',
            }
-        valid_part_conforms = {'1A', '1B', '2A', '2B', '2U', '3A', '3B', '3U'}
-        conformance = f'PDF/A-{pdfmeta.pdfa_status}'
+        valid_part_conforms = {'1a', '1b', '2a', '2b', '2u', '3a', '3b', '3u'}
+        # Raw value in XMP metadata returned by pikepdf is uppercase, but ISO
+        # uses lower case for conformance levels.
+        pdfa_status_iso = pdfmeta.pdfa_status.lower()
+        conformance = f'PDF/A-{pdfa_status_iso}'
        pdfa_dict: dict[str, str | bool] = {}
-        if pdfmeta.pdfa_status in valid_part_conforms:
+        if pdfa_status_iso in valid_part_conforms:
            pdfa_dict['pass'] = True
            pdfa_dict['output'] = 'pdfa'
        pdfa_dict['conformance'] = conformance
--- a/src/ocrmypdf/pdfinfo/info.py
+++ b/src/ocrmypdf/pdfinfo/info.py
@ -901,8 +901,8 @@ class PageInfo:
        width_pt = mediabox[2] - mediabox[0]
        height_pt = mediabox[3] - mediabox[1]

-        # self._artbox = [float(d) for d in page.artbox.as_list()]
-        # self._bleedbox = [float(d) for d in page.bleedbox.as_list()]
+        self._artbox = [float(d) for d in page.artbox.as_list()]
+        self._bleedbox = [float(d) for d in page.bleedbox.as_list()]
        self._cropbox = [float(d) for d in page.cropbox.as_list()]
        self._mediabox = [float(d) for d in page.mediabox.as_list()]
        self._trimbox = [float(d) for d in page.trimbox.as_list()]
@ -1039,6 +1039,16 @@ class PageInfo:
        """Return trimbox of page in PDF coordinates."""
        return self._trimbox

+    @property
+    def artbox(self) -> FloatRect:
+        """Return artbox of page in PDF coordinates.
+
+        The coordinates are floats taken from ``page.artbox``.
+        """
+        return self._artbox
+
+    @property
+    def bleedbox(self) -> FloatRect:
+        """Return bleedbox of page in PDF coordinates.
+
+        The coordinates are floats taken from ``page.bleedbox``.
+        """
+        return self._bleedbox
+
    @property
    def images(self) -> list[ImageInfo]:
        """Return images."""
--- a/src/ocrmypdf/pluginspec.py
+++ b/src/ocrmypdf/pluginspec.py
@ -492,7 +492,7 @@ def generate_pdfa(
        pdf_version: The minimum PDF version that the output file should be.
            At its own discretion, the PDF/A generator may raise the version,
            but should not lower it.
-        pdfa_part: The desired PDF/A compliance level, such as ``'2B'``.
+        pdfa_part: The desired PDF/A compliance level, such as ``'2b'``.
        progressbar_class: The class of a progress bar, which must implement
            the ProgressBar protocol. If None, no progress is reported.
        stop_on_soft_error: If there is an "soft error" such that PDF/A generation
--- a/tests/test_ghostscript.py
+++ b/tests/test_ghostscript.py
@ -6,6 +6,7 @@ from __future__ import annotations
 import logging
 import secrets
 import subprocess
+import sys
 from decimal import Decimal
 from unittest.mock import patch

@ -16,6 +17,7 @@ from PIL import Image, UnidentifiedImageError

 from ocrmypdf._exec import ghostscript
 from ocrmypdf._exec.ghostscript import DuplicateFilter, rasterize_pdf
+from ocrmypdf.builtin_plugins.ghostscript import _repair_gs106_jpeg_corruption
 from ocrmypdf.exceptions import ColorConversionNeededError, ExitCode, InputFileError
 from ocrmypdf.helpers import Resolution

@ -165,6 +167,10 @@ class TestDuplicateFilter:
        logger.addFilter(DuplicateFilter(logger))
        return logger

+    @pytest.mark.xfail(
+        (3, 13, 3) <= sys.version_info[:3] <= (3, 13, 5),
+        reason="https://github.com/python/cpython/pull/135858",
+    )
    def test_filter_duplicate_messages(self, duplicate_filter_logger, caplog):
        log = duplicate_filter_logger
        log.error("test error message")
@ -194,6 +200,10 @@ class TestDuplicateFilter:
        assert caplog.records[1].msg == "another error message"
        assert caplog.records[2].msg == "yet another error message"

+    @pytest.mark.xfail(
+        (3, 13, 3) <= sys.version_info[:3] <= (3, 13, 5),
+        reason="https://github.com/python/cpython/pull/135858",
+    )
    def test_filter_alt_messages(self, duplicate_filter_logger, caplog):
        log = duplicate_filter_logger
        log.error("test error message")
@ -278,3 +288,120 @@ def test_recoverable_image_error_with_stop(pdf_with_invalid_image, outdir, caplo
            stop_on_error=True,
        )
    # out2.png will not be created; if it were it would be blank.
+
+
+class TestGs106JpegCorruptionRepair:
+    """Test the Ghostscript 10.6 JPEG corruption repair function."""
+
+    @staticmethod
+    def _iter_dctdecode_images(pdf):
+        """Yield each DCTDecode image in the pages' top-level XObject dictionaries."""
+        Name = pikepdf.Name
+        for page in pdf.pages:
+            if Name.Resources not in page:
+                continue
+            resources_dict = page[Name.Resources]
+            if Name.XObject not in resources_dict:
+                continue
+            xobjects = resources_dict[Name.XObject]
+            for key in xobjects.keys():
+                obj = xobjects[key]
+                if obj.get(Name.Subtype) != Name.Image:
+                    continue
+                if obj.get(Name.Filter) != Name.DCTDecode:
+                    continue
+                yield obj
+
+    def _read_dctdecode_bytes(self, pdf_path):
+        """Return the raw bytes of every DCTDecode image in the PDF at pdf_path."""
+        with pikepdf.open(pdf_path) as pdf:
+            return [obj.read_raw_bytes() for obj in self._iter_dctdecode_images(pdf)]
+
+    @pytest.fixture
+    def create_damaged_pdf(self, resources, outdir):
+        """Return a factory that writes a PDF whose JPEG data is truncated.
+
+        The factory takes the source PDF name and the number of bytes to cut
+        from the end of each DCTDecode image (2 by default), and returns
+        ``(source_path, damaged_path, damaged_count)``.
+        """
+
+        def _create_damaged(source_pdf_name='francais.pdf', truncate_bytes=2):
+            source_path = resources / source_pdf_name
+            damaged_path = outdir / 'damaged.pdf'
+
+            with pikepdf.open(source_path) as pdf:
+                damaged_count = 0
+                for obj in self._iter_dctdecode_images(pdf):
+                    # Truncate the JPEG data
+                    original_bytes = obj.read_raw_bytes()
+                    truncated_bytes = original_bytes[:-truncate_bytes]
+                    obj.write(truncated_bytes, filter=pikepdf.Name.DCTDecode)
+                    damaged_count += 1
+
+                pdf.save(damaged_path)
+                return source_path, damaged_path, damaged_count
+
+        return _create_damaged
+
+    def test_repair_truncated_jpeg(self, create_damaged_pdf, caplog):
+        """Test that truncated JPEG images are repaired."""
+        caplog.set_level(logging.DEBUG)
+        source_path, damaged_path, damaged_count = create_damaged_pdf()
+
+        assert damaged_count > 0, "Test PDF should have DCTDecode images"
+
+        # Get original image bytes for comparison
+        original_bytes_list = self._read_dctdecode_bytes(source_path)
+
+        # Run the repair function
+        repaired = _repair_gs106_jpeg_corruption(source_path, damaged_path)
+        assert repaired is True, "Repair should have been performed"
+
+        # Verify the repaired PDF has correct image bytes
+        repaired_bytes_list = self._read_dctdecode_bytes(damaged_path)
+
+        assert len(repaired_bytes_list) == len(original_bytes_list)
+        for orig, repaired_bytes in zip(original_bytes_list, repaired_bytes_list):
+            assert orig == repaired_bytes, "Repaired bytes should match original"
+
+        # Check that error/warning was logged
+        assert "JPEG corruption detected" in caplog.text
+
+    def test_no_repair_when_not_truncated(self, resources, outdir, caplog):
+        """Test that no repair is done when images are not truncated."""
+        caplog.set_level(logging.DEBUG)
+        source_path = resources / 'francais.pdf'
+
+        # Copy source to output (no damage)
+        output_path = outdir / 'undamaged.pdf'
+        with pikepdf.open(source_path) as pdf:
+            pdf.save(output_path)
+
+        # Run the repair function - should not repair anything
+        repaired = _repair_gs106_jpeg_corruption(source_path, output_path)
+        assert repaired is False, "No repair should have been performed"
+        assert "JPEG corruption detected" not in caplog.text
+
+    def test_no_repair_when_truncation_too_large(self, create_damaged_pdf, caplog):
+        """Test that images truncated by more than 15 bytes are not repaired."""
+        caplog.set_level(logging.DEBUG)
+        source_path, damaged_path, _ = create_damaged_pdf(truncate_bytes=20)
+
+        repaired = _repair_gs106_jpeg_corruption(source_path, damaged_path)
+        assert repaired is False, "Should not repair truncation > 15 bytes"
+        assert "JPEG corruption detected" not in caplog.text
--- a/tests/test_main.py
+++ b/tests/test_main.py
@ -801,7 +801,7 @@ def test_pdfa_n(pdfa_level, resources, outpdf):
    )

    pdfa_info = file_claims_pdfa(outpdf)
-    assert pdfa_info['conformance'] == f'PDF/A-{pdfa_level}B'
+    assert pdfa_info['conformance'] == f'PDF/A-{pdfa_level}b'


 def test_decompression_bomb_error(resources, outpdf, caplog):
--- a/tests/test_optimize.py
+++ b/tests/test_optimize.py
@ -6,7 +6,7 @@ from __future__ import annotations
 from io import BytesIO
 from os import fspath
 from pathlib import Path
-from unittest.mock import MagicMock, patch
+from unittest.mock import patch

 import img2pdf
 import pikepdf
@ -220,7 +220,7 @@ def test_find_formx(resources):


 def test_extract_image_filter_with_pdf_image():
-    image = MagicMock()
+    image = Dictionary()
    image.Subtype = Name.Image
    image.Length = 200
    image.Width = 10
@ -235,20 +235,20 @@ def test_extract_image_filter_with_pdf_image():


 def test_extract_image_filter_with_non_image():
-    image = MagicMock()
+    image = Dictionary()
    image.Subtype = Name.Form
    assert extract_image_filter(image, None) is None


 def test_extract_image_filter_with_small_stream_size():
-    image = MagicMock()
+    image = Dictionary()
    image.Subtype = Name.Image
    image.Length = 50
    assert extract_image_filter(image, None) is None


 def test_extract_image_filter_with_small_dimensions():
-    image = MagicMock()
+    image = Dictionary()
    image.Subtype = Name.Image
    image.Length = 200
    image.Width = 5
@ -257,7 +257,7 @@ def test_extract_image_filter_with_small_dimensions():


 def test_extract_image_filter_with_multiple_compression_filters():
-    image = MagicMock()
+    image = Dictionary()
    image.Subtype = Name.Image
    image.Length = 200
    image.Width = 10
@ -268,7 +268,7 @@ def test_extract_image_filter_with_multiple_compression_filters():


 def test_extract_image_filter_with_wide_gamut_image():
-    image = MagicMock()
+    image = Dictionary()
    image.Subtype = Name.Image
    image.Length = 200
    image.Width = 10
@ -296,7 +296,7 @@ def test_extract_image_filter_with_jpeg2000_image():


 def test_extract_image_filter_with_ccitt_group_3_image():
-    image = MagicMock()
+    image = Dictionary()
    image.Subtype = Name.Image
    image.Length = 200
    image.Width = 10
@ -309,7 +309,7 @@ def test_extract_image_filter_with_ccitt_group_3_image():

 # Triggers pikepdf bug
 # def test_extract_image_filter_with_decode_table():
-#     image = MagicMock()
+#     image = Dictionary()
 #     image.Subtype = Name.Image
 #     image.Length = 200
 #     image.Width = 10
@ -319,3 +319,26 @@ def test_extract_image_filter_with_ccitt_group_3_image():
 #     image.ColorSpace = Name.DeviceGray
 #     image.Decode = [42, 0]
 #     assert extract_image_filter(image, None) is None
+
+
+def test_extract_image_filter_with_rgb_smask_matte():
+    """An image whose SMask has a Matte entry must not be offered for optimization."""
+    image = Dictionary()
+    image.Subtype = Name.Image
+    image.Length = 200
+    image.Width = 10
+    image.Height = 10
+    image.Filter = Name.FlateDecode
+    image.BitsPerComponent = 8
+    image.ColorSpace = Name.DeviceRGB
+    # The soft mask carries /Matte, i.e. the image is pre-blended with alpha.
+    image.SMask = Dictionary(
+        Type=Name.Image,
+        Subtype=Name.Image,
+        Length=200,
+        Width=10,
+        Height=10,
+        Filter=Name.FlateDecode,
+        BitsPerComponent=8,
+        ColorSpace=Name.DeviceGray,
+        Matte=Array([1, 2, 3]),
+    )
+    assert extract_image_filter(image, None) is None
--- a/tests/test_page_boxes.py
+++ b/tests/test_page_boxes.py
@ -0,0 +1,122 @@
+# SPDX-FileCopyrightText: 2025 James R. Barlow
+# SPDX-License-Identifier: MPL-2.0
+
+from __future__ import annotations
+
+import pikepdf
+import pytest
+
+from .conftest import check_ocrmypdf
+
+# Rectangles are [x0, y0, x1, y1].
+page_rect = [0, 0, 612, 792]
+# Lower-left corner moved to (200, 200); upper-right corner unchanged.
+inset_rect = [200, 200, 612, 792]
+# Same width and height as inset_rect (412 x 592), but anchored at the origin.
+wh_rect = [0, 0, 412, 592]
+
+# Same width and height as page_rect (612 x 792), shifted to a negative origin.
+neg_rect = [-100, -100, 512, 692]
+
+# Each row is (renderer, output_type, in_pdf, mode, crop_to, crop_expected):
+# crop_to is written to the input page's MediaBox and crop_expected is the
+# MediaBox the output page must have. mode is an extra command line argument,
+# or None for no extra argument.
+mediabox_testdata = [
+    ('hocr', 'pdfa', 'ccitt.pdf', None, inset_rect, wh_rect),
+    ('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, wh_rect),
+    ('hocr', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
+    ('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
+    (
+        'hocr',
+        'pdfa',
+        'ccitt.pdf',
+        '--force-ocr',
+        inset_rect,
+        wh_rect,
+    ),
+    (
+        'hocr',
+        'pdf',
+        'ccitt.pdf',
+        '--force-ocr',
+        inset_rect,
+        wh_rect,
+    ),
+    ('hocr', 'pdfa', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
+    ('hocr', 'pdf', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
+]
+
+
+@pytest.mark.parametrize(
+    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', mediabox_testdata
+)
+def test_media_box(
+    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
+):
+    """Set the first page's MediaBox, run OCRmyPDF, and check the output MediaBox."""
+    cropped_pdf = outdir / 'cropped.pdf'
+    processed_pdf = outdir / 'processed.pdf'
+
+    # Write a copy of the input whose first page has the MediaBox under test.
+    with pikepdf.open(resources / in_pdf) as pdf:
+        pdf.pages[0].MediaBox = crop_to
+        pdf.save(cropped_pdf)
+
+    args = ['--jobs', '1', '--pdf-renderer', renderer, '--output-type', output_type]
+    if mode:
+        args.append(mode)
+
+    check_ocrmypdf(cropped_pdf, processed_pdf, *args)
+
+    with pikepdf.open(processed_pdf) as pdf:
+        assert pdf.pages[0].MediaBox == crop_expected
+
+
+# Each row is (renderer, output_type, in_pdf, mode, crop_to, crop_expected):
+# crop_to is written to the input page's CropBox and crop_expected is the
+# CropBox the output page must have. Every row expects the CropBox unchanged.
+cropbox_testdata = [
+    ('hocr', 'pdfa', 'ccitt.pdf', None, inset_rect, inset_rect),
+    ('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, inset_rect),
+    ('hocr', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
+    ('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
+    (
+        'hocr',
+        'pdfa',
+        'ccitt.pdf',
+        '--force-ocr',
+        inset_rect,
+        inset_rect,
+    ),
+    (
+        'hocr',
+        'pdf',
+        'ccitt.pdf',
+        '--force-ocr',
+        inset_rect,
+        inset_rect,
+    ),
+]
+
+
+@pytest.mark.parametrize(
+    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', cropbox_testdata
+)
+def test_crop_box(
+    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
+):
+    """Set the first page's CropBox, run OCRmyPDF, and check the output CropBox."""
+    cropped_pdf = outdir / 'cropped.pdf'
+    processed_pdf = outdir / 'processed.pdf'
+
+    # Write a copy of the input whose first page has the CropBox under test.
+    with pikepdf.open(resources / in_pdf) as pdf:
+        pdf.pages[0].CropBox = crop_to
+        pdf.save(cropped_pdf)
+
+    args = ['--jobs', '1']
+    args += ['--pdf-renderer', renderer]
+    args += ['--output-type', output_type]
+    args += ['--optimize', '0']
+    if mode:
+        args.append(mode)
+
+    check_ocrmypdf(cropped_pdf, processed_pdf, *args)
+
+    with pikepdf.open(processed_pdf) as pdf:
+        assert pdf.pages[0].CropBox == crop_expected
--- a/tests/test_rotation.py
+++ b/tests/test_rotation.py
@ -51,20 +51,20 @@ def compare_images_monochrome(

    with Image.open(reference_png) as reference_im, Image.open(test_png) as test_im:
        assert reference_im.mode == test_im.mode == '1'
+        assert reference_im.size == test_im.size, "Images must be the same size"
+
+        # XOR the images: matching pixels become 0, different pixels become 1
        difference = ImageChops.logical_xor(reference_im, test_im)
-        assert difference.mode == '1'

-        histogram = difference.histogram()
-        assert (
-            len(histogram) == 256
-        ), "Expected Pillow to convert to grayscale for histogram"
-
-        # All entries other than first and last will be 0
-        count_same = histogram[0]
-        count_different = histogram[-1]
+        # Count matching pixels directly using getcolors()
+        # For a binary image, getcolors returns [(count, 0), (count, 1)] or subset
+        colors = difference.getcolors()
+        color_counts = {color: count for count, color in colors}
+        count_same = color_counts.get(0, 0)  # 0 = matching pixels (XOR result is 0)
+        count_different = color_counts.get(255, 0)  # 255 = different pixels
        total = count_same + count_different

-        return count_same / (total)
+        return count_same / total


 def test_monochrome_comparison(resources, outdir):
@ -211,7 +211,7 @@ def test_rotate_deskew_ocr_timeout(resources, outdir):
    assert cmp > 0.95


-def make_rotate_test(imagefile, outdir, prefix, image_angle, page_angle):
+def make_rotate_test(imagefile, outdir, prefix, image_angle, page_angle, cropbox=None):
    memimg = BytesIO()
    with Image.open(fspath(imagefile)) as im:
        if image_angle != 0:
@ -230,6 +230,8 @@ def make_rotate_test(imagefile, outdir, prefix, image_angle, page_angle):
    with pikepdf.open(mempdf) as pdf:
        pdf.pages[0].Rotate = page_angle
        target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
+        if cropbox:
+            pdf.pages[0].CropBox = cropbox
        pdf.save(target)
        return target

@ -284,6 +286,44 @@ def test_page_rotate_tag(page_rotate_angle, resources, outdir, caplog):
    assert 'is a' in test_text, test_text


+@pytest.mark.parametrize('page_rotate_angle', (0, 90, 180, 270))
+@pytest.mark.parametrize('renderer', ['sandwich', 'hocr'])
+@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
+def test_rotate_and_crop(
+    resources, outdir, page_rotate_angle, renderer, output_type, caplog
+):
+    cropbox = (100, 200, 1000, 800)
+    reference = make_rotate_test(
+        resources / 'typewriter.png', outdir, 'ref', 0, 0, cropbox
+    )
+    test = make_rotate_test(
+        resources / 'typewriter.png',
+        outdir,
+        'test',
+        -page_rotate_angle,
+        page_rotate_angle,
+        cropbox,
+    )
+    out = test.with_suffix('.out.pdf')
+
+    exitcode = run_ocrmypdf_api(
+        test,
+        out,
+        '-O0',
+        '--rotate-pages',
+        '--rotate-pages-threshold',
+        '0',
+        '--pdf-renderer',
+        renderer,
+        '--output-type',
+        output_type,
+        '--no-progress-bar',
+    )
+    assert exitcode == 0, caplog.text
+
+    assert compare_images_monochrome(outdir, reference, 1, out, 1) > 0.9
+
+
 def test_rasterize_rotates(resources, tmp_path):
    pm = get_plugin_manager([])

--- a/tests/test_semfree.py
+++ b/tests/test_semfree.py
@ -3,6 +3,8 @@

 from __future__ import annotations

+import sys
+
 import pytest

 from ocrmypdf.exceptions import ExitCode
@ -11,16 +13,21 @@ from .conftest import is_linux, run_ocrmypdf_api


@pytest.mark.skipif(not is_linux(), reason='semfree plugin only works on Linux')
+@pytest.mark.skipif(
+    sys.version_info >= (3, 14),
+    reason='semfree plugin only works on Python 3.13 or earlier',
+)
 def test_semfree(resources, outpdf):
-    exitcode = run_ocrmypdf_api(
-        resources / 'multipage.pdf',
-        outpdf,
-        '--skip-text',
-        '--skip-big',
-        '2',
-        '--plugin',
-        'ocrmypdf.extra_plugins.semfree',
-        '--plugin',
-        'tests/plugins/tesseract_noop.py',
-    )
-    assert exitcode in (ExitCode.ok, ExitCode.pdfa_conversion_failed)
+    with pytest.warns(DeprecationWarning, match="semfree.py is deprecated"):
+        exitcode = run_ocrmypdf_api(
+            resources / 'multipage.pdf',
+            outpdf,
+            '--skip-text',
+            '--skip-big',
+            '2',
+            '--plugin',
+            'ocrmypdf.extra_plugins.semfree',
+            '--plugin',
+            'tests/plugins/tesseract_noop.py',
+        )
+        assert exitcode in (ExitCode.ok, ExitCode.pdfa_conversion_failed)
--- a/tests/test_stdio.py
+++ b/tests/test_stdio.py
@ -48,6 +48,7 @@ def test_stdout(ocrmypdf_exec, resources, outpdf):
    assert check_pdf(output_file)


+@pytest.mark.skipif(os.name == 'nt', reason='Windows does not support /dev/null')
 def test_dev_null(resources):
    if 'COV_CORE_DATAFILE' in os.environ:
        pytest.skip("Coverage uses stdout")
--- a/tests/test_watcher.py
+++ b/tests/test_watcher.py
@ -0,0 +1,54 @@
+import datetime
+import os
+import shutil
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+import pytest
+
+watchdog = pytest.importorskip('watchdog')
+
+
+@pytest.mark.parametrize('year_month', [True, False])
+def test_watcher(tmp_path, resources, year_month):
+    input_dir = tmp_path / 'input'
+    input_dir.mkdir()
+    output_dir = tmp_path / 'output'
+    output_dir.mkdir()
+    processed_dir = tmp_path / 'processed'
+    processed_dir.mkdir()
+
+    if year_month:
+        env_extra = {'OCR_OUTPUT_DIRECTORY_YEAR_MONTH': '1'}
+    else:
+        env_extra = {}
+    proc = subprocess.Popen(
+        [
+            sys.executable,
+            Path(__file__).parent.parent / 'misc' / 'watcher.py',
+            str(input_dir),
+            str(output_dir),
+            str(processed_dir),
+        ],
+        cwd=str(tmp_path),
+        env=os.environ.copy() | env_extra,
+    )
+    time.sleep(5)
+
+    shutil.copy(resources / 'trivial.pdf', input_dir / 'trivial.pdf')
+    time.sleep(5)
+
+    if year_month:
+        assert (
+            output_dir
+            / f'{datetime.date.today().year}'
+            / f'{datetime.date.today().month:02d}'
+            / 'trivial.pdf'
+        ).exists()
+    else:
+        assert (output_dir / 'trivial.pdf').exists()
+
+    proc.terminate()
+    proc.wait()
--- a/uv.lock
+++ b/uv.lock
Author	SHA1	Message	Date
SuperCowProducts	8930efe787	Update README with Fedora installation instructions (#1610 ) Added instructions for Fedora users to install Tesseract language packs.	2025-12-27 01:15:45 -08:00
James R. Barlow	c540967429	docs: Update release notes	2025-12-23 15:44:44 -08:00
James R. Barlow	195344d307	Reinstate "Work around Ghostscript 10.6.0 JPEG encoding issue by forcing optimization."" This reverts commit fc30cb8903c9c91ea64e0d0ee9302dc8ebb8c178. It turns out that both fixes were necessary.	2025-12-23 15:41:34 -08:00
James R. Barlow	de63d6eac9	Merge remote-tracking branches 'origin/dependabot/github_actions/actions/download-artifact-7', 'origin/dependabot/github_actions/actions/upload-artifact-6', 'origin/dependabot/github_actions/sigstore/gh-action-sigstore-python-3.2.0' and 'origin/dependabot/github_actions/actions/checkout-6'	2025-12-23 15:06:50 -08:00
James R. Barlow	6ada11ddae	docs: Update release notes	2025-12-23 15:05:49 -08:00
James R. Barlow	fc30cb8903	Revert "Work around Ghostscript 10.6.0 JPEG encoding issue by forcing optimization." This reverts commit f4c6c8121ba8178ff3a1cb8f70037bbc3a31391b. The issue is now resolved by correcting the encoidng issue directly.	2025-12-23 15:03:51 -08:00
James R. Barlow	01a3706281	docs: Add release notes for v16.13.0	2025-12-23 15:01:22 -08:00
James R. Barlow	e613db6a82	Fix Ghostscript 10.6 JPEG corruption by repairing truncated images Ghostscript 10.6 has a bug that truncates JPEG data by 1-15 bytes. This adds detection and repair by comparing output images to input images and restoring the original bytes when truncation is detected. - Add warning when GS 10.6+ is used with PDF/A output - Add _repair_gs106_jpeg_corruption() to fix damaged JPEGs after Ghostscript processing - Add unit tests for the repair function	2025-12-23 14:56:24 -08:00
James R. Barlow	742a4bac17	Make rotation test more robust	2025-12-23 11:20:57 -08:00
James R. Barlow	4c1ef0b471	Also process art and bleed boxes	2025-12-23 11:20:41 -08:00
James R. Barlow	eace567f7b	Test and fix page box issues	2025-12-23 11:19:51 -08:00
dependabot[bot]	cdf956ffc4	Bump actions/download-artifact from 6 to 7 Bumps [actions/download-artifact](https://github.com/actions/download-artifact) from 6 to 7. - [Release notes](https://github.com/actions/download-artifact/releases) - [Commits](https://github.com/actions/download-artifact/compare/v6...v7) --- updated-dependencies: - dependency-name: actions/download-artifact dependency-version: '7' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2025-12-15 10:02:30 +00:00
dependabot[bot]	c6b21d4dea	Bump actions/upload-artifact from 5 to 6 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 5 to 6. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/v5...v6) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2025-12-15 10:02:24 +00:00
dependabot[bot]	f673da9ab9	Bump sigstore/gh-action-sigstore-python from 3.1.0 to 3.2.0 Bumps [sigstore/gh-action-sigstore-python](https://github.com/sigstore/gh-action-sigstore-python) from 3.1.0 to 3.2.0. - [Release notes](https://github.com/sigstore/gh-action-sigstore-python/releases) - [Changelog](https://github.com/sigstore/gh-action-sigstore-python/blob/main/CHANGELOG.md) - [Commits](https://github.com/sigstore/gh-action-sigstore-python/compare/v3.1.0...v3.2.0) --- updated-dependencies: - dependency-name: sigstore/gh-action-sigstore-python dependency-version: 3.2.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2025-12-08 10:02:38 +00:00
rugk	8d715c4157	docs: fix and clarify podman usage instructions (#1601 ) * docs: fix and clarify podman usage instructions * the full reference `jbarlow83/ocrmypdf-alpine` as in the other commands may fix an issue if you do not have `ocrmypdf` already downloaded locally * also clarified the command at the end for usage when SELinux is enabled * docs: clarify difference between SeLinux and rootless user mapping	2025-12-01 13:07:09 -08:00
dependabot[bot]	0f3c7765aa	Bump actions/checkout from 5 to 6 Bumps [actions/checkout](https://github.com/actions/checkout) from 5 to 6. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v5...v6) --- updated-dependencies: - dependency-name: actions/checkout dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2025-11-24 11:31:33 +00:00
Chris Mayo	9dbce33ee6	Update Changelog URL (#1597 ) Renamed in: d1a45e4a ("Convert remaining rst -> md", 2025-04-17)	2025-11-16 23:10:48 -08:00
James R. Barlow	54ce09496c	v16.12.0 release notes	2025-11-11 13:48:06 -08:00
James R. Barlow	f4c6c8121b	Work around Ghostscript 10.6.0 JPEG encoding issue by forcing optimization. Not an ideal fix, but it improves an issue affecting numerous users. Fixes #1585.	2025-11-10 17:01:02 -08:00
James R. Barlow	057eaff36d	Skip devnull testing on Windows No longer seems to work - Windows Server 2025 change, perhaps? Doesn't really matter.	2025-11-10 16:57:30 -08:00
James R. Barlow	b88d63bdf7	Add Python 3.14 to test matrix	2025-11-10 16:10:01 -08:00
James R. Barlow	a385cd967d	docs: Improve ocrmypdf.api	2025-11-10 15:58:47 -08:00
James R. Barlow	2f72f8e94a	ghostscript: Disable subset fonts For at least the PDF associated with this issue, disabling subset fonts prevents Ghostscript from mangling the encoding when it is usable but not well-formed. Fixes #1592	2025-11-10 15:58:14 -08:00
James R. Barlow	ee47e986f3	docs: Improve module-level docstring for OCRmyPDF Python API Co-authored-by: aider (anthropic/claude-sonnet-4-20250514) <aider@aider.chat>	2025-11-10 10:33:26 -08:00
James R. Barlow	e44063da15	Update Dockerfile versions tesseract-ocr/alex-p does not have a Tesseract 5 for Ubuntu 25.10 so we use 25.04 for now. Ubuntu 25.04 gets us Ghostscript 10.05 which avoids issues in older versions. Remove comment about now-legacy Alpine versions not working properly. Alpine provides Ghostscript 10.05.1. Fixes #1587,	2025-11-09 15:20:55 -08:00
James R. Barlow	abc2d41e2d	Require recent pikepdf to fix check_pdf_syntax issue	2025-10-29 11:40:51 -07:00
James R. Barlow	38d60ea89b	optimize: don't put flate on large jpegs unless compression is high Putting flate on very large JPEGs can cause performance problems in PDF viewers, subjectively anyway.	2025-10-29 11:39:20 -07:00
James R. Barlow	35ec90af44	Merge remote-tracking branches 'origin/dependabot/github_actions/sigstore/gh-action-sigstore-python-3.1.0', 'origin/dependabot/github_actions/actions/upload-artifact-5' and 'origin/dependabot/github_actions/actions/download-artifact-6'	2025-10-28 13:40:08 -07:00
James R. Barlow	aa1cc8ae04	Update packages	2025-10-27 17:07:14 -07:00
dependabot[bot]	eaceb66030	Bump sigstore/gh-action-sigstore-python from 3.0.1 to 3.1.0 Bumps [sigstore/gh-action-sigstore-python](https://github.com/sigstore/gh-action-sigstore-python) from 3.0.1 to 3.1.0. - [Release notes](https://github.com/sigstore/gh-action-sigstore-python/releases) - [Changelog](https://github.com/sigstore/gh-action-sigstore-python/blob/main/CHANGELOG.md) - [Commits](https://github.com/sigstore/gh-action-sigstore-python/compare/v3.0.1...v3.1.0) --- updated-dependencies: - dependency-name: sigstore/gh-action-sigstore-python dependency-version: 3.1.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2025-10-27 11:08:50 +00:00
dependabot[bot]	b1dcc2c445	Bump actions/upload-artifact from 4 to 5 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4 to 5. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/v4...v5) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-version: '5' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2025-10-27 11:02:08 +00:00
dependabot[bot]	ab3855af48	Bump actions/download-artifact from 5 to 6 Bumps [actions/download-artifact](https://github.com/actions/download-artifact) from 5 to 6. - [Release notes](https://github.com/actions/download-artifact/releases) - [Commits](https://github.com/actions/download-artifact/compare/v5...v6) --- updated-dependencies: - dependency-name: actions/download-artifact dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2025-10-27 10:47:34 +00:00
James R. Barlow	5c6cc4031f	Merge remote-tracking branch 'origin/dependabot/github_actions/astral-sh/setup-uv-7'	2025-10-25 12:10:01 -07:00
James R. Barlow	f181307e50	v16.11.1 release notes	2025-10-16 10:59:13 +02:00
James R. Barlow	b213efb030	Account for new deskew output error message from recent Tesseract Fixes #1576	2025-10-16 09:50:03 +02:00
James R. Barlow	f59e68911f	Drop macos-13 (now unsupported by Apple)	2025-10-13 15:10:28 +02:00
dependabot[bot]	9605656a2f	Bump astral-sh/setup-uv from 6 to 7 Bumps [astral-sh/setup-uv](https://github.com/astral-sh/setup-uv) from 6 to 7. - [Release notes](https://github.com/astral-sh/setup-uv/releases) - [Commits](https://github.com/astral-sh/setup-uv/compare/v6...v7) --- updated-dependencies: - dependency-name: astral-sh/setup-uv dependency-version: '7' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2025-10-13 10:40:31 +00:00
James R. Barlow	599fb1a1f6	Fix test_semfree (skip Python 3.14) This feature is now deprecated and won't be fixed for Python 3.14. Instead we just use threads on platforms that don't support semaphores. Closes #1558	2025-09-14 13:02:33 -07:00
James R. Barlow	9a2c0cf6ff	v16.11.0 release notes	2025-09-12 00:08:11 -07:00
James R. Barlow	414d80fc16	Deprecate semfree and don't auto activate it Instead the standard executor will fall back to threads. semfree caused test failures with Py3.14: https://github.com/ocrmypdf/OCRmyPDF/issues/1558 In retrospect and with emerging Python tech like freethreading, semfree is becoming less necessary. We can use threads for the time being. A consequence is that performance may be lower on Lambda and Termux when we are using threads and not shelling out work.	2025-09-11 17:13:04 -07:00
James R. Barlow	7ca4ae4e16	Merge branch 'feature/pdfa-naming'	2025-09-11 16:37:53 -07:00
James R. Barlow	7e7e2f2e91	Raw value in pdfa XML block uses upper case codes, so account for this	2025-09-08 12:46:26 -07:00
clach04	d07231a7aa	Doc typo plugins.md (#1568 )	2025-09-08 12:07:51 -07:00
dependabot[bot]	0e831db9f4	Bump actions/setup-python from 5 to 6 (#1569 ) Bumps [actions/setup-python](https://github.com/actions/setup-python) from 5 to 6. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v5...v6) --- updated-dependencies: - dependency-name: actions/setup-python dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-09-08 12:07:28 -07:00
5HT2	650ca1c65b	docs: Update screencast demo output to have corrected references to PDF/A compliance levels See a7b0c0df6ccdb2fa802d8af5af46c1bbe7df13da for more information	2025-08-31 20:54:08 +01:00
5HT2	a7b0c0df6c	fix(src): Refactor CLI help references to PDF/A compliance levels Please see [RFC8118 4.](https://datatracker.ietf.org/doc/html/rfc8118#section-4) for examples regarding the PDF/A compliance naming scheme. Please see [RFC8118 [ISOPDFA]](https://datatracker.ietf.org/doc/html/rfc8118#ref-ISOPDFA) for more complete information regarding the PDF/A compliance naming scheme.	2025-08-31 20:37:41 +01:00
5HT2	d735791524	fix(src): Refactor `valid_part_conforms` for PDF/A compliance levels	2025-08-31 20:32:30 +01:00
dependabot[bot]	66308c2813	Bump actions/download-artifact from 4 to 5 (#1557 ) Bumps [actions/download-artifact](https://github.com/actions/download-artifact) from 4 to 5. - [Release notes](https://github.com/actions/download-artifact/releases) - [Commits](https://github.com/actions/download-artifact/compare/v4...v5) --- updated-dependencies: - dependency-name: actions/download-artifact dependency-version: '5' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-08-18 13:43:34 -07:00
dependabot[bot]	d81de57bbc	Bump actions/checkout from 4 to 5 (#1560 ) Bumps [actions/checkout](https://github.com/actions/checkout) from 4 to 5. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v4...v5) --- updated-dependencies: - dependency-name: actions/checkout dependency-version: '5' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-08-18 13:43:10 -07:00
Alina Bürge	a9a8b39dba	Fix the use of the plugin_manager argument (#1555 )	2025-08-18 13:00:39 -07:00
Stuart Henderson	fd5b8132ae	add OpenBSD info to readme (#1554 )	2025-08-18 12:49:21 -07:00
James R. Barlow	63675c21ce	Remove PyPy from test matrix	2025-08-18 12:15:32 -07:00
James R. Barlow	6af22051a8	Avoid call to deprecated pdf.check() where possible	2025-08-13 01:15:33 -07:00
James R. Barlow	8318ebbaec	Merge branch 'main' of github.com:ocrmypdf/OCRmyPDF	2025-08-13 01:05:02 -07:00
James R. Barlow	4fc0c3a0d5	Add watcher test, such as it is	2025-08-13 01:04:58 -07:00
Christoph Dyllick-Brenzinger	74305e8741	Update batch.md (#1552 ) Add two missing available parameters for watcher.py (used with docker): - OCR_LOGLEVEL - OCR_JSON_SETTINGS	2025-08-05 14:11:55 -07:00
Máté Gyöngyösi	d6b069d3fa	Unify `--tesseract-timeout` flag syntax (#1546 ) As pointed out at https://github.com/tldr-pages/tldr/pull/17175#discussion_r2192340014.	2025-07-08 11:40:58 -07:00
James R. Barlow	194ca699a8	v16.10.4 release notes	2025-07-07 12:36:15 -07:00
James R. Barlow	175b743ffe	Fix version test	2025-07-03 11:30:05 -07:00
James R. Barlow	080b73e7c0	Merge remote-tracking branch 'origin/main'	2025-07-03 09:22:20 -07:00
James R. Barlow	df6079c06d	Merge remote-tracking branch 'origin/dependabot/github_actions/sigstore/gh-action-sigstore-python-3.0.1'	2025-07-03 09:21:44 -07:00
James R. Barlow	45cf92f40b	xfail Python logging bug in 3.13.3/4	2025-07-03 09:21:31 -07:00
dependabot[bot]	5b1900beec	Bump sigstore/gh-action-sigstore-python from 3.0.0 to 3.0.1 (#1541 ) Bumps [sigstore/gh-action-sigstore-python](https://github.com/sigstore/gh-action-sigstore-python) from 3.0.0 to 3.0.1. - [Release notes](https://github.com/sigstore/gh-action-sigstore-python/releases) - [Changelog](https://github.com/sigstore/gh-action-sigstore-python/blob/main/CHANGELOG.md) - [Commits](https://github.com/sigstore/gh-action-sigstore-python/compare/v3.0.0...v3.0.1) --- updated-dependencies: - dependency-name: sigstore/gh-action-sigstore-python dependency-version: 3.0.1 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-07-03 00:40:08 -07:00
dependabot[bot]	c0208f0da1	Bump sigstore/gh-action-sigstore-python from 3.0.0 to 3.0.1 Bumps [sigstore/gh-action-sigstore-python](https://github.com/sigstore/gh-action-sigstore-python) from 3.0.0 to 3.0.1. - [Release notes](https://github.com/sigstore/gh-action-sigstore-python/releases) - [Changelog](https://github.com/sigstore/gh-action-sigstore-python/blob/main/CHANGELOG.md) - [Commits](https://github.com/sigstore/gh-action-sigstore-python/compare/v3.0.0...v3.0.1) --- updated-dependencies: - dependency-name: sigstore/gh-action-sigstore-python dependency-version: 3.0.1 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2025-06-23 12:15:26 +00:00
James R. Barlow	61163c2aa9	Fix stupid Python runtimeerror	2025-06-13 01:46:30 -07:00
James R. Barlow	332369f1b0	Adjust set_start_method decision, changing fork to forkserver for platforms other than win32, darwin	2025-06-13 01:22:01 -07:00
James R. Barlow	7ea940a3a6	v16.10.3 release notes	2025-06-13 00:28:33 -07:00
James R. Barlow	8a784d6052	Drop explicit norecursedirs setting, which we no longer need	2025-06-13 00:03:24 -07:00
James R. Barlow	5cf86a7c2e	Update uv.lock	2025-06-13 00:02:53 -07:00
James R. Barlow	3beabf55e7	Skip optimizing images with pre-blended soft masks Fixes issue [Bug]: Optimized pdf not rendering with Quartz / Core Graphics #1536	2025-06-12 23:58:43 -07:00
James R. Barlow	6f6448f286	Update dependency lockfile	2025-05-27 14:19:09 -07:00
James R. Barlow	9f6e5a48ad	Deny use of pikepdf 9.8.0 due to GlyphlessFont error	2025-05-27 12:16:19 -07:00
PunkPangolin	ee3da07710	Add appstream metainfo file + screenshot (#1462 ) * Add io.ocrmypdf.ocrmypdf.metainfo.xml * Create sample_screenshot.png * Better screenshot * Add screenshot to metainfo * Move into /misc/flatpak * Add screenshot URL * Add icon and categories to metainfo * Use installed icon instead of remote * Add keywords to metainfo, change summary closer to Flathub Guildelines	2025-05-27 00:42:47 -07:00
jbarlow	45043f6a8c	Merge pull request #1519 from ocrmypdf/dependabot/github_actions/astral-sh/setup-uv-6 Bump astral-sh/setup-uv from 5 to 6	2025-05-27 00:41:52 -07:00
James R. Barlow	b166e86216	jbig2 doc: mention pkg-config Closes #1484	2025-05-26 13:04:05 -07:00
dependabot[bot]	1e2d76b931	Bump astral-sh/setup-uv from 5 to 6 Bumps [astral-sh/setup-uv](https://github.com/astral-sh/setup-uv) from 5 to 6. - [Release notes](https://github.com/astral-sh/setup-uv/releases) - [Commits](https://github.com/astral-sh/setup-uv/compare/v5...v6) --- updated-dependencies: - dependency-name: astral-sh/setup-uv dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2025-04-28 11:23:43 +00:00