mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-01-08 13:11:17 +00:00
Compare commits
76 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8930efe787 | ||
|
|
c540967429 | ||
|
|
195344d307 | ||
|
|
de63d6eac9 | ||
|
|
6ada11ddae | ||
|
|
fc30cb8903 | ||
|
|
01a3706281 | ||
|
|
e613db6a82 | ||
|
|
742a4bac17 | ||
|
|
4c1ef0b471 | ||
|
|
eace567f7b | ||
|
|
cdf956ffc4 | ||
|
|
c6b21d4dea | ||
|
|
f673da9ab9 | ||
|
|
8d715c4157 | ||
|
|
0f3c7765aa | ||
|
|
9dbce33ee6 | ||
|
|
54ce09496c | ||
|
|
f4c6c8121b | ||
|
|
057eaff36d | ||
|
|
b88d63bdf7 | ||
|
|
a385cd967d | ||
|
|
2f72f8e94a | ||
|
|
ee47e986f3 | ||
|
|
e44063da15 | ||
|
|
abc2d41e2d | ||
|
|
38d60ea89b | ||
|
|
35ec90af44 | ||
|
|
aa1cc8ae04 | ||
|
|
eaceb66030 | ||
|
|
b1dcc2c445 | ||
|
|
ab3855af48 | ||
|
|
5c6cc4031f | ||
|
|
f181307e50 | ||
|
|
b213efb030 | ||
|
|
f59e68911f | ||
|
|
9605656a2f | ||
|
|
599fb1a1f6 | ||
|
|
9a2c0cf6ff | ||
|
|
414d80fc16 | ||
|
|
7ca4ae4e16 | ||
|
|
7e7e2f2e91 | ||
|
|
d07231a7aa | ||
|
|
0e831db9f4 | ||
|
|
650ca1c65b | ||
|
|
a7b0c0df6c | ||
|
|
d735791524 | ||
|
|
66308c2813 | ||
|
|
d81de57bbc | ||
|
|
a9a8b39dba | ||
|
|
fd5b8132ae | ||
|
|
63675c21ce | ||
|
|
6af22051a8 | ||
|
|
8318ebbaec | ||
|
|
4fc0c3a0d5 | ||
|
|
74305e8741 | ||
|
|
d6b069d3fa | ||
|
|
194ca699a8 | ||
|
|
175b743ffe | ||
|
|
080b73e7c0 | ||
|
|
df6079c06d | ||
|
|
45cf92f40b | ||
|
|
5b1900beec | ||
|
|
c0208f0da1 | ||
|
|
61163c2aa9 | ||
|
|
332369f1b0 | ||
|
|
7ea940a3a6 | ||
|
|
8a784d6052 | ||
|
|
5cf86a7c2e | ||
|
|
3beabf55e7 | ||
|
|
6f6448f286 | ||
|
|
9f6e5a48ad | ||
|
|
ee3da07710 | ||
|
|
45043f6a8c | ||
|
|
b166e86216 | ||
|
|
1e2d76b931 |
@ -1,7 +1,7 @@
|
||||
# SPDX-FileCopyrightText: 2024 James R. Barlow
|
||||
# SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
FROM ubuntu:24.04 AS base
|
||||
FROM ubuntu:25.04 AS base
|
||||
|
||||
ENV LANG=C.UTF-8
|
||||
ENV TZ=UTC
|
||||
@ -40,7 +40,7 @@ RUN \
|
||||
WORKDIR /app
|
||||
|
||||
# Copy uv from ghcr
|
||||
COPY --from=ghcr.io/astral-sh/uv:0.6.14 /uv /uvx /bin/
|
||||
COPY --from=ghcr.io/astral-sh/uv:0.9.8 /uv /uvx /bin/
|
||||
|
||||
ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy
|
||||
|
||||
|
||||
@ -1,13 +1,7 @@
|
||||
# SPDX-FileCopyrightText: 2023 James R. Barlow
|
||||
# SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
# Note: Alpine 3.20 builds tesseract with --enable-opencl, which is not
|
||||
# supported by anyone. OCRmyPDF is not compatible with Alpine 3.20.0
|
||||
# through 3.20.3. The issue is fixed in 3.21.
|
||||
# Details
|
||||
# https://gitlab.alpinelinux.org/alpine/aports/-/issues/16143
|
||||
# https://github.com/ocrmypdf/OCRmyPDF/issues/1395
|
||||
FROM alpine:3.21 AS base
|
||||
FROM alpine:3.22 AS base
|
||||
|
||||
ENV LANG=C.UTF-8
|
||||
ENV TZ=UTC
|
||||
@ -28,7 +22,7 @@ RUN apk add --no-cache \
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY --from=ghcr.io/astral-sh/uv:0.6.14 /uv /uvx /bin/
|
||||
COPY --from=ghcr.io/astral-sh/uv:0.9.8 /uv /uvx /bin/
|
||||
|
||||
ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy
|
||||
|
||||
|
||||
64
.github/workflows/build.yml
vendored
64
.github/workflows/build.yml
vendored
@ -22,37 +22,35 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-22.04, ubuntu-24.04]
|
||||
python: ["3.10", "3.11", "3.12", "3.13"]
|
||||
python: ["3.10", "3.11", "3.12", "3.13", "3.14"]
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
tesseract_ppa: "ppa"
|
||||
python: "3.10"
|
||||
- os: ubuntu-24.04
|
||||
python: "pypy3.10"
|
||||
|
||||
env:
|
||||
OS: ${{ matrix.os }}
|
||||
PYTHON: ${{ matrix.python }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v5
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: "0.5.x"
|
||||
version: "0.9.x"
|
||||
|
||||
- name: "Set up Python"
|
||||
uses: actions/setup-python@v5
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: ${{ matrix.python }}
|
||||
|
||||
- name: Install Tesseract from PPA
|
||||
if: matrix.tesseract_ppa == 'ppa'
|
||||
run: |
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5.3
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
|
||||
- name: Install common packages
|
||||
run: |
|
||||
@ -74,14 +72,6 @@ jobs:
|
||||
unpaper \
|
||||
zlib1g
|
||||
|
||||
- name: Install Ubuntu packages for PyPy
|
||||
if: startsWith(matrix.python, 'pypy')
|
||||
run: |
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
libxml2-dev \
|
||||
libxslt1-dev \
|
||||
pypy3-dev
|
||||
|
||||
- name: Install Python packages
|
||||
run: |
|
||||
uv sync --extra test --no-dev
|
||||
@ -111,15 +101,15 @@ jobs:
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [macos-latest, macos-13] # macos-latest is arm64, macos-13 is x86_64
|
||||
python: ["3.10", "3.11", "3.12", "3.13"]
|
||||
os: [macos-latest]
|
||||
python: ["3.10", "3.11", "3.12", "3.13", "3.14"]
|
||||
|
||||
env:
|
||||
OS: ${{ matrix.os }}
|
||||
PYTHON: ${{ matrix.python }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
|
||||
|
||||
@ -136,12 +126,12 @@ jobs:
|
||||
tesseract
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v5
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: "0.5.x"
|
||||
version: "0.9.x"
|
||||
|
||||
- name: "Set up Python"
|
||||
uses: actions/setup-python@v5
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: ${{ matrix.python }}
|
||||
|
||||
@ -174,24 +164,24 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [windows-latest]
|
||||
python: ["3.10", "3.11", "3.12", "3.13"]
|
||||
python: ["3.10", "3.11", "3.12", "3.13", "3.14"]
|
||||
|
||||
env:
|
||||
OS: ${{ matrix.os }}
|
||||
PYTHON: ${{ matrix.python }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v5
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: "0.5.x"
|
||||
version: "0.9.x"
|
||||
|
||||
- name: "Set up Python"
|
||||
uses: actions/setup-python@v5
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: ${{ matrix.python }}
|
||||
|
||||
@ -220,20 +210,20 @@ jobs:
|
||||
name: Build sdist and wheels
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v5
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: "0.5.x"
|
||||
version: "0.9.x"
|
||||
|
||||
- name: Make wheels and sdist
|
||||
run: |
|
||||
uv build --sdist --wheel
|
||||
|
||||
- uses: actions/upload-artifact@v4
|
||||
- uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: artifact
|
||||
path: |
|
||||
@ -246,10 +236,10 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
environment: release
|
||||
permissions:
|
||||
id-token: write # mandatory for PyPI publishing
|
||||
id-token: write # mandatory for PyPI publishing
|
||||
if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
|
||||
steps:
|
||||
- uses: actions/download-artifact@v4
|
||||
- uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: artifact
|
||||
path: dist
|
||||
@ -267,13 +257,13 @@ jobs:
|
||||
contents: write
|
||||
id-token: write
|
||||
steps:
|
||||
- uses: actions/download-artifact@v4
|
||||
- uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: artifact
|
||||
path: dist
|
||||
|
||||
- name: Sign the dists with Sigstore
|
||||
uses: sigstore/gh-action-sigstore-python@v3.0.0
|
||||
uses: sigstore/gh-action-sigstore-python@v3.2.0
|
||||
with:
|
||||
inputs: |
|
||||
./dist/*.tar.gz
|
||||
@ -318,7 +308,7 @@ jobs:
|
||||
- name: Set image name
|
||||
run: echo "DOCKER_IMAGE_NAME=ocrmypdf" >> $GITHUB_ENV
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
|
||||
|
||||
@ -366,7 +356,7 @@ jobs:
|
||||
- name: Set image name
|
||||
run: echo "DOCKER_IMAGE_NAME=ocrmypdf-alpine" >> $GITHUB_ENV
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
|
||||
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@ -46,4 +46,5 @@ docs/_templates/
|
||||
docs/Makefile
|
||||
src/ocrmypdf/_version.py
|
||||
|
||||
.idea/
|
||||
.idea/
|
||||
.aider*
|
||||
|
||||
15
README.md
15
README.md
@ -74,6 +74,7 @@ Linux, Windows, macOS and FreeBSD are supported. Docker images are also availabl
|
||||
| macOS (nix) | ``nix-env -i ocrmypdf`` |
|
||||
| LinuxBrew | ``brew install ocrmypdf`` |
|
||||
| FreeBSD | ``pkg install py-ocrmypdf`` |
|
||||
| OpenBSD | ``pkg_add ocrmypdf`` |
|
||||
| Ubuntu Snap | ``snap install ocrmypdf`` |
|
||||
|
||||
For everyone else, [see our documentation](https://ocrmypdf.readthedocs.io/en/latest/installation.html) for installation steps.
|
||||
@ -83,17 +84,27 @@ For everyone else, [see our documentation](https://ocrmypdf.readthedocs.io/en/la
|
||||
OCRmyPDF uses Tesseract for OCR, and relies on its language packs. For Linux users, you can often find packages that provide language packs:
|
||||
|
||||
```bash
|
||||
# Display a list of all Tesseract language packs
|
||||
apt-cache search tesseract-ocr
|
||||
|
||||
# Debian/Ubuntu users
|
||||
apt-cache search tesseract-ocr # Display a list of all Tesseract language packs
|
||||
apt-get install tesseract-ocr-chi-sim # Example: Install Chinese Simplified language pack
|
||||
|
||||
|
||||
# Arch Linux users
|
||||
pacman -S tesseract-data-eng tesseract-data-deu # Example: Install the English and German language packs
|
||||
|
||||
# OpenBSD users
|
||||
pkg_info -aQ tesseract # Display a list of all Tesseract language packs
|
||||
pkg_add tesseract-cym # Example: Install the Welsh language pack
|
||||
|
||||
# brew macOS users
|
||||
brew install tesseract-lang
|
||||
|
||||
# Fedora users
|
||||
dnf search tesseract-langpack # Display a list of all Tesseract language packs
|
||||
dnf install tesseract-langpack-ita # Example: Install the Italian language pack
|
||||
|
||||
|
||||
```
|
||||
|
||||
You can then pass the `-l LANG` argument to OCRmyPDF to give a hint as to what languages it should search for. Multiple languages can be requested.
|
||||
|
||||
@ -196,6 +196,10 @@ it to a OCRed PDF in `/output/`, and move the processed original to
|
||||
- Define environment variable `OCR_DESKEW` to apply deskew to crooked input PDFs
|
||||
* - `--env PYTHONUNBUFFERED=1`
|
||||
- This will force `STDOUT` to be unbuffered and allow you to see messages in docker logs
|
||||
* - `--env OCR_LOGLEVEL='DEBUG'`
|
||||
- Level of log messages
|
||||
* - `--env OCR_JSON_SETTINGS={"language":"deu+eng", "rotate_pages": true}`
|
||||
- A JSON string specifying any other arguments for `ocrmypdf.ocr`
|
||||
:::
|
||||
|
||||
This service relies on polling to check for changes to the filesystem.
|
||||
|
||||
@ -221,7 +221,7 @@ works if all you want to is to apply image processing or PDF/A
|
||||
conversion.
|
||||
|
||||
```bash
|
||||
ocrmypdf --tesseract-timeout=0 --remove-background input.pdf output.pdf
|
||||
ocrmypdf --tesseract-timeout 0 --remove-background input.pdf output.pdf
|
||||
```
|
||||
|
||||
:::{versionchanged} v14.1.0
|
||||
@ -250,7 +250,7 @@ This command also removes OCR generated by third party tools.
|
||||
You can also optimize all images without performing any OCR:
|
||||
|
||||
```bash
|
||||
ocrmypdf --tesseract-timeout=0 --optimize 3 --skip-text input.pdf output.pdf
|
||||
ocrmypdf --tesseract-timeout 0 --optimize 3 --skip-text input.pdf output.pdf
|
||||
```
|
||||
|
||||
### Process only certain pages
|
||||
|
||||
@ -104,23 +104,29 @@ docker_ocrmypdf /data/input.pdf /data/output.pdf
|
||||
|
||||
## Podman
|
||||
|
||||
Especially if you use [Podman](https://podman.io/) (or have SELinux
|
||||
enabled on your system), you may need to add `--userns keep-id` there,
|
||||
otherwise you may get access errors, because the user is otherwise not
|
||||
Especially if you use [Podman](https://podman.io/) (or use Docker in
|
||||
rootless mode), you may need to add `--userns keep-id` there,
|
||||
otherwise you may get access errors, because the user ID is otherwise not
|
||||
mapped to the same UID as on the host:
|
||||
|
||||
:::{code} bash
|
||||
alias podman_ocrmypdf='podman run --rm -i --user "$(id -u):$(id -g)" --userns keep-id --workdir /data -v "$PWD:/data" ocrmypdf'
|
||||
alias podman_ocrmypdf='podman run --rm -i --user "$(id -u):$(id -g)" --userns keep-id --workdir /data -v "$PWD:/data" jbarlow83/ocrmypdf-alpine'
|
||||
podman_ocrmypdf /data/input.pdf /data/output.pdf
|
||||
:::
|
||||
|
||||
If you use SELinux you may additionally need to add the `:Z` [suffix to
|
||||
If you have SELinux enabled, you may additionally need to add the `:Z` [suffix to
|
||||
the
|
||||
volume](https://docs.podman.io/en/stable/markdown/podman-run.1.html#volume-v-source-volume-host-dir-container-dir-options)
|
||||
or disable SELinux for the container using
|
||||
`--security-opt label=disable`, which is suggested for system files as
|
||||
they should not be re-labelled. Please refer to the "Note" section at
|
||||
the end of the linked podman documentation for details.
|
||||
the end of the linked podman documentation for details. This results in
|
||||
the following full command:
|
||||
|
||||
:::{code} bash
|
||||
alias podman_ocrmypdf='podman run --rm -i --user "$(id -u):$(id -g)" --userns keep-id --workdir /data -v "$PWD:/data" --security-opt label=disable jbarlow83/ocrmypdf-alpine'
|
||||
podman_ocrmypdf /data/input.pdf /data/output.pdf
|
||||
:::
|
||||
|
||||
{#docker-lang-packs}
|
||||
## Adding languages to the Docker image
|
||||
|
||||
@ -35,11 +35,12 @@ cd jbig2enc
|
||||
Dependencies include libtoolize and libleptonica, which on Ubuntu
|
||||
systems are packaged as libtool and libleptonica-dev. On Fedora (35)
|
||||
they are packaged as libtool and leptonica-devel. For this to work,
|
||||
please make sure to install `autotools`, `automake`, `libtool` and
|
||||
`leptonica` first if not already installed.
|
||||
please make sure to install `autotools`, `automake`, `libtool`, `pkg-config`
|
||||
and `leptonica` first if not already installed. Other dependencies might
|
||||
be required depending on your system.
|
||||
|
||||
:::{code} bash
|
||||
[sudo] apt install autotools-dev automake libtool libleptonica-dev
|
||||
[sudo] apt install autotools-dev automake libtool libleptonica-dev pkg-config
|
||||
:::
|
||||
|
||||
{#jbig2-lossy}
|
||||
|
||||
@ -118,7 +118,7 @@ A plugin may provide the following hooks. Hooks must be decorated with
|
||||
`ocrmypdf.hookimpl`, for example:
|
||||
|
||||
```python
|
||||
from ocrmpydf import hookimpl
|
||||
from ocrmypdf import hookimpl
|
||||
|
||||
@hookimpl
|
||||
def add_options(parser):
|
||||
|
||||
@ -25,6 +25,62 @@ about a forthcoming release that has not been tagged yet. A release is only
|
||||
official when it's tagged and posted to PyPI.
|
||||
:::
|
||||
|
||||
## v16.13.0
|
||||
|
||||
- Added detection and repair for Ghostscript 10.6 JPEG corruption. When GS 10.6
|
||||
truncates JPEG data by 1-15 bytes, OCRmyPDF now restores the original image
|
||||
bytes from the input PDF. A warning is issued when GS 10.6+ is detected.
|
||||
{issue}`1603`
|
||||
- We continue to force re-optimization of JPEGs, since this catches some issues with corruption for situations where Ghostscript modifies an image. It is likely there are still cases where we cannot mitigate all corruption issues. {issue}`1585`
|
||||
- Fixed handling of PDF page boxes (ArtBox, BleedBox) which were not being
|
||||
processed correctly in some cases.
|
||||
- Documentation: clarified podman usage instructions.
|
||||
|
||||
## v16.12.0
|
||||
|
||||
- Disable Ghostscript's subset fonts feature, which was found to corrupt text in certain
|
||||
PDFs. Thanks @mnaegler for identifying this issue. {issue}`1592`
|
||||
- Users of Ghostscript 10.6.0+ reported that Ghostscript seems to generate corrupted
|
||||
JPEGs. We force re-optimization of these JPEGs to mitigate the corruption until
|
||||
Ghostscript fixes the issue. {issue}`1585`
|
||||
- OCRmyPDF now avoids applying flate compression to large JPEG images, unless maximum
|
||||
optimization is requested, since flate+DCT compression reduces performance in PDF
|
||||
viewers with large images.
|
||||
- Updated Dockerfiles to use more recent base operating systems.
|
||||
- Updated build and test matrix to include Python 3.14.
|
||||
- Minor documentation improvements.
|
||||
- pikepdf >= 10.0.0 is now required.
|
||||
|
||||
## v16.11.1
|
||||
|
||||
- Fixed issue with Tesseract changing an error message related to skew. {issue}`1576`
|
||||
- Dropped macOS 13 from build-test matrix since it is no longer supported by Apple.
|
||||
|
||||
## v16.11.0
|
||||
|
||||
- Deprecated "semfree" plugin in favor of falling back to threads if the platform
|
||||
does not support semaphores. Fixes an issue with Python 3.14.
|
||||
- Fixed references to PDF/A compliance levels to be consistent with ISO nomenclature.
|
||||
Thanks @5HT2. {issue}`1557`
|
||||
- Fixed an issue around using plugin_manager as an argument. {issue}`1555`
|
||||
- Added OpenBSD install steps to README. {issue}`1554`
|
||||
- Removed PyPy from test matrix due to declining support in third party libraries.
|
||||
- Documentation improvements.
|
||||
|
||||
## v16.10.4
|
||||
|
||||
- Corrected build errors in Python 3.13.3 and 3.13.4.
|
||||
|
||||
## v16.10.3 (not released)
|
||||
|
||||
- Blocked optimization of images with pre-blended soft masks. {issue}`1536`
|
||||
- Fixed warning from hypothesis on running tests.
|
||||
- Release incomplete due to new test failures in Python 3.13.3 and 3.13.4.
|
||||
|
||||
## v16.10.2
|
||||
|
||||
- Blacklist pikepdf 9.8.0 due to an incompatible change.
|
||||
|
||||
## v16.10.1
|
||||
|
||||
- No changes affecting OCRmyPDF functionality for command line end users.
|
||||
|
||||
63
misc/flatpak/io.ocrmypdf.ocrmypdf.metainfo.xml
Normal file
63
misc/flatpak/io.ocrmypdf.ocrmypdf.metainfo.xml
Normal file
@ -0,0 +1,63 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<component type="console-application">
|
||||
<id>io.ocrmypdf.ocrmypdf</id>
|
||||
|
||||
<name>OCRmyPDF</name>
|
||||
<summary>Adds an OCR text layer to scanned PDF files, allowing them to be searched</summary>
|
||||
|
||||
<developer id="io.ocrmypdf">
|
||||
<name>OCRmyPDF Developers</name>
|
||||
</developer>
|
||||
|
||||
<url type="homepage">https://github.com/ocrmypdf/ocrmypdf</url>
|
||||
<url type="bugtracker">https://github.com/ocrmypdf/OCRmyPDF/issues</url>
|
||||
|
||||
<content_rating type="oars-1.1" />
|
||||
|
||||
<metadata_license>CC0-1.0</metadata_license>
|
||||
<project_license>MPL-2.0</project_license>
|
||||
|
||||
<description>
|
||||
<ul>
|
||||
<li>Generates a searchable PDF/A file from a regular PDF</li>
|
||||
<li>Places OCR text accurately below the image to ease copy / paste</li>
|
||||
<li>Keeps the exact resolution of the original embedded images</li>
|
||||
<li>When possible, inserts OCR information as a lossless operation without disrupting any other content</li>
|
||||
<li>Optimizes PDF images, often producing files smaller than the input file</li>
<li>If requested, deskews and/or cleans the image before performing OCR</li>
|
||||
<li>Validates input and output files</li>
|
||||
<li>Distributes work across all available CPU cores</li>
|
||||
<li>Uses Tesseract OCR engine to recognize more than 100 languages</li>
|
||||
<li>Keeps your private data private</li>
|
||||
<li>Scales properly to handle files with thousands of pages</li>
|
||||
<li>Battle-tested on millions of PDFs</li>
|
||||
</ul>
|
||||
</description>
|
||||
|
||||
<provides>
|
||||
<binary>ocrmypdf</binary>
|
||||
</provides>
|
||||
|
||||
<icon type="stock">io.ocrmypdf.ocrmypdf</icon>
|
||||
|
||||
<screenshots>
|
||||
<screenshot type="default">
|
||||
<image>https://raw.githubusercontent.com/ocrmypdf/OCRmyPDF/f7ad5f16bd0340b0b1803dada0c02f9f40542bd8/misc/flatpak/sample_screenshot.png</image>
|
||||
<caption>Sample usage of OCRmyPDF</caption>
|
||||
</screenshot>
|
||||
</screenshots>
|
||||
|
||||
<categories>
|
||||
<category>Office</category>
|
||||
<category>Utility</category>
|
||||
</categories>
|
||||
|
||||
<keywords>
|
||||
<keyword>ocr</keyword>
|
||||
<keyword>pdf</keyword>
|
||||
<keyword>tool</keyword>
|
||||
</keywords>
|
||||
|
||||
<releases>
|
||||
<release version="16.8.0" date="2025-01-05"/>
|
||||
</releases>
|
||||
</component>
|
||||
BIN
misc/flatpak/sample_screenshot.png
Normal file
BIN
misc/flatpak/sample_screenshot.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 166 KiB |
@ -60,6 +60,6 @@
|
||||
[8.280789, "o", "\rRecompressing JPEGs: 0image [00:00, ?image/s]\rRecompressing JPEGs: 0image [00:00, ?image/s]\r\n\rDeflating JPEGs: 0%| | 0/4 [00:00<?, ?image/s]\rDeflating JPEGs: 100%|███████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 238.28image/s]\r\n"]
|
||||
[8.28149, "o", "\rJBIG2: 0item [00:00, ?item/s]\rJBIG2: 0item [00:00, ?item/s]\r\n"]
|
||||
[8.289998, "o", "Image optimization ratio: 1.01 savings: 1.3%\r\nTotal file size ratio: 1.02 savings: 1.6%\r\n"]
|
||||
[8.291209, "o", "Output file is a PDF/A-2B (as expected)\r\n"]
|
||||
[8.291209, "o", "Output file is a PDF/A-2b (as expected)\r\n"]
|
||||
[8.361316, "o", "\u001b[2m⏎\u001b(B\u001b[m \r⏎ \r\u001b[K\u001b[?2004h\u001b]0;fish /home/jb/src/ocrmypdf/tests/resources\u0007\u001b[30m\u001b(B\u001b[m> \u001b[K\r\u001b[C\u001b[C"]
|
||||
[8.862206, "o", "\r\n\u001b[30m\u001b(B\u001b[m\u001b[30m\u001b(B\u001b[m\u001b[?2004l"]
|
||||
|
||||
File diff suppressed because one or more lines are too long
|
Before Width: | Height: | Size: 29 KiB After Width: | Height: | Size: 29 KiB |
@ -5,8 +5,7 @@
|
||||
|
||||
"""Watch a directory for new PDFs and OCR them."""
|
||||
|
||||
# Do not enable annotations!
|
||||
# https://github.com/tiangolo/typer/discussions/598
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
@ -278,11 +277,11 @@ def main(
|
||||
f"Output Directory Year & Month: {output_dir_year_month}\n"
|
||||
f"Archive Directory: {archive_dir}"
|
||||
)
|
||||
log.debug(
|
||||
log.info(
|
||||
f"INPUT_DIRECTORY: {input_dir}\n"
|
||||
f"OUTPUT_DIRECTORY: {output_dir}\n"
|
||||
f"OUTPUT_DIRECTORY_YEAR_MONTH: {output_dir_year_month}\n"
|
||||
f"ARCHIVE_DIRECTORY: {archive_dir}\n"
|
||||
f"OUTPUT_DIRECTORY_YEAR_MONTH: {output_dir_year_month}\n"
|
||||
f"ON_SUCCESS_DELETE: {on_success_delete}\n"
|
||||
f"ON_SUCCESS_ARCHIVE: {on_success_archive}\n"
|
||||
f"DESKEW: {deskew}\n"
|
||||
|
||||
@ -17,7 +17,7 @@ dependencies = [
|
||||
"packaging>=20",
|
||||
"pdfminer.six>=20220319",
|
||||
"pi-heif", # Heif image format - maintainers: if this is removed, it will NOT break
|
||||
"pikepdf>=8.10.1",
|
||||
"pikepdf>=10",
|
||||
"Pillow>=10.0.1",
|
||||
"pluggy>=1",
|
||||
"rich>=13",
|
||||
@ -45,15 +45,10 @@ keywords = ["PDF", "OCR", "optical character recognition", "PDF/A", "scanning"]
|
||||
Documentation = "https://ocrmypdf.readthedocs.io/"
|
||||
Source = "https://github.com/ocrmypdf/OCRmyPDF"
|
||||
Tracker = "https://github.com/ocrmypdf/OCRmyPDF/issues"
|
||||
Changelog = "https://github.com/ocrmypdf/OCRmyPDF/docs/release_notes.rst"
|
||||
Changelog = "https://github.com/ocrmypdf/OCRmyPDF/blob/main/docs/release_notes.md"
|
||||
|
||||
[project.optional-dependencies]
|
||||
docs = [
|
||||
"myst-parser>=4.0.1",
|
||||
"sphinx",
|
||||
"sphinx-issues",
|
||||
"sphinx-rtd-theme",
|
||||
]
|
||||
docs = ["myst-parser>=4.0.1", "sphinx", "sphinx-issues", "sphinx-rtd-theme"]
|
||||
extended_test = ["PyMuPDF>=1.19.1"]
|
||||
test = [
|
||||
"coverage[toml]>=6.2",
|
||||
@ -107,7 +102,6 @@ exclude_lines = [
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
minversion = "6.0"
|
||||
norecursedirs = ["lib", ".pc", ".git", "venv", "output", "cache", "resources"]
|
||||
testpaths = ["tests"]
|
||||
addopts = "-n auto"
|
||||
markers = ["slow"]
|
||||
|
||||
@ -78,6 +78,7 @@ def run(args=None):
|
||||
|
||||
if __name__ == '__main__':
|
||||
multiprocessing.freeze_support()
|
||||
if os.name == 'posix':
|
||||
multiprocessing.set_start_method('forkserver')
|
||||
if sys.platform not in ('win32', 'darwin'):
|
||||
with suppress(RuntimeError):
|
||||
multiprocessing.set_start_method('forkserver')
|
||||
sys.exit(run())
|
||||
|
||||
@ -286,6 +286,7 @@ def generate_pdfa(
|
||||
+ compression_args
|
||||
+ [
|
||||
"-dJPEGQ=95",
|
||||
"-dSubsetFonts=false", # Prevents GS from messing up some encodings
|
||||
f"-dPDFA={pdfa_part}",
|
||||
"-dPDFACompatibilityPolicy=1",
|
||||
"-o",
|
||||
|
||||
@ -186,6 +186,18 @@ def get_orientation(
|
||||
return orient_conf
|
||||
|
||||
|
||||
def _is_empty_page_error(exc: CalledProcessError) -> bool:
    """Report whether a failed Tesseract run only means the page was empty.

    Tesseract reports "nothing to analyze" differently depending on its
    version, so each known signature is checked in turn.

    Args:
        exc: The error raised for the failed Tesseract subprocess. Its
            ``output`` attribute is expected to hold the captured bytes.

    Returns:
        True if the failure matches one of the known empty-page signatures;
        False otherwise, including when no output was captured at all.
    """
    output = exc.output
    if output is None:
        # Output was not captured, so no signature can be matched. Report
        # "not an empty-page error" rather than raising TypeError from the
        # membership test below, which would mask the original failure.
        return False

    if b'Empty page!!' in output:  # Tesseract 4.x
        return True

    return exc.returncode == 1 and (
        # Tesseract 5.0-5.4 or so
        output == b''
        # Tesseract 5.5+
        or output.startswith(b"Error in boxClipToRectangle: box outside rectangle")
    )
|
||||
|
||||
|
||||
def get_deskew(
|
||||
input_file: Path, languages: list[str], engine_mode: int | None, timeout: float
|
||||
) -> float:
|
||||
@ -204,11 +216,9 @@ def get_deskew(
|
||||
except CalledProcessError as e:
|
||||
tesseract_log_output(e.stdout)
|
||||
tesseract_log_output(e.stderr)
|
||||
if b'Empty page!!' in e.output or (
|
||||
e.output == b'' and e.returncode == 1
|
||||
): # Not enough info for a skew angle - Tess 4 and 5 return different errors
|
||||
if _is_empty_page_error(e):
|
||||
# Not enough info for a skew angle
|
||||
return 0.0
|
||||
|
||||
raise SubprocessOutputError() from e
|
||||
|
||||
parsed = _parse_tesseract_output(p.stdout)
|
||||
|
||||
@ -39,6 +39,7 @@ from ocrmypdf.hocrtransform import DebugRenderOptions, HocrTransform
|
||||
from ocrmypdf.hocrtransform._font import Courier
|
||||
from ocrmypdf.pdfa import generate_pdfa_ps
|
||||
from ocrmypdf.pdfinfo import Colorspace, Encoding, PageInfo, PdfInfo
|
||||
from ocrmypdf.pdfinfo.info import FloatRect
|
||||
from ocrmypdf.pluginspec import OrientationConfidence
|
||||
|
||||
try:
|
||||
@ -549,7 +550,9 @@ def rasterize(
|
||||
|
||||
device = colorspaces[device_idx]
|
||||
|
||||
log.debug(f"Rasterize with {device}, rotation {correction}")
|
||||
log.debug(
|
||||
f"Rasterize with {device}, rotation {correction}, mediabox {pageinfo.mediabox}"
|
||||
)
|
||||
|
||||
canvas_dpi, page_dpi = calculate_raster_dpi(page_context)
|
||||
|
||||
@ -830,6 +833,23 @@ def _offset_rect(rect: tuple[float, float, float, float], offset: tuple[float, f
|
||||
)
|
||||
|
||||
|
||||
def _adjust_pagebox(
    page: pikepdf.Page,
    media_box: FloatRect,
    name: pikepdf.Name,
    target_box: FloatRect,
    offset: tuple[float, float],
    swap_axis: bool,
):
    """Write one named page box onto ``page`` after shifting it by ``offset``.

    Nothing is written when ``target_box`` equals ``media_box``. Otherwise
    ``target_box`` is passed through ``_offset_rect`` with ``offset``; if
    ``swap_axis`` is true, the x and y members of each corner are exchanged.
    The result is stored under ``page[name]``.

    Args:
        page: Page object that receives the box entry.
        media_box: Box that ``target_box`` is compared against.
        name: Key under which the box is stored on the page.
        target_box: The box to shift and store.
        offset: Offset handed to ``_offset_rect``.
        swap_axis: Whether to exchange the x/y members of the shifted box.
    """
    if media_box != target_box:
        shifted = _offset_rect(target_box, offset)
        page[name] = (
            (shifted[1], shifted[0], shifted[3], shifted[2]) if swap_axis else shifted
        )
        # The debug message reports the box as given, before shifting.
        log.debug(f"{str(name)} = {target_box}")
|
||||
|
||||
|
||||
def fix_pagepdf_boxes(
|
||||
infile: Path | BinaryIO,
|
||||
out_file: Path,
|
||||
@ -840,7 +860,7 @@ def fix_pagepdf_boxes(
|
||||
|
||||
The single page PDF is created with a normal MediaBox with its lower left corner
|
||||
at (0, 0). infile is the single page PDF. page_context.mediabox has the original
|
||||
file's mediabox, which may have a different origin. We needto adjust the other
|
||||
file's mediabox, which may have a different origin. We need to adjust the other
|
||||
boxes in the single page PDF to match the effect they had on the original page.
|
||||
|
||||
When correcting page rotation, we create a single page PDF that is correctly
|
||||
@ -854,18 +874,25 @@ def fix_pagepdf_boxes(
|
||||
"""
|
||||
with pikepdf.open(infile) as pdf:
|
||||
for page in pdf.pages:
|
||||
# page.BleedBox = page_context.pageinfo.bleedbox
|
||||
# page.ArtBox = page_context.pageinfo.artbox
|
||||
log.debug(
|
||||
f"initial mediabox={page.MediaBox} and pageinfo "
|
||||
f"mediabox={page_context.pageinfo.mediabox}"
|
||||
)
|
||||
mediabox = page_context.pageinfo.mediabox
|
||||
offset = mediabox[0], mediabox[1]
|
||||
cropbox = _offset_rect(page_context.pageinfo.cropbox, offset)
|
||||
trimbox = _offset_rect(page_context.pageinfo.trimbox, offset)
|
||||
|
||||
offset = -mediabox[0], -mediabox[1]
|
||||
if swap_axis:
|
||||
cropbox = cropbox[1], cropbox[0], cropbox[3], cropbox[2]
|
||||
trimbox = trimbox[1], trimbox[0], trimbox[3], trimbox[2]
|
||||
page.CropBox = cropbox
|
||||
page.TrimBox = trimbox
|
||||
mediabox = mediabox[1], mediabox[0], mediabox[3], mediabox[2]
|
||||
boxes = ['CropBox', 'TrimBox', 'ArtBox', 'BleedBox']
|
||||
for box_name in boxes:
|
||||
_adjust_pagebox(
|
||||
page,
|
||||
mediabox,
|
||||
pikepdf.Name(f"/{box_name}"),
|
||||
getattr(page_context.pageinfo, box_name.lower()),
|
||||
offset,
|
||||
swap_axis,
|
||||
)
|
||||
|
||||
pdf.save(out_file)
|
||||
return out_file
|
||||
|
||||
|
||||
@ -73,19 +73,10 @@ class OcrmypdfPluginManager(pluggy.PluginManager):
|
||||
module = importlib.import_module(name)
|
||||
self.register(module)
|
||||
|
||||
# 2. Install semfree if needed
|
||||
try:
|
||||
# pylint: disable=import-outside-toplevel
|
||||
from multiprocessing.synchronize import SemLock
|
||||
|
||||
del SemLock
|
||||
except ImportError:
|
||||
self.register(importlib.import_module('ocrmypdf.extra_plugins.semfree'))
|
||||
|
||||
# 3. Register setuptools plugins
|
||||
# 2. Register setuptools plugins
|
||||
self.load_setuptools_entrypoints('ocrmypdf')
|
||||
|
||||
# 4. Register plugins specified on command line
|
||||
# 3. Register plugins specified on command line
|
||||
for name in self.__plugins:
|
||||
if isinstance(name, Path) or name.endswith('.py'):
|
||||
# Import by filename
|
||||
|
||||
@ -1,7 +1,41 @@
|
||||
# SPDX-FileCopyrightText: 2022 James R. Barlow
|
||||
# SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
"""Functions for using ocrmypdf as an API."""
|
||||
"""Python API for OCRmyPDF.
|
||||
|
||||
This module provides the main Python API for OCRmyPDF, allowing you to perform
|
||||
OCR operations programmatically without using the command line interface.
|
||||
|
||||
Main Functions:
|
||||
ocr(): The primary function for OCR processing. Takes an input PDF or image
|
||||
file and produces an OCR'd PDF with searchable text.
|
||||
|
||||
configure_logging(): Set up logging to match the command line interface
|
||||
behavior, with support for progress bars and colored output.
|
||||
|
||||
Experimental Functions:
|
||||
_pdf_to_hocr(): Extract text from PDF pages and save as hOCR files for
|
||||
manual editing before final PDF generation.
|
||||
|
||||
_hocr_to_ocr_pdf(): Convert hOCR files back to a searchable PDF after
|
||||
manual text corrections.
|
||||
|
||||
The API maintains thread safety through internal locking since OCRmyPDF uses
|
||||
global state for plugins. Only one OCR operation can run per Python process
|
||||
at a time. For parallel processing, use multiple Python processes.
|
||||
|
||||
Example:
|
||||
import ocrmypdf
|
||||
|
||||
# Configure logging (optional)
|
||||
ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)
|
||||
|
||||
# Perform OCR
|
||||
ocrmypdf.ocr('input.pdf', 'output.pdf', language='eng')
|
||||
|
||||
For detailed parameter documentation, see the ocr() function docstring and
|
||||
the equivalent command line parameters in the OCRmyPDF documentation.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@ -357,7 +391,7 @@ def ocr( # noqa: D417
|
||||
create_options_kwargs = {
|
||||
k: v
|
||||
for k, v in locals().items()
|
||||
if k not in {'input_file', 'output_file', 'kwargs'}
|
||||
if k not in {'input_file', 'output_file', 'kwargs', 'plugin_manager'}
|
||||
}
|
||||
create_options_kwargs.update(kwargs)
|
||||
|
||||
|
||||
@ -96,6 +96,31 @@ def thread_init(q: Queue, user_init: UserInit, loglevel) -> None:
|
||||
return
|
||||
|
||||
|
||||
def setup_executor(
    use_threads: bool,
) -> tuple[Queue, type[ThreadPoolExecutor] | type[ProcessPoolExecutor], WorkerInit]:
    """Choose the log queue, executor class and worker initializer for a pool.

    Args:
        use_threads: If True, a thread pool is selected. If False, a process
            pool is selected, unless ``multiprocessing.synchronize.SemLock``
            cannot be imported, in which case threads are selected instead.

    Returns:
        A tuple ``(log_queue, executor_class, initializer)``. ``executor_class``
        is the executor *class* (``ThreadPoolExecutor`` or
        ``ProcessPoolExecutor``), not an instance. ``log_queue`` is a
        ``queue.Queue`` for threads or a ``multiprocessing.Queue`` for
        processes, created with a max size of -1.
    """
    if not use_threads:
        # Some execution environments like AWS Lambda and Termux do not support
        # semaphores. Check if semaphore support is available, and if not, fall back
        # to using threads.
        try:
            # pylint: disable=import-outside-toplevel
            from multiprocessing.synchronize import SemLock

            del SemLock
        except ImportError:
            use_threads = True

    if use_threads:
        return queue.Queue(-1), ThreadPoolExecutor, thread_init
    return multiprocessing.Queue(-1), ProcessPoolExecutor, process_init
|
||||
|
||||
|
||||
class StandardExecutor(Executor):
|
||||
"""Standard OCRmyPDF concurrent task executor."""
|
||||
|
||||
@ -110,14 +135,7 @@ class StandardExecutor(Executor):
|
||||
task_arguments: Iterable,
|
||||
task_finished: Callable,
|
||||
):
|
||||
if use_threads:
|
||||
log_queue: Queue = queue.Queue(-1)
|
||||
executor_class: FuturesExecutorClass = ThreadPoolExecutor
|
||||
initializer: WorkerInit = thread_init
|
||||
else:
|
||||
log_queue = multiprocessing.Queue(-1)
|
||||
executor_class = ProcessPoolExecutor
|
||||
initializer = process_init
|
||||
log_queue, executor_class, initializer = setup_executor(use_threads)
|
||||
|
||||
# Regardless of whether we use_threads for worker processes, the log_listener
|
||||
# must be a thread. Make sure we create the listener after the worker pool,
|
||||
|
||||
@ -5,8 +5,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from packaging.version import Version
|
||||
from pikepdf import Name, Pdf, Stream
|
||||
|
||||
from ocrmypdf import hookimpl
|
||||
from ocrmypdf._exec import ghostscript
|
||||
@ -74,6 +76,13 @@ def check_options(options):
|
||||
"use --force-ocr to discard existing text."
|
||||
)
|
||||
|
||||
if gs_version >= Version('10.6.0') and options.output_type.startswith('pdfa'):
|
||||
log.warning(
|
||||
"Ghostscript 10.6.x contains JPEG encoding errors that may corrupt "
|
||||
"images. OCRmyPDF will attempt to mitigate, but this version is "
|
||||
"strongly not recommended. Please upgrade to a newer version. "
|
||||
"As of 2025-12, 10.6.0 is the latest version of Ghostscript."
|
||||
)
|
||||
if options.output_type == 'pdfa':
|
||||
options.output_type = 'pdfa-2'
|
||||
if options.color_conversion_strategy not in ghostscript.COLOR_CONVERSION_STRATEGIES:
|
||||
@ -116,6 +125,144 @@ def rasterize_pdf_page(
|
||||
return output_file
|
||||
|
||||
|
||||
def _collect_dctdecode_images(pdf: Pdf) -> dict[tuple, list[tuple[Stream, bytes]]]:
    """Collect all DCTDecode (JPEG) images from a PDF.

    Walks the XObject resources of every page and descends into Form XObjects
    (nesting deeper than 10 levels is skipped with a warning). An indirect
    image object that is referenced more than once, for example an image
    shared by several pages, is collected only once so that callers never
    process the same stream twice.

    Returns a dict mapping image signatures to a list of (stream, raw_bytes) tuples.
    The signature is (Width, Height, Filter, BitsPerComponent, ColorSpace).
    """
    images: dict[tuple, list[tuple[Stream, bytes]]] = {}
    # (object id, generation) pairs of indirect image streams already collected
    seen_images: set[tuple[int, int]] = set()

    def get_colorspace_key(obj):
        """Get a hashable key for the colorspace."""
        cs = obj.get(Name.ColorSpace)
        if cs is None:
            return None
        if isinstance(cs, Name):
            return str(cs)
        # For array colorspaces like [/ICCBased ...], use the first element
        try:
            return str(cs[0]) if len(cs) > 0 else str(cs)
        except (TypeError, KeyError):
            return str(cs)

    def process_xobject_dict(xobjects, depth=0):
        """Process an XObject dictionary for DCTDecode images."""
        if xobjects is None:
            return
        if depth > 10:
            log.warning("Recursion depth exceeded in _collect_dctdecode_images")
            return
        for key in xobjects.keys():
            obj = xobjects[key]
            if obj is None:
                continue
            subtype = obj.get(Name.Subtype)
            if subtype == Name.Image:
                # Only JPEG (DCTDecode) images are of interest
                filt = obj.get(Name.Filter)
                if filt != Name.DCTDecode:
                    continue
                # Direct objects report (0, 0) and have no identity to track;
                # indirect ones are recorded so shared images appear once.
                objgen = obj.objgen
                if objgen != (0, 0):
                    if objgen in seen_images:
                        continue
                    seen_images.add(objgen)
                sig = (
                    int(obj.get(Name.Width, 0)),
                    int(obj.get(Name.Height, 0)),
                    str(filt),
                    int(obj.get(Name.BitsPerComponent, 0)),
                    get_colorspace_key(obj),
                )
                raw_bytes = obj.read_raw_bytes()
                images.setdefault(sig, []).append((obj, raw_bytes))
            # Recurse into Form XObjects
            elif subtype == Name.Form:
                if Name.Resources in obj:
                    res = obj[Name.Resources]
                    if Name.XObject in res:
                        process_xobject_dict(res[Name.XObject], depth=depth + 1)

    for page in pdf.pages:
        if Name.Resources not in page:
            continue
        resources = page[Name.Resources]
        if Name.XObject not in resources:
            continue
        process_xobject_dict(resources[Name.XObject])

    return images
|
||||
|
||||
|
||||
def _repair_gs106_jpeg_corruption(
    input_pdf_path: Path,
    output_pdf_path: Path,
) -> bool:
    """Restore JPEG streams that Ghostscript 10.6 truncated in the output PDF.

    Ghostscript 10.6 has a bug that truncates JPEG data by 1-15 bytes. Every
    DCTDecode image in the output is compared with the input images that share
    its signature. When an output stream is a strict prefix of an input stream
    and is 1 to 15 bytes shorter, the input bytes are written back over it.
    The output file is saved in place only if something was repaired.

    Args:
        input_pdf_path: PDF holding the original, intact JPEG data.
        output_pdf_path: PDF to inspect and, if needed, overwrite.

    Returns:
        True if any repairs were made.
    """
    repairs = 0

    with (
        Pdf.open(input_pdf_path) as source_pdf,
        Pdf.open(output_pdf_path, allow_overwriting_input=True) as target_pdf,
    ):
        originals = _collect_dctdecode_images(source_pdf)
        produced = _collect_dctdecode_images(target_pdf)

        for sig, produced_entries in produced.items():
            candidates = originals.get(sig)
            if candidates is None:
                # No input image shares this signature
                continue

            for target_stream, target_bytes in produced_entries:
                target_len = len(target_bytes)

                for _source_stream, source_bytes in candidates:
                    # Only a shortfall of 1-15 bytes matches the known defect
                    missing = len(source_bytes) - target_len
                    if missing < 1 or missing > 15:
                        continue

                    # The surviving bytes must equal the start of the original
                    if source_bytes[:target_len] != target_bytes:
                        continue

                    if repairs == 0:
                        # Announce the problem once, before the first repair
                        log.error(
                            "Ghostscript 10.6 JPEG corruption detected. "
                            "Repairing damaged images from original PDF."
                        )
                    log.warning(
                        f"Replacing corrupt JPEG image "
                        f"({sig[0]}x{sig[1]}, {missing} bytes truncated)"
                    )

                    target_stream.write(
                        source_bytes,
                        filter=Name.DCTDecode,
                    )
                    repairs += 1
                    break  # This output image is done; go to the next one

        if repairs:
            target_pdf.save(output_pdf_path)
            log.info(f"Repaired {repairs} JPEG image(s) corrupted by Ghostscript")

    return repairs > 0
|
||||
|
||||
|
||||
@hookimpl
|
||||
def generate_pdfa(
|
||||
pdf_pages,
|
||||
@ -138,4 +285,11 @@ def generate_pdfa(
|
||||
progressbar_class=progressbar_class,
|
||||
stop_on_error=stop_on_soft_error,
|
||||
)
|
||||
|
||||
# Repair JPEG corruption caused by Ghostscript 10.6.x
|
||||
gs_version = ghostscript.version()
|
||||
if gs_version >= Version('10.6.0') and len(pdf_pages) == 1:
|
||||
input_pdf = Path(pdf_pages[0])
|
||||
_repair_gs106_jpeg_corruption(input_pdf, Path(output_file))
|
||||
|
||||
return output_file
|
||||
|
||||
@ -196,8 +196,8 @@ Online documentation is located at:
|
||||
"for users who want their file altered as little as possible. 'pdfa' "
|
||||
"also has problems with full Unicode text. 'pdf' minimizes changes "
|
||||
"to the input file. 'pdf-a1' creates a "
|
||||
"PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
|
||||
"PDF/A3-b file. 'none' will produce no output, which may be helpful if "
|
||||
"PDF/A-1b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
|
||||
"PDF/A-3b file. 'none' will produce no output, which may be helpful if "
|
||||
"only the --sidecar is desired.",
|
||||
)
|
||||
|
||||
|
||||
@ -2,8 +2,4 @@
|
||||
#
|
||||
# SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
"""Extra plugins. These are not automatically inserted when ocrmypdf is run.
|
||||
|
||||
You can use these plugins by specifying them on the command line, e.g.:
|
||||
ocrmypdf --plugin ocrmypdf.extra_plugins.semfree ...
|
||||
"""
|
||||
"""Extra plugins. These are not automatically inserted when ocrmypdf is run."""
|
||||
|
||||
@ -13,6 +13,9 @@ worker communicates only with the main process.
|
||||
This is not without drawbacks. If the tasks are not "even" in size, which cannot
|
||||
be guaranteed, some workers may end up with too much work while others are idle.
|
||||
It is less efficient than the standard implementation, so not the default.
|
||||
|
||||
This module is deprecated and will be removed in a future release. The standard
|
||||
executor will fall back to threads in these environments.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@ -20,6 +23,7 @@ from __future__ import annotations
|
||||
import logging
|
||||
import logging.handlers
|
||||
import signal
|
||||
import warnings
|
||||
from collections.abc import Callable, Iterable, Iterator
|
||||
from contextlib import suppress
|
||||
from enum import Enum, auto
|
||||
@ -32,6 +36,11 @@ from ocrmypdf._concurrent import NullProgressBar
|
||||
from ocrmypdf.exceptions import InputFileError
|
||||
from ocrmypdf.helpers import remove_all_log_handlers
|
||||
|
||||
warnings.warn(
|
||||
"semfree.py is deprecated and will be removed in a future release.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
|
||||
class MessageType(Enum):
|
||||
"""Implement basic IPC messaging."""
|
||||
|
||||
@ -270,7 +270,7 @@ def check_pdf(input_file: Path) -> bool:
|
||||
with pdf:
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings('ignore', message=r'pikepdf.*JBIG2.*')
|
||||
messages = pdf.check()
|
||||
messages = pdf.check_pdf_syntax()
|
||||
success = True
|
||||
for msg in messages:
|
||||
if 'error' in msg.lower():
|
||||
|
||||
@ -17,6 +17,7 @@ from typing import Any, NamedTuple, NewType
|
||||
from zlib import compress
|
||||
|
||||
import img2pdf
|
||||
from packaging.version import Version
|
||||
from pikepdf import (
|
||||
Dictionary,
|
||||
Name,
|
||||
@ -32,7 +33,7 @@ from pikepdf.models.image import HifiPrintImageNotTranscodableError
|
||||
from PIL import Image
|
||||
|
||||
from ocrmypdf._concurrent import Executor, SerialExecutor
|
||||
from ocrmypdf._exec import jbig2enc, pngquant
|
||||
from ocrmypdf._exec import ghostscript, jbig2enc, pngquant
|
||||
from ocrmypdf._jobcontext import PdfContext
|
||||
from ocrmypdf._progressbar import ProgressBar
|
||||
from ocrmypdf.exceptions import OutputFileAccessError
|
||||
@ -42,6 +43,7 @@ log = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_JPEG_QUALITY = 75
|
||||
DEFAULT_PNG_QUALITY = 70
|
||||
FLATE_JPEG_THRESHOLD = 10000
|
||||
|
||||
|
||||
Xref = NewType('Xref', int)
|
||||
@ -126,6 +128,13 @@ def extract_image_filter(
|
||||
if Name.Decode in image:
|
||||
log.debug(f"xref {xref}: skipping image with Decode table")
|
||||
return None # Don't mess with custom Decode tables
|
||||
if image.get(Name.SMask, Dictionary()).get(Name.Matte, None) is not None:
|
||||
# https://github.com/ocrmypdf/OCRmyPDF/issues/1536
|
||||
# Do not attempt to optimize images that have a SMask with a Matte.
|
||||
# That means alpha channel pre-blending is used, and we're not prepared
|
||||
# to deal with the complexities of that.
|
||||
log.debug(f"xref {xref}: skipping image whose SMask has Matte")
|
||||
return None
|
||||
|
||||
return pim, filtdp
|
||||
|
||||
@ -182,6 +191,16 @@ def extract_image_jbig2(
|
||||
return None
|
||||
|
||||
|
||||
def _should_optimize_jpeg(options, filtdp):
|
||||
if options.optimize >= 2:
|
||||
return True
|
||||
if options.optimize < 2 and ghostscript.version() >= Version('10.6.0'):
|
||||
# Ghostscript 10.6.0+ introduced some sort of JPEG encoding issue.
|
||||
# To resolve this, re-optimize the JPEG anyway.
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def extract_image_generic(
|
||||
*, pdf: Pdf, root: Path, image: Stream, xref: Xref, options
|
||||
) -> XrefExt | None:
|
||||
@ -195,15 +214,7 @@ def extract_image_generic(
|
||||
if pim.bits_per_component == 1:
|
||||
return None
|
||||
|
||||
if filtdp[0] == Name.DCTDecode and options.optimize >= 2:
|
||||
# This is a simple heuristic derived from some training data, that has
|
||||
# about a 70% chance of guessing whether the JPEG is high quality,
|
||||
# and possibly recompressible, or not. The number itself doesn't mean
|
||||
# anything.
|
||||
# bytes_per_pixel = int(raw_jpeg.Length) / (w * h)
|
||||
# jpeg_quality_estimate = 117.0 * (bytes_per_pixel ** 0.213)
|
||||
# if jpeg_quality_estimate < 65:
|
||||
# return None
|
||||
if filtdp[0] == Name.DCTDecode and _should_optimize_jpeg(options, filtdp):
|
||||
try:
|
||||
imgname = root / f'{xref:08d}'
|
||||
with imgname.open('wb') as f:
|
||||
@ -521,7 +532,19 @@ def _find_deflatable_jpeg(
|
||||
return None
|
||||
_pim, filtdp = result
|
||||
|
||||
if filtdp[0] == Name.DCTDecode and not filtdp[1] and options.optimize >= 1:
|
||||
if (
|
||||
filtdp[0] == Name.DCTDecode
|
||||
and not filtdp[1]
|
||||
and (
|
||||
(
|
||||
# Don't flate very large images because it will slow down PDF viewers
|
||||
1 <= options.optimize <= 2
|
||||
and image.get(Name.Width, 0) < FLATE_JPEG_THRESHOLD
|
||||
and image.get(Name.Height, 0) < FLATE_JPEG_THRESHOLD
|
||||
)
|
||||
or options.optimize == 3
|
||||
)
|
||||
):
|
||||
return XrefExt(xref, '.memory')
|
||||
|
||||
return None
|
||||
|
||||
@ -120,10 +120,13 @@ def file_claims_pdfa(filename: Path):
|
||||
'output': 'pdf',
|
||||
'conformance': 'No PDF/A metadata in XMP',
|
||||
}
|
||||
valid_part_conforms = {'1A', '1B', '2A', '2B', '2U', '3A', '3B', '3U'}
|
||||
conformance = f'PDF/A-{pdfmeta.pdfa_status}'
|
||||
valid_part_conforms = {'1a', '1b', '2a', '2b', '2u', '3a', '3b', '3u'}
|
||||
# Raw value in XMP metadata returned by pikepdf is uppercase, but ISO
|
||||
# uses lower case for conformance levels.
|
||||
pdfa_status_iso = pdfmeta.pdfa_status.lower()
|
||||
conformance = f'PDF/A-{pdfa_status_iso}'
|
||||
pdfa_dict: dict[str, str | bool] = {}
|
||||
if pdfmeta.pdfa_status in valid_part_conforms:
|
||||
if pdfa_status_iso in valid_part_conforms:
|
||||
pdfa_dict['pass'] = True
|
||||
pdfa_dict['output'] = 'pdfa'
|
||||
pdfa_dict['conformance'] = conformance
|
||||
|
||||
@ -901,8 +901,8 @@ class PageInfo:
|
||||
width_pt = mediabox[2] - mediabox[0]
|
||||
height_pt = mediabox[3] - mediabox[1]
|
||||
|
||||
# self._artbox = [float(d) for d in page.artbox.as_list()]
|
||||
# self._bleedbox = [float(d) for d in page.bleedbox.as_list()]
|
||||
self._artbox = [float(d) for d in page.artbox.as_list()]
|
||||
self._bleedbox = [float(d) for d in page.bleedbox.as_list()]
|
||||
self._cropbox = [float(d) for d in page.cropbox.as_list()]
|
||||
self._mediabox = [float(d) for d in page.mediabox.as_list()]
|
||||
self._trimbox = [float(d) for d in page.trimbox.as_list()]
|
||||
@ -1039,6 +1039,16 @@ class PageInfo:
|
||||
"""Return trimbox of page in PDF coordinates."""
|
||||
return self._trimbox
|
||||
|
||||
@property
|
||||
def artbox(self) -> FloatRect:
|
||||
"""Return artbox of page in PDF coordinates."""
|
||||
return self._artbox
|
||||
|
||||
@property
|
||||
def bleedbox(self) -> FloatRect:
|
||||
"""Return bleedbox of page in PDF coordinates."""
|
||||
return self._bleedbox
|
||||
|
||||
@property
|
||||
def images(self) -> list[ImageInfo]:
|
||||
"""Return images."""
|
||||
|
||||
@ -492,7 +492,7 @@ def generate_pdfa(
|
||||
pdf_version: The minimum PDF version that the output file should be.
|
||||
At its own discretion, the PDF/A generator may raise the version,
|
||||
but should not lower it.
|
||||
pdfa_part: The desired PDF/A compliance level, such as ``'2B'``.
|
||||
pdfa_part: The desired PDF/A compliance level, such as ``'2b'``.
|
||||
progressbar_class: The class of a progress bar, which must implement
|
||||
the ProgressBar protocol. If None, no progress is reported.
|
||||
stop_on_soft_error: If there is a "soft error" such that PDF/A generation
|
||||
|
||||
@ -6,6 +6,7 @@ from __future__ import annotations
|
||||
import logging
|
||||
import secrets
|
||||
import subprocess
|
||||
import sys
|
||||
from decimal import Decimal
|
||||
from unittest.mock import patch
|
||||
|
||||
@ -16,6 +17,7 @@ from PIL import Image, UnidentifiedImageError
|
||||
|
||||
from ocrmypdf._exec import ghostscript
|
||||
from ocrmypdf._exec.ghostscript import DuplicateFilter, rasterize_pdf
|
||||
from ocrmypdf.builtin_plugins.ghostscript import _repair_gs106_jpeg_corruption
|
||||
from ocrmypdf.exceptions import ColorConversionNeededError, ExitCode, InputFileError
|
||||
from ocrmypdf.helpers import Resolution
|
||||
|
||||
@ -165,6 +167,10 @@ class TestDuplicateFilter:
|
||||
logger.addFilter(DuplicateFilter(logger))
|
||||
return logger
|
||||
|
||||
@pytest.mark.xfail(
|
||||
(3, 13, 3) <= sys.version_info[:3] <= (3, 13, 5),
|
||||
reason="https://github.com/python/cpython/pull/135858",
|
||||
)
|
||||
def test_filter_duplicate_messages(self, duplicate_filter_logger, caplog):
|
||||
log = duplicate_filter_logger
|
||||
log.error("test error message")
|
||||
@ -194,6 +200,10 @@ class TestDuplicateFilter:
|
||||
assert caplog.records[1].msg == "another error message"
|
||||
assert caplog.records[2].msg == "yet another error message"
|
||||
|
||||
@pytest.mark.xfail(
|
||||
(3, 13, 3) <= sys.version_info[:3] <= (3, 13, 5),
|
||||
reason="https://github.com/python/cpython/pull/135858",
|
||||
)
|
||||
def test_filter_alt_messages(self, duplicate_filter_logger, caplog):
|
||||
log = duplicate_filter_logger
|
||||
log.error("test error message")
|
||||
@ -278,3 +288,120 @@ def test_recoverable_image_error_with_stop(pdf_with_invalid_image, outdir, caplo
|
||||
stop_on_error=True,
|
||||
)
|
||||
# out2.png will not be created; if it were it would be blank.
|
||||
|
||||
|
||||
class TestGs106JpegCorruptionRepair:
    """Test the Ghostscript 10.6 JPEG corruption repair function."""

    @staticmethod
    def _dct_image_streams(pdf):
        """Return the DCTDecode image XObjects listed in each page's resources.

        Only XObjects named directly in a page's resource dictionary are
        returned, in page order; Form XObjects are not descended into.
        """
        Name = pikepdf.Name
        streams = []
        for page in pdf.pages:
            if Name.Resources not in page:
                continue
            resources_dict = page[Name.Resources]
            if Name.XObject not in resources_dict:
                continue
            xobjects = resources_dict[Name.XObject]
            for key in xobjects.keys():
                obj = xobjects[key]
                if obj.get(Name.Subtype) != Name.Image:
                    continue
                if obj.get(Name.Filter) != Name.DCTDecode:
                    continue
                streams.append(obj)
        return streams

    @pytest.fixture
    def create_damaged_pdf(self, resources, outdir):
        """Return a factory that writes a PDF whose JPEG data is truncated.

        The factory returns (source_path, damaged_path, damaged_count).
        """

        def _create_damaged(source_pdf_name='francais.pdf', truncate_bytes=2):
            source_path = resources / source_pdf_name
            damaged_path = outdir / 'damaged.pdf'

            with pikepdf.open(source_path) as pdf:
                damaged_count = 0
                for obj in self._dct_image_streams(pdf):
                    # Truncate the JPEG data
                    original_bytes = obj.read_raw_bytes()
                    truncated_bytes = original_bytes[:-truncate_bytes]
                    obj.write(truncated_bytes, filter=pikepdf.Name.DCTDecode)
                    damaged_count += 1

                pdf.save(damaged_path)
            return source_path, damaged_path, damaged_count

        return _create_damaged

    def test_repair_truncated_jpeg(self, create_damaged_pdf, caplog):
        """Test that truncated JPEG images are repaired."""
        caplog.set_level(logging.DEBUG)
        source_path, damaged_path, damaged_count = create_damaged_pdf()

        assert damaged_count > 0, "Test PDF should have DCTDecode images"

        # Get original image bytes for comparison
        with pikepdf.open(source_path) as pdf:
            original_bytes_list = [
                obj.read_raw_bytes() for obj in self._dct_image_streams(pdf)
            ]

        # Run the repair function
        repaired = _repair_gs106_jpeg_corruption(source_path, damaged_path)
        assert repaired is True, "Repair should have been performed"

        # Verify the repaired PDF has correct image bytes
        with pikepdf.open(damaged_path) as pdf:
            repaired_bytes_list = [
                obj.read_raw_bytes() for obj in self._dct_image_streams(pdf)
            ]

        assert len(repaired_bytes_list) == len(original_bytes_list)
        for orig, repaired_bytes in zip(original_bytes_list, repaired_bytes_list):
            assert orig == repaired_bytes, "Repaired bytes should match original"

        # Check that error/warning was logged
        assert "JPEG corruption detected" in caplog.text

    def test_no_repair_when_not_truncated(self, resources, outdir, caplog):
        """Test that no repair is done when images are not truncated."""
        caplog.set_level(logging.DEBUG)
        source_path = resources / 'francais.pdf'

        # Copy source to output (no damage)
        output_path = outdir / 'undamaged.pdf'
        with pikepdf.open(source_path) as pdf:
            pdf.save(output_path)

        # Run the repair function - should not repair anything
        repaired = _repair_gs106_jpeg_corruption(source_path, output_path)
        assert repaired is False, "No repair should have been performed"
        assert "JPEG corruption detected" not in caplog.text

    def test_no_repair_when_truncation_too_large(self, create_damaged_pdf, caplog):
        """Test that images truncated by more than 15 bytes are not repaired."""
        caplog.set_level(logging.DEBUG)
        source_path, damaged_path, _ = create_damaged_pdf(truncate_bytes=20)

        repaired = _repair_gs106_jpeg_corruption(source_path, damaged_path)
        assert repaired is False, "Should not repair truncation > 15 bytes"
        assert "JPEG corruption detected" not in caplog.text
|
||||
|
||||
@ -801,7 +801,7 @@ def test_pdfa_n(pdfa_level, resources, outpdf):
|
||||
)
|
||||
|
||||
pdfa_info = file_claims_pdfa(outpdf)
|
||||
assert pdfa_info['conformance'] == f'PDF/A-{pdfa_level}B'
|
||||
assert pdfa_info['conformance'] == f'PDF/A-{pdfa_level}b'
|
||||
|
||||
|
||||
def test_decompression_bomb_error(resources, outpdf, caplog):
|
||||
|
||||
@ -6,7 +6,7 @@ from __future__ import annotations
|
||||
from io import BytesIO
|
||||
from os import fspath
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
from unittest.mock import patch
|
||||
|
||||
import img2pdf
|
||||
import pikepdf
|
||||
@ -220,7 +220,7 @@ def test_find_formx(resources):
|
||||
|
||||
|
||||
def test_extract_image_filter_with_pdf_image():
|
||||
image = MagicMock()
|
||||
image = Dictionary()
|
||||
image.Subtype = Name.Image
|
||||
image.Length = 200
|
||||
image.Width = 10
|
||||
@ -235,20 +235,20 @@ def test_extract_image_filter_with_pdf_image():
|
||||
|
||||
|
||||
def test_extract_image_filter_with_non_image():
|
||||
image = MagicMock()
|
||||
image = Dictionary()
|
||||
image.Subtype = Name.Form
|
||||
assert extract_image_filter(image, None) is None
|
||||
|
||||
|
||||
def test_extract_image_filter_with_small_stream_size():
|
||||
image = MagicMock()
|
||||
image = Dictionary()
|
||||
image.Subtype = Name.Image
|
||||
image.Length = 50
|
||||
assert extract_image_filter(image, None) is None
|
||||
|
||||
|
||||
def test_extract_image_filter_with_small_dimensions():
|
||||
image = MagicMock()
|
||||
image = Dictionary()
|
||||
image.Subtype = Name.Image
|
||||
image.Length = 200
|
||||
image.Width = 5
|
||||
@ -257,7 +257,7 @@ def test_extract_image_filter_with_small_dimensions():
|
||||
|
||||
|
||||
def test_extract_image_filter_with_multiple_compression_filters():
|
||||
image = MagicMock()
|
||||
image = Dictionary()
|
||||
image.Subtype = Name.Image
|
||||
image.Length = 200
|
||||
image.Width = 10
|
||||
@ -268,7 +268,7 @@ def test_extract_image_filter_with_multiple_compression_filters():
|
||||
|
||||
|
||||
def test_extract_image_filter_with_wide_gamut_image():
|
||||
image = MagicMock()
|
||||
image = Dictionary()
|
||||
image.Subtype = Name.Image
|
||||
image.Length = 200
|
||||
image.Width = 10
|
||||
@ -296,7 +296,7 @@ def test_extract_image_filter_with_jpeg2000_image():
|
||||
|
||||
|
||||
def test_extract_image_filter_with_ccitt_group_3_image():
|
||||
image = MagicMock()
|
||||
image = Dictionary()
|
||||
image.Subtype = Name.Image
|
||||
image.Length = 200
|
||||
image.Width = 10
|
||||
@ -309,7 +309,7 @@ def test_extract_image_filter_with_ccitt_group_3_image():
|
||||
|
||||
# Triggers pikepdf bug
|
||||
# def test_extract_image_filter_with_decode_table():
|
||||
# image = MagicMock()
|
||||
# image = Dictionary()
|
||||
# image.Subtype = Name.Image
|
||||
# image.Length = 200
|
||||
# image.Width = 10
|
||||
@ -319,3 +319,26 @@ def test_extract_image_filter_with_ccitt_group_3_image():
|
||||
# image.ColorSpace = Name.DeviceGray
|
||||
# image.Decode = [42, 0]
|
||||
# assert extract_image_filter(image, None) is None
|
||||
|
||||
|
||||
def test_extract_image_filter_with_rgb_smask_matte():
    """An RGB image whose SMask has a Matte entry must not be extracted."""
    soft_mask = Dictionary(
        Type=Name.Image,
        Subtype=Name.Image,
        Length=200,
        Width=10,
        Height=10,
        Filter=Name.FlateDecode,
        BitsPerComponent=8,
        ColorSpace=Name.DeviceGray,
        Matte=Array([1, 2, 3]),
    )
    image = Dictionary(
        Subtype=Name.Image,
        Length=200,
        Width=10,
        Height=10,
        Filter=Name.FlateDecode,
        BitsPerComponent=8,
        ColorSpace=Name.DeviceRGB,
        SMask=soft_mask,
    )
    assert extract_image_filter(image, None) is None
|
||||
|
||||
122
tests/test_page_boxes.py
Normal file
122
tests/test_page_boxes.py
Normal file
@ -0,0 +1,122 @@
|
||||
# SPDX-FileCopyrightText: 2025 James R. Barlow
|
||||
# SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pikepdf
|
||||
import pytest
|
||||
|
||||
from .conftest import check_ocrmypdf
|
||||
|
||||
page_rect = [0, 0, 612, 792]
|
||||
inset_rect = [200, 200, 612, 792]
|
||||
wh_rect = [0, 0, 412, 592]
|
||||
|
||||
neg_rect = [-100, -100, 512, 692]
|
||||
|
||||
mediabox_testdata = [
|
||||
('hocr', 'pdfa', 'ccitt.pdf', None, inset_rect, wh_rect),
|
||||
('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, wh_rect),
|
||||
('hocr', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
|
||||
('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
|
||||
(
|
||||
'hocr',
|
||||
'pdfa',
|
||||
'ccitt.pdf',
|
||||
'--force-ocr',
|
||||
inset_rect,
|
||||
wh_rect,
|
||||
),
|
||||
(
|
||||
'hocr',
|
||||
'pdf',
|
||||
'ccitt.pdf',
|
||||
'--force-ocr',
|
||||
inset_rect,
|
||||
wh_rect,
|
||||
),
|
||||
('hocr', 'pdfa', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
|
||||
('hocr', 'pdf', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', mediabox_testdata
)
def test_media_box(
    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
):
    """Set the first page's MediaBox, run OCRmyPDF, and check the output MediaBox."""
    cropped_pdf = outdir / 'cropped.pdf'
    processed_pdf = outdir / 'processed.pdf'

    with pikepdf.open(resources / in_pdf) as pdf:
        pdf.pages[0].MediaBox = crop_to
        pdf.save(cropped_pdf)

    # The optional mode flag (e.g. --force-ocr) goes last when present
    extra_args = [mode] if mode else []
    check_ocrmypdf(
        cropped_pdf,
        processed_pdf,
        '--jobs',
        '1',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
        *extra_args,
    )

    with pikepdf.open(processed_pdf) as pdf:
        assert pdf.pages[0].MediaBox == crop_expected
|
||||
|
||||
|
||||
cropbox_testdata = [
|
||||
('hocr', 'pdfa', 'ccitt.pdf', None, inset_rect, inset_rect),
|
||||
('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, inset_rect),
|
||||
('hocr', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
|
||||
('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
|
||||
(
|
||||
'hocr',
|
||||
'pdfa',
|
||||
'ccitt.pdf',
|
||||
'--force-ocr',
|
||||
inset_rect,
|
||||
inset_rect,
|
||||
),
|
||||
(
|
||||
'hocr',
|
||||
'pdf',
|
||||
'ccitt.pdf',
|
||||
'--force-ocr',
|
||||
inset_rect,
|
||||
inset_rect,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', cropbox_testdata
)
def test_crop_box(
    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
):
    """Set the first page's CropBox, run OCRmyPDF, and check the output CropBox."""
    cropped_pdf = outdir / 'cropped.pdf'
    processed_pdf = outdir / 'processed.pdf'

    with pikepdf.open(resources / in_pdf) as pdf:
        pdf.pages[0].CropBox = crop_to
        pdf.save(cropped_pdf)

    # The optional mode flag (e.g. --force-ocr) goes last when present
    extra_args = [mode] if mode else []
    check_ocrmypdf(
        cropped_pdf,
        processed_pdf,
        '--jobs',
        '1',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
        '--optimize',
        '0',
        *extra_args,
    )

    with pikepdf.open(processed_pdf) as pdf:
        assert pdf.pages[0].CropBox == crop_expected
|
||||
@ -51,20 +51,20 @@ def compare_images_monochrome(
|
||||
|
||||
with Image.open(reference_png) as reference_im, Image.open(test_png) as test_im:
|
||||
assert reference_im.mode == test_im.mode == '1'
|
||||
assert reference_im.size == test_im.size, "Images must be the same size"
|
||||
|
||||
# XOR the images: matching pixels become 0, different pixels become 1
|
||||
difference = ImageChops.logical_xor(reference_im, test_im)
|
||||
assert difference.mode == '1'
|
||||
|
||||
histogram = difference.histogram()
|
||||
assert (
|
||||
len(histogram) == 256
|
||||
), "Expected Pillow to convert to grayscale for histogram"
|
||||
|
||||
# All entries other than first and last will be 0
|
||||
count_same = histogram[0]
|
||||
count_different = histogram[-1]
|
||||
# Count matching pixels directly using getcolors()
|
||||
# For a binary image, getcolors returns [(count, 0), (count, 1)] or subset
|
||||
colors = difference.getcolors()
|
||||
color_counts = {color: count for count, color in colors}
|
||||
count_same = color_counts.get(0, 0) # 0 = matching pixels (XOR result is 0)
|
||||
count_different = color_counts.get(255, 0) # 255 = different pixels
|
||||
total = count_same + count_different
|
||||
|
||||
return count_same / (total)
|
||||
return count_same / total
|
||||
|
||||
|
||||
def test_monochrome_comparison(resources, outdir):
|
||||
@ -211,7 +211,7 @@ def test_rotate_deskew_ocr_timeout(resources, outdir):
|
||||
assert cmp > 0.95
|
||||
|
||||
|
||||
def make_rotate_test(imagefile, outdir, prefix, image_angle, page_angle):
|
||||
def make_rotate_test(imagefile, outdir, prefix, image_angle, page_angle, cropbox=None):
|
||||
memimg = BytesIO()
|
||||
with Image.open(fspath(imagefile)) as im:
|
||||
if image_angle != 0:
|
||||
@ -230,6 +230,8 @@ def make_rotate_test(imagefile, outdir, prefix, image_angle, page_angle):
|
||||
with pikepdf.open(mempdf) as pdf:
|
||||
pdf.pages[0].Rotate = page_angle
|
||||
target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
|
||||
if cropbox:
|
||||
pdf.pages[0].CropBox = cropbox
|
||||
pdf.save(target)
|
||||
return target
|
||||
|
||||
@ -284,6 +286,44 @@ def test_page_rotate_tag(page_rotate_angle, resources, outdir, caplog):
|
||||
assert 'is a' in test_text, test_text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('page_rotate_angle', (0, 90, 180, 270))
@pytest.mark.parametrize('renderer', ['sandwich', 'hocr'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_rotate_and_crop(
    resources, outdir, page_rotate_angle, renderer, output_type, caplog
):
    """Check page rotation handling on a page that also has a CropBox.

    A reference PDF is built with no rotation. The test PDF has its image
    rotated by ``-page_rotate_angle`` and its page rotated by
    ``page_rotate_angle``; both share the same CropBox. After ocrmypdf runs
    with --rotate-pages, page 1 of the output is compared against page 1 of
    the reference and must score above 0.9.
    """
    cropbox = (100, 200, 1000, 800)
    source_image = resources / 'typewriter.png'

    reference = make_rotate_test(source_image, outdir, 'ref', 0, 0, cropbox)
    test = make_rotate_test(
        source_image, outdir, 'test', -page_rotate_angle, page_rotate_angle, cropbox
    )
    out = test.with_suffix('.out.pdf')

    cli_args = [
        '-O0',
        '--rotate-pages',
        '--rotate-pages-threshold', '0',
        '--pdf-renderer', renderer,
        '--output-type', output_type,
        '--no-progress-bar',
    ]  # fmt: skip
    exitcode = run_ocrmypdf_api(test, out, *cli_args)
    assert exitcode == 0, caplog.text

    similarity = compare_images_monochrome(outdir, reference, 1, out, 1)
    assert similarity > 0.9
|
||||
|
||||
|
||||
def test_rasterize_rotates(resources, tmp_path):
|
||||
pm = get_plugin_manager([])
|
||||
|
||||
|
||||
@ -3,6 +3,8 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
from ocrmypdf.exceptions import ExitCode
|
||||
@ -11,16 +13,21 @@ from .conftest import is_linux, run_ocrmypdf_api
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_linux(), reason='semfree plugin only works on Linux')
|
||||
@pytest.mark.skipif(
|
||||
sys.version_info >= (3, 14),
|
||||
reason='semfree plugin only works on Python 3.13 or earlier',
|
||||
)
|
||||
def test_semfree(resources, outpdf):
|
||||
exitcode = run_ocrmypdf_api(
|
||||
resources / 'multipage.pdf',
|
||||
outpdf,
|
||||
'--skip-text',
|
||||
'--skip-big',
|
||||
'2',
|
||||
'--plugin',
|
||||
'ocrmypdf.extra_plugins.semfree',
|
||||
'--plugin',
|
||||
'tests/plugins/tesseract_noop.py',
|
||||
)
|
||||
assert exitcode in (ExitCode.ok, ExitCode.pdfa_conversion_failed)
|
||||
with pytest.warns(DeprecationWarning, match="semfree.py is deprecated"):
|
||||
exitcode = run_ocrmypdf_api(
|
||||
resources / 'multipage.pdf',
|
||||
outpdf,
|
||||
'--skip-text',
|
||||
'--skip-big',
|
||||
'2',
|
||||
'--plugin',
|
||||
'ocrmypdf.extra_plugins.semfree',
|
||||
'--plugin',
|
||||
'tests/plugins/tesseract_noop.py',
|
||||
)
|
||||
assert exitcode in (ExitCode.ok, ExitCode.pdfa_conversion_failed)
|
||||
|
||||
@ -48,6 +48,7 @@ def test_stdout(ocrmypdf_exec, resources, outpdf):
|
||||
assert check_pdf(output_file)
|
||||
|
||||
|
||||
@pytest.mark.skipif(os.name == 'nt', reason='Windows does not support /dev/null')
|
||||
def test_dev_null(resources):
|
||||
if 'COV_CORE_DATAFILE' in os.environ:
|
||||
pytest.skip("Coverage uses stdout")
|
||||
|
||||
54
tests/test_watcher.py
Normal file
54
tests/test_watcher.py
Normal file
@ -0,0 +1,54 @@
|
||||
import datetime
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
watchdog = pytest.importorskip('watchdog')
|
||||
|
||||
|
||||
@pytest.mark.parametrize('year_month', [True, False])
def test_watcher(tmp_path, resources, year_month):
    """Run misc/watcher.py as a subprocess and check it processes a new file.

    The watcher script is started with input, output and processed
    directories under ``tmp_path``. ``trivial.pdf`` is then copied into the
    input directory and the test expects a file of the same name to appear in
    the output directory, or in a ``YYYY/MM`` subdirectory of it when
    ``OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1`` is set in the environment.

    The subprocess is always terminated, even when an assertion fails, so a
    failing test does not leave a watcher process running.
    """
    input_dir = tmp_path / 'input'
    output_dir = tmp_path / 'output'
    processed_dir = tmp_path / 'processed'
    for directory in (input_dir, output_dir, processed_dir):
        directory.mkdir()

    env_extra = {'OCR_OUTPUT_DIRECTORY_YEAR_MONTH': '1'} if year_month else {}
    watcher_script = Path(__file__).parent.parent / 'misc' / 'watcher.py'

    proc = subprocess.Popen(
        [
            sys.executable,
            str(watcher_script),
            str(input_dir),
            str(output_dir),
            str(processed_dir),
        ],
        cwd=str(tmp_path),
        env=os.environ.copy() | env_extra,
    )
    try:
        # Give the watcher time to start before dropping a file in.
        time.sleep(5)

        shutil.copy(resources / 'trivial.pdf', input_dir / 'trivial.pdf')
        # Give the watcher time to notice and process the file.
        time.sleep(5)

        if year_month:
            # Read the date once so year and month cannot come from different
            # days if the test happens to straddle midnight.
            today = datetime.date.today()
            expected = (
                output_dir / f'{today.year}' / f'{today.month:02d}' / 'trivial.pdf'
            )
        else:
            expected = output_dir / 'trivial.pdf'
        assert expected.exists()
    finally:
        # Always stop the child; escalate to kill if it ignores terminate.
        proc.terminate()
        try:
            proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
|
||||
Loading…
x
Reference in New Issue
Block a user