OCRmyPDF/tests/test_tesseract.py

# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import logging
import os
import subprocess
from os import fspath
from pathlib import Path

import pytest

from ocrmypdf import pdfinfo
from ocrmypdf._exec import tesseract
from ocrmypdf.exceptions import BadArgsError, ExitCode, MissingDependencyError

from .conftest import check_ocrmypdf, run_ocrmypdf_api

# pylint: disable=redefined-outer-name


@pytest.mark.parametrize('basename', ['graph_ocred.pdf', 'cardinal.pdf'])
def test_skip_pages_does_not_replicate(resources, basename, outdir):
    """Check that output pages keep one image each and their original size.

    Runs OCRmyPDF with the ``sandwich`` renderer, ``--force-ocr`` and a
    ``--tesseract-timeout`` of ``0`` (presumably so that every page is
    skipped by Tesseract -- confirm against the timeout handling), then
    compares the output page info against the input.
    """
    source_pdf = resources / basename
    result_pdf = outdir / basename

    cli_args = [
        '--pdf-renderer',
        'sandwich',
        '--force-ocr',
        '--tesseract-timeout',
        '0',
    ]
    check_ocrmypdf(source_pdf, result_pdf, *cli_args)

    pages_before = pdfinfo.PdfInfo(source_pdf)
    pages_after = pdfinfo.PdfInfo(result_pdf)

    # Every output page must still carry exactly one image.
    for page in pages_after:
        assert len(page.images) == 1, "skipped page was replicated"

    # Page dimensions must be carried over unchanged from the input.
    for index, page_after in enumerate(pages_after):
        page_before = pages_before[index]
        assert page_after.width_inches == page_before.width_inches, "output resized"
        assert page_after.height_inches == page_before.height_inches, "output resized"


def test_content_preservation(resources, outpdf):
    """Check that ``masks.pdf`` keeps more than one image on its first page.

    The file is processed with the ``hocr`` renderer and a
    ``--tesseract-timeout`` of ``0``; a first page with a single image is
    reported as "masks were rasterized".
    """
    check_ocrmypdf(
        resources / 'masks.pdf',
        outpdf,
        '--pdf-renderer',
        'hocr',
        '--tesseract-timeout',
        '0',
    )

    first_page = pdfinfo.PdfInfo(outpdf)[0]
    assert len(first_page.images) > 1, "masks were rasterized"


@pytest.mark.skipif(
    tesseract.version() >= tesseract.TesseractVersion('5'),
    reason="doesn't fool Tess 5",
)
def test_no_languages(tmp_path, monkeypatch):
    """Expect ``MissingDependencyError`` when the tessdata directory is empty.

    ``TESSDATA_PREFIX`` is pointed at a temporary directory that contains
    only an empty ``tessdata`` folder before the languages are queried.
    Skipped when the installed Tesseract reports version 5 or newer.
    """
    empty_tessdata = tmp_path / 'tessdata'
    empty_tessdata.mkdir()
    monkeypatch.setenv('TESSDATA_PREFIX', fspath(tmp_path))

    with pytest.raises(MissingDependencyError):
        tesseract.get_languages()


def test_image_too_large_hocr(monkeypatch, resources, outdir):
    """Check hOCR generation when Tesseract fails with 'Image too large'.

    ``tesseract.run`` is replaced by a stub that always raises
    ``CalledProcessError`` carrying that message; the resulting hOCR file
    is expected to be empty.
    """
    hocr_path = outdir / 'out.hocr'
    text_path = outdir / 'out.txt'

    def fail_image_too_large(args, *, env=None, **kwargs):
        # Stand-in for tesseract.run: never launches a process.
        raise subprocess.CalledProcessError(1, 'tesseract', output=b'Image too large')

    monkeypatch.setattr(tesseract, 'run', fail_image_too_large)

    tesseract.generate_hocr(
        input_file=resources / 'crom.png',
        output_hocr=hocr_path,
        output_text=text_path,
        languages=['eng'],
        engine_mode=None,
        tessconfig=[],
        timeout=180.0,
        pagesegmode=None,
        thresholding=0,
        user_words=None,
        user_patterns=None,
    )

    hocr_contents = Path(hocr_path).read_text()
    assert hocr_contents == ''


def test_image_too_large_pdf(monkeypatch, resources, outdir):
    """Check PDF generation when Tesseract fails with 'Image too large'.

    ``tesseract.run`` is replaced by a stub that always raises
    ``CalledProcessError`` carrying that message. The text output must
    then read ``[skipped page]`` and, except on Windows, the PDF output
    must be a zero-byte file.
    """
    pdf_path = outdir / 'pdf.pdf'
    text_path = outdir / 'txt.txt'

    def fail_image_too_large(args, *, env=None, **kwargs):
        # Stand-in for tesseract.run: never launches a process.
        raise subprocess.CalledProcessError(1, 'tesseract', output=b'Image too large')

    monkeypatch.setattr(tesseract, 'run', fail_image_too_large)

    tesseract.generate_pdf(
        input_file=resources / 'crom.png',
        output_pdf=pdf_path,
        output_text=text_path,
        languages=['eng'],
        engine_mode=None,
        tessconfig=[],
        timeout=180.0,
        pagesegmode=None,
        thresholding=0,
        user_words=None,
        user_patterns=None,
    )

    assert Path(text_path).read_text() == '[skipped page]'

    if os.name == 'nt':
        # Different semantics on Windows: the size check is not applied.
        return
    assert Path(pdf_path).stat().st_size == 0


def test_timeout(caplog):
    """Check that ``page_timedout`` logs a message containing 'took too long'."""
    tesseract.page_timedout(5)
    captured = caplog.text
    assert "took too long" in captured


@pytest.mark.parametrize(
    'in_, logged',
    [
        (b'Tesseract Open Source', ''),
        (b'lots of diacritics blah blah', 'diacritics'),
        (b'Warning in pixReadMem', ''),
        (b'OSD: Weak margin', 'unsure about page orientation'),
        (b'Error in pixScanForForeground', ''),
        (b'Error in boxClipToRectangle', ''),
        (b'an unexpected error', 'an unexpected error'),
        (b'a dire warning', 'a dire warning'),
        (b'read_params_file something', 'read_params_file'),
        (b'an innocent message', 'innocent'),
        (b'\x7f\x7f\x80innocent unicode failure', 'innocent'),
    ],
)
def test_tesseract_log_output(caplog, in_, logged):
    """Check what ``tesseract_log_output`` logs for each raw Tesseract line.

    ``in_`` is the byte string passed in; ``logged`` is a substring that
    must appear in the captured log at INFO level or above. An empty
    ``logged`` means nothing at all may be logged.
    """
    caplog.set_level(logging.INFO)
    tesseract.tesseract_log_output(in_)

    if logged:
        assert logged in caplog.text
    else:
        assert caplog.text == ''


def test_tesseract_log_output_raises(caplog):
    """Check that a 'parameter not found' line raises and is also logged.

    ``tesseract_log_output`` must raise ``TesseractConfigError`` for the
    message, and the captured log must contain 'not found'.
    """
    bad_parameter_line = b'parameter not found: moo'
    with pytest.raises(tesseract.TesseractConfigError):
        tesseract.tesseract_log_output(bad_parameter_line)
    assert 'not found' in caplog.text


def test_blocked_language(resources, no_outpdf):
    """Check that ``-l osd`` and ``-l equ`` are each rejected with ``BadArgsError``."""
    masks_pdf = resources / 'masks.pdf'
    for blocked in ('osd', 'equ'):
        with pytest.raises(BadArgsError):
            run_ocrmypdf_api(masks_pdf, no_outpdf, '-l', blocked)
Change to SPDX license tracking 2022-07-28 01:06:46 -07:00			`# SPDX-FileCopyrightText: 2022 James R. Barlow`
			`# SPDX-License-Identifier: MPL-2.0`
Rename ‘tesstop’ to ‘tess4’ There’s no reason text-only PDF shouldn’t become the default for tesseract 4. 2017-01-26 12:28:51 -08:00
Modernize type annotations 2022-07-23 00:39:24 -07:00			`from __future__ import annotations`

tests: improve tesseract coverage 2020-01-04 02:35:14 -08:00			`import logging`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00			`import os`
tests: improve tesseract coverage 2020-01-04 02:35:14 -08:00			`import subprocess`
Sort imports with isort 2018-12-30 01:28:15 -08:00			`from os import fspath`
Remove the OCRMYPDF_program environment variables Really, this was just replicating the functionality of the PATH environment variable, and users probably do that anyway. 2018-03-24 15:07:02 -07:00			`from pathlib import Path`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00
Drop support for Python 3.5 2018-12-30 00:23:26 -08:00			`import pytest`

			`from ocrmypdf import pdfinfo`
Rename ocrmypdf.exec -> ocrmypdf._exec 2020-06-09 14:55:54 -07:00			`from ocrmypdf._exec import tesseract`
More Tesseract-specific language checks to its plugin 2024-06-01 00:15:50 -07:00			`from ocrmypdf.exceptions import BadArgsError, ExitCode, MissingDependencyError`
Drop support for Python 3.5 2018-12-30 00:23:26 -08:00
More Tesseract-specific language checks to its plugin 2024-06-01 00:15:50 -07:00			`from .conftest import check_ocrmypdf, run_ocrmypdf_api`
Fix issue #147: unpaper loses DPI information, affects --pdf-renderer tess4 2017-03-24 13:23:03 -07:00
Remove pytest_helpers_namespace 2021-04-07 01:56:51 -07:00			`# pylint: disable=redefined-outer-name`
Rename ‘tesstop’ to ‘tess4’ There’s no reason text-only PDF shouldn’t become the default for tesseract 4. 2017-01-26 12:28:51 -08:00

Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00			`@pytest.mark.parametrize('basename', ['graph_ocred.pdf', 'cardinal.pdf'])`
test: remove test code that support tess3 or tess4 testing 2019-06-03 01:33:24 -07:00			`def test_skip_pages_does_not_replicate(resources, basename, outdir):`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00			`infile = resources / basename`
			`outpdf = outdir / basename`

			`check_ocrmypdf(`
			`infile,`
Reformat with black 2018-12-30 01:27:49 -08:00			`outpdf,`
			`'--pdf-renderer',`
			`'sandwich',`
			`'--force-ocr',`
			`'--tesseract-timeout',`
			`'0',`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00			`)`

Rename pageinfo to pdfinfo 2017-05-19 15:48:23 -07:00			`info_in = pdfinfo.PdfInfo(infile)`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00
Rename pageinfo to pdfinfo 2017-05-19 15:48:23 -07:00			`info = pdfinfo.PdfInfo(outpdf)`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00			`for page in info:`
Fix tess4 test using old-style pageinfo API 2017-05-29 13:51:21 -07:00			`assert len(page.images) == 1, "skipped page was replicated"`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00
Delinting 2020-05-03 00:51:17 -07:00			`for n, info_out_n in enumerate(info):`
Create raster PDF pages to match input page size Previously we produced a raster image, then multiplied image width by DPI to get the page size. However if there is rounding the page size may not match exactly. In this modified approach we constrain the page size to match. 2021-01-08 15:10:43 -08:00			`assert info_out_n.width_inches == info_in[n].width_inches, "output resized"`
			`assert info_out_n.height_inches == info_in[n].height_inches, "output resized"`
Enable lossless reconstruction for --pdf-renderer tess4 where appropriate 2017-03-29 23:44:12 -07:00

test: remove test code that support tess3 or tess4 testing 2019-06-03 01:33:24 -07:00			`def test_content_preservation(resources, outpdf):`
Enable lossless reconstruction for —pdf-renderer tess4 where appropriate 2017-03-29 23:44:12 -07:00			`infile = resources / 'masks.pdf'`

hOCR renderer is now default 2023-12-02 19:58:00 -08:00			`check_ocrmypdf(infile, outpdf, '--pdf-renderer', 'hocr', '--tesseract-timeout', '0')`
Enable lossless reconstruction for —pdf-renderer tess4 where appropriate 2017-03-29 23:44:12 -07:00
Rename pageinfo to pdfinfo 2017-05-19 15:48:23 -07:00			`info = pdfinfo.PdfInfo(outpdf)`
Enable lossless reconstruction for —pdf-renderer tess4 where appropriate 2017-03-29 23:44:12 -07:00			`page = info[0]`
lint: Remove shebangs from non-executable files 2018-02-24 12:38:58 -08:00			`assert len(page.images) > 1, "masks were rasterized"`
Fix "no languages" test and misuse of os.environ 2018-11-09 01:40:01 -08:00

Overhaul version checkers to prefer Version to str 2023-09-25 00:59:16 -07:00			`@pytest.mark.skipif(`
Skip fewer tests 2023-10-16 00:41:25 -07:00			`tesseract.version() >= tesseract.TesseractVersion('5'), reason="doesn't fool Tess 5"`
Overhaul version checkers to prefer Version to str 2023-09-25 00:59:16 -07:00			`)`
Remove tesseract_env, --tesseract-env 2020-06-09 00:39:53 -07:00			`def test_no_languages(tmp_path, monkeypatch):`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`(tmp_path / 'tessdata').mkdir()`
Remove tesseract_env, --tesseract-env 2020-06-09 00:39:53 -07:00			`monkeypatch.setenv('TESSDATA_PREFIX', fspath(tmp_path))`
test: remove test code that support tess3 or tess4 testing 2019-06-03 01:33:24 -07:00			`with pytest.raises(MissingDependencyError):`
Remove tesseract_env, --tesseract-env 2020-06-09 00:39:53 -07:00			`tesseract.get_languages()`
tests: improve tesseract coverage 2020-01-04 02:35:14 -08:00

			`def test_image_too_large_hocr(monkeypatch, resources, outdir):`
			`def dummy_run(args, *, env=None, **kwargs):`
			`raise subprocess.CalledProcessError(1, 'tesseract', output=b'Image too large')`

			`monkeypatch.setattr(tesseract, 'run', dummy_run)`
			`tesseract.generate_hocr(`
			`input_file=resources / 'crom.png',`
tesseract.py: api cleanup 2020-05-06 12:37:44 -07:00			`output_hocr=outdir / 'out.hocr',`
Standardize tesseract.generate_hocr and _pdf parameters 2020-05-14 03:23:25 -07:00			`output_text=outdir / 'out.txt',`
			`languages=['eng'],`
tests: improve tesseract coverage 2020-01-04 02:35:14 -08:00			`engine_mode=None,`
			`tessconfig=[],`
			`timeout=180.0,`
			`pagesegmode=None,`
Add new argument --tesseract-thresholding to control tesseract thresholding where available Also add missing test for --tesseract-oem 2021-12-04 16:52:23 -08:00			`thresholding=0,`
tests: improve tesseract coverage 2020-01-04 02:35:14 -08:00			`user_words=None,`
			`user_patterns=None,`
			`)`
Use empty .hocr file instead of dummy template for symmetry with sandwich 2023-10-14 14:41:11 -07:00			`assert Path(outdir / 'out.hocr').read_text() == ''`
tests: improve tesseract coverage 2020-01-04 02:35:14 -08:00

			`def test_image_too_large_pdf(monkeypatch, resources, outdir):`
			`def dummy_run(args, *, env=None, **kwargs):`
			`raise subprocess.CalledProcessError(1, 'tesseract', output=b'Image too large')`

			`monkeypatch.setattr(tesseract, 'run', dummy_run)`
			`tesseract.generate_pdf(`
Standardize tesseract.generate_hocr and _pdf parameters 2020-05-14 03:23:25 -07:00			`input_file=resources / 'crom.png',`
tests: improve tesseract coverage 2020-01-04 02:35:14 -08:00			`output_pdf=outdir / 'pdf.pdf',`
			`output_text=outdir / 'txt.txt',`
Standardize tesseract.generate_hocr and _pdf parameters 2020-05-14 03:23:25 -07:00			`languages=['eng'],`
tests: improve tesseract coverage 2020-01-04 02:35:14 -08:00			`engine_mode=None,`
			`tessconfig=[],`
			`timeout=180.0,`
			`pagesegmode=None,`
Add new argument --tesseract-thresholding to control tesseract thresholding where available Also add missing test for --tesseract-oem 2021-12-04 16:52:23 -08:00			`thresholding=0,`
tests: improve tesseract coverage 2020-01-04 02:35:14 -08:00			`user_words=None,`
			`user_patterns=None,`
			`)`
			`assert Path(outdir / 'txt.txt').read_text() == '[skipped page]'`
Fix assert that depends on POSIX-y file handling 2020-01-06 02:02:05 -08:00			`if os.name != 'nt': # different semantics`
Remove "skip page" from tesseract interface Breaks tests/test_main.py::test_tesseract_missing_tessdata because conftest.py does not update options.tesseract_env before testing options for some reason, and tesseract.has_textonly_pdf raises an exception instead of returning False as the test assumes. 2020-05-12 04:09:29 -07:00			`assert Path(outdir / 'pdf.pdf').stat().st_size == 0`
tests: improve tesseract coverage 2020-01-04 02:35:14 -08:00

			`def test_timeout(caplog):`
Delinting 2020-05-03 00:51:17 -07:00			`tesseract.page_timedout(5)`
tests: improve tesseract coverage 2020-01-04 02:35:14 -08:00			`assert "took too long" in caplog.text`


			`@pytest.mark.parametrize(`
			`'in_, logged',`
			`[`
			`(b'Tesseract Open Source', ''),`
			`(b'lots of diacritics blah blah', 'diacritics'),`
			`(b'Warning in pixReadMem', ''),`
			`(b'OSD: Weak margin', 'unsure about page orientation'),`
			`(b'Error in pixScanForForeground', ''),`
			`(b'Error in boxClipToRectangle', ''),`
			`(b'an unexpected error', 'an unexpected error'),`
			`(b'a dire warning', 'a dire warning'),`
			`(b'read_params_file something', 'read_params_file'),`
			`(b'an innocent message', 'innocent'),`
			`(b'\x7f\x7f\x80innocent unicode failure', 'innocent'),`
			`],`
			`)`
			`def test_tesseract_log_output(caplog, in_, logged):`
The Great Logging Refactor Remove all instances of logger object being passed as parameters. This was a holdover from ruffus, and complicated a lot of simple things. 2020-03-04 21:24:13 -08:00			`caplog.set_level(logging.INFO)`
tesseract.py: api cleanup 2020-05-06 12:37:44 -07:00			`tesseract.tesseract_log_output(in_)`
tests: improve tesseract coverage 2020-01-04 02:35:14 -08:00			`if logged == '':`
			`assert caplog.text == ''`
			`else:`
			`assert logged in caplog.text`


			`def test_tesseract_log_output_raises(caplog):`
			`with pytest.raises(tesseract.TesseractConfigError):`
tesseract.py: api cleanup 2020-05-06 12:37:44 -07:00			`tesseract.tesseract_log_output(b'parameter not found: moo')`
tests: improve tesseract coverage 2020-01-04 02:35:14 -08:00			`assert 'not found' in caplog.text`
More Tesseract-specific language checks to its plugin 2024-06-01 00:15:50 -07:00

			`def test_blocked_language(resources, no_outpdf):`
			`infile = resources / 'masks.pdf'`
			`for bad_lang in ['osd', 'equ']:`
			`with pytest.raises(BadArgsError):`
			`run_ocrmypdf_api(infile, no_outpdf, '-l', bad_lang)`