2019-06-22 02:33:04 -07:00
|
|
|
# © 2019 James R. Barlow: github.com/jbarlow83
|
|
|
|
#
|
2020-08-05 00:44:42 -07:00
|
|
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
|
2019-06-22 02:33:04 -07:00
|
|
|
|
2019-10-22 01:49:38 -07:00
|
|
|
import logging
|
2019-09-20 17:17:11 -07:00
|
|
|
from unittest.mock import patch
|
2019-06-22 02:33:04 -07:00
|
|
|
|
2020-06-03 13:24:55 -07:00
|
|
|
import pikepdf
|
2019-06-22 02:33:04 -07:00
|
|
|
import pytest
|
|
|
|
|
2020-08-05 01:35:26 -07:00
|
|
|
from ocrmypdf import _validation as vd
|
2020-05-16 01:50:37 -07:00
|
|
|
from ocrmypdf._plugin_manager import get_plugin_manager
|
2019-06-22 02:33:04 -07:00
|
|
|
from ocrmypdf.api import create_options
|
2020-05-02 03:34:31 -07:00
|
|
|
from ocrmypdf.cli import get_parser
|
2019-12-19 15:29:56 -08:00
|
|
|
from ocrmypdf.exceptions import BadArgsError, MissingDependencyError
|
2019-09-15 01:47:31 -07:00
|
|
|
from ocrmypdf.pdfinfo import PdfInfo
|
2019-06-22 02:33:04 -07:00
|
|
|
|
|
|
|
|
2020-06-08 17:10:27 -07:00
|
|
|
def make_opts_pm(input_file='a.pdf', output_file='b.pdf', language='eng', **kwargs):
    """Create an options object together with the plugin manager used for it.

    Args:
        input_file: Forwarded to ``create_options`` as ``input_file``.
        output_file: Forwarded to ``create_options`` as ``output_file``.
        language: Added to ``kwargs`` under ``'language'`` unless it is None,
            in which case no language keyword is forwarded at all.
        **kwargs: Any other keyword arguments for ``create_options``. The
            optional ``'plugins'`` entry also selects the plugins to load.

    Returns:
        A ``(options, plugin_manager)`` tuple.
    """
    if language is not None:
        kwargs['language'] = language

    arg_parser = get_parser()
    plugin_manager = get_plugin_manager(kwargs.get('plugins', []))
    # Run the plugins' add_options hook on the parser before it is handed to
    # create_options.
    plugin_manager.hook.add_options(parser=arg_parser)  # pylint: disable=no-member

    options = create_options(
        input_file=input_file, output_file=output_file, parser=arg_parser, **kwargs
    )
    return options, plugin_manager
|
2019-06-22 02:33:04 -07:00
|
|
|
|
|
|
|
|
2020-06-08 17:10:27 -07:00
|
|
|
def make_opts(*args, **kwargs):
    """Return only the options half of ``make_opts_pm(*args, **kwargs)``."""
    options_and_pm = make_opts_pm(*args, **kwargs)
    return options_and_pm[0]
|
2019-06-22 02:33:04 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_hocr_notlatin_warning(caplog):
    """The hocr renderer combined with 'chi_sim' must log a renderer warning."""
    # Skip the check for whether the language is really installed: the set of
    # languages is passed in directly so we can pretend that a non-Latin
    # language is available.
    pretend_installed = {'chi_sim'}
    opts, plugin_manager = make_opts_pm(
        language='chi_sim', pdf_renderer='hocr', output_type='pdfa'
    )
    vd._check_options(opts, plugin_manager, pretend_installed)

    assert 'PDF renderer is known to cause' in caplog.text
|
|
|
|
|
|
|
|
|
|
|
|
def test_old_ghostscript(caplog):
    """Run option checking against several patched Ghostscript versions.

    * 9.19 with 'chi_sim' and pdfa output: a 'does not work correctly'
      message is logged.
    * 9.18 with pdfa-3 output: MissingDependencyError is raised.
    * 9.24 with default options: MissingDependencyError is raised.

    ``has_textonly_pdf`` is patched to return True in every case.
    """
    gs_version = 'ocrmypdf._exec.ghostscript.version'
    textonly_pdf = 'ocrmypdf._exec.tesseract.has_textonly_pdf'

    with patch(gs_version, return_value='9.19'):
        with patch(textonly_pdf, return_value=True):
            opts, plugin_manager = make_opts_pm(
                language='chi_sim', output_type='pdfa'
            )
            vd._check_options(opts, plugin_manager, {'chi_sim'})
            assert 'does not work correctly' in caplog.text

    with patch(gs_version, return_value='9.18'):
        with patch(textonly_pdf, return_value=True):
            with pytest.raises(MissingDependencyError):
                vd._check_options(*make_opts_pm(output_type='pdfa-3'), set())

    with patch(gs_version, return_value='9.24'):
        with patch(textonly_pdf, return_value=True):
            with pytest.raises(MissingDependencyError):
                vd._check_options(*make_opts_pm(), set())
|
2019-06-22 02:33:04 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_old_tesseract_error():
    """Option checking must fail when ``has_textonly_pdf`` reports False.

    With ``ocrmypdf._exec.tesseract.has_textonly_pdf`` patched to return
    False, ``vd._check_options`` is expected to raise MissingDependencyError
    for the 'sandwich' renderer.
    """
    with patch('ocrmypdf._exec.tesseract.has_textonly_pdf', return_value=False):
        # Build the fixtures outside pytest.raises so that only the call under
        # test can satisfy the expected exception; an error raised during
        # setup now fails the test instead of passing it.
        opts = make_opts(pdf_renderer='sandwich', language='eng')
        plugin_manager = get_plugin_manager(opts.plugins)
        with pytest.raises(MissingDependencyError):
            vd._check_options(opts, plugin_manager, {'eng'})
|
2019-06-22 02:33:04 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_lossless_redo():
    """Combining ``redo_ocr`` with ``deskew`` must raise BadArgsError."""
    with pytest.raises(BadArgsError):
        opts = make_opts(redo_ocr=True, deskew=True)
        vd.check_options_output(opts)
|
|
|
|
|
|
|
|
|
|
|
|
def test_mutex_options():
    """Each listed option combination must be rejected with BadArgsError."""
    conflicting_combinations = [
        {'force_ocr': True, 'skip_text': True},
        {'redo_ocr': True, 'skip_text': True},
        {'redo_ocr': True, 'force_ocr': True},
        {'pages': '1-3', 'sidecar': 'file.txt'},
    ]
    for combination in conflicting_combinations:
        with pytest.raises(BadArgsError):
            vd.check_options_ocr_behavior(make_opts(**combination))
|
|
|
|
|
|
|
|
|
|
|
|
def test_optimizing(caplog):
    """Quality settings given with ``optimize=0`` must log an 'ignored' note."""
    opts = make_opts(optimize=0, jbig2_lossy=True, png_quality=18, jpeg_quality=10)
    vd.check_options_optimizing(opts)

    assert 'will be ignored because' in caplog.text
|
|
|
|
|
|
|
|
|
|
|
|
def test_user_words(caplog):
    """Check when the '4.0 ignores --user-words' message is logged.

    With ``has_user_words`` patched to False and ``user_words`` set, the
    message must appear. With it patched to True and ``user_patterns`` set,
    the message must not appear.
    """
    has_user_words = 'ocrmypdf._exec.tesseract.has_user_words'
    ignored_message = '4.0 ignores --user-words'

    with patch(has_user_words, return_value=False):
        opts = make_opts(user_words='foo')
        vd._check_options(opts, get_plugin_manager(opts.plugins), set())
        assert ignored_message in caplog.text

    # Start the second scenario with an empty log.
    caplog.clear()

    with patch(has_user_words, return_value=True):
        opts = make_opts(user_patterns='foo')
        vd._check_options(opts, get_plugin_manager(opts.plugins), set())
        assert ignored_message not in caplog.text
|
2019-06-22 02:33:04 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_pillow_options():
    """``check_options_pillow`` must accept ``max_image_mpixels=0`` without error."""
    opts = make_opts(max_image_mpixels=0)
    vd.check_options_pillow(opts)
|
|
|
|
|
|
|
|
|
|
|
|
def test_output_tty():
    """Output file '-' must be rejected while ``sys.stdout.isatty`` is True."""
    stdout_is_tty = patch('sys.stdout.isatty', return_value=True)
    with stdout_is_tty, pytest.raises(BadArgsError):
        vd.check_requested_output_file(make_opts(output_file='-'))
|
|
|
|
|
|
|
|
|
|
|
|
def test_report_file_size(tmp_path, caplog):
    """Check the messages logged by ``report_output_file_size``.

    Scenarios, in order:

    1. The same new PDF saved as input and output: nothing is logged.
    2. Output carrying more dummy data than the input, jbig2enc and pngquant
       both patched as available: the log contains 'No reason'.
    3. Same files, jbig2enc patched as unavailable: the log contains
       'optional dependency'.
    4. Same files, options built with ``optimize=0``: the log contains
       'disabled'.
    """
    source_path = tmp_path / 'a.pdf'
    result_path = tmp_path / 'b.pdf'

    blank = pikepdf.new()
    for target in (source_path, result_path):
        blank.save(target)
    opts = make_opts(output_type='pdf')
    vd.report_output_file_size(opts, source_path, result_path)
    assert caplog.text == ''
    caplog.clear()

    # Re-save both files with dummy payloads; the output receives one extra
    # entry that is twice the size of the input's.
    padding = b'Dummy' * 5000
    blank.Root.Dummy = padding
    blank.save(source_path)
    blank.Root.Dummy2 = padding * 2
    blank.save(result_path)

    jbig2enc_available = 'ocrmypdf._validation.jbig2enc.available'
    pngquant_available = 'ocrmypdf._validation.pngquant.available'

    with patch(jbig2enc_available, return_value=True):
        with patch(pngquant_available, return_value=True):
            vd.report_output_file_size(opts, source_path, result_path)
            assert 'No reason' in caplog.text
    caplog.clear()

    with patch(jbig2enc_available, return_value=False):
        with patch(pngquant_available, return_value=True):
            vd.report_output_file_size(opts, source_path, result_path)
            assert 'optional dependency' in caplog.text
    caplog.clear()

    opts = make_opts(source_path, result_path, optimize=0, output_type='pdf')
    vd.report_output_file_size(opts, source_path, result_path)
    assert 'disabled' in caplog.text
    caplog.clear()
|
2019-09-15 01:47:31 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_false_action_store_true():
    """``keep_temporary_files`` must come back with the truthiness passed in."""
    for requested in (True, False):
        opts = make_opts(keep_temporary_files=requested)
        assert bool(opts.keep_temporary_files) is requested
|
2019-09-15 01:47:31 -07:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize('progress_bar', [True, False])
def test_no_progress_bar(progress_bar, resources):
    """tqdm's ``disable`` keyword must differ from the ``progress_bar`` option.

    ``ocrmypdf.builtin_plugins.concurrency.tqdm`` is patched so that the
    keyword arguments of its most recent call can be inspected after a
    ``PdfInfo`` has been built from 'trivial.pdf'.
    """
    sample_pdf = resources / 'trivial.pdf'
    opts = make_opts(progress_bar=progress_bar, input_file=sample_pdf)
    plugin_manager = get_plugin_manager(opts.plugins)

    tqdm_target = 'ocrmypdf.builtin_plugins.concurrency.tqdm'
    with patch(tqdm_target, autospec=True) as tqdmpatch:
        vd._check_options(opts, plugin_manager, set())
        pdfinfo = PdfInfo(opts.input_file, progbar=opts.progress_bar)
        assert pdfinfo is not None
        assert tqdmpatch.called

        # call_args is an (args, kwargs) pair; only the keywords matter here.
        call_keywords = tqdmpatch.call_args[1]
        assert call_keywords['disable'] != progress_bar
|
2019-10-22 01:49:38 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_language_warning(caplog):
    """Check the 'assuming --language' message when no language is given.

    Options are built with ``language=None`` and ``locale.getlocale`` (as
    seen from ``ocrmypdf._validation``) is patched:

    * locale 'en_US': languages become ``{'eng'}`` and the message is not
      logged.
    * locale 'fr_FR': languages become ``{'eng'}`` and the message is logged.

    In both cases ``getlocale`` must have been called exactly once.
    """
    getlocale_target = 'ocrmypdf._validation.locale.getlocale'
    # Capture from DEBUG upward so the message is recorded whatever its level.
    caplog.set_level(logging.DEBUG)

    opts = make_opts(language=None)
    with patch(getlocale_target, return_value=('en_US', 'UTF-8')) as mock:
        vd.check_options_languages(opts, {'eng'})
        assert opts.languages == {'eng'}
        # The previous check, `'' in caplog.text`, was always true. Assert the
        # absence of the message that the fr_FR case below expects to see.
        assert 'assuming --language' not in caplog.text
        mock.assert_called_once()

    opts = make_opts(language=None)
    with patch(getlocale_target, return_value=('fr_FR', 'UTF-8')) as mock:
        vd.check_options_languages(opts, {'eng'})
        assert opts.languages == {'eng'}
        assert 'assuming --language' in caplog.text
        mock.assert_called_once()
|
2020-04-10 01:27:46 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_version_comparison():
    """Exercise the version comparison in ``check_external_program``.

    The first four reported versions must be accepted without an exception;
    the last one (1.0 when 2.0 is needed) must raise MissingDependencyError.
    """
    accepted_cases = [
        dict(
            program="dummy_basic",
            package="dummy",
            version_checker=lambda: '9.0',
            need_version='8.0.2',
        ),
        dict(
            program="dummy_doubledigit",
            package="dummy",
            version_checker=lambda: '10.0',
            need_version='8.0.2',
        ),
        dict(
            program="tesseract",
            package="tesseract",
            version_checker=lambda: '4.0.0-beta.1',
            need_version='4.0.0',
        ),
        dict(
            program="tesseract",
            package="tesseract",
            version_checker=lambda: 'v5.0.0-alpha.20200201',
            need_version='4.0.0',
        ),
    ]
    for case in accepted_cases:
        vd.check_external_program(**case)

    too_old = dict(
        program="dummy_fails",
        package="dummy",
        version_checker=lambda: '1.0',
        need_version='2.0',
    )
    with pytest.raises(MissingDependencyError):
        vd.check_external_program(**too_old)
|
2020-05-12 01:05:57 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_optional_program_recommended(caplog):
    """A missing program flagged ``recommended=True`` must log a WARNING.

    The version checker raises FileNotFoundError, and the test then looks
    for a WARNING-level record whose message contains "recommended".
    """
    caplog.clear()

    def program_not_found():
        raise FileNotFoundError('jbig2')

    with caplog.at_level(logging.WARNING):
        vd.check_external_program(
            program="jbig2",
            package="jbig2enc",
            version_checker=program_not_found,
            need_version='42',
            required_for='this test case',
            recommended=True,
        )
        recommendations = [
            message
            for _logger_name, level, message in caplog.record_tuples
            if level == logging.WARNING and "recommended" in message
        ]
        assert recommendations
|
2020-06-09 15:12:40 -07:00
|
|
|
|
|
|
|
|
2020-04-30 04:11:38 -07:00
|
|
|
def test_pagesegmode_warning(caplog):
    """``tesseract_pagesegmode='0'`` must log a message containing 'disable OCR'."""
    opts = make_opts(tesseract_pagesegmode='0')
    vd._check_options(opts, get_plugin_manager(opts.plugins), set())

    assert 'disable OCR' in caplog.text
|
2020-06-12 14:33:02 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_two_languages():
    """A 'lang1+lang2' language string must pass when both are in the set.

    ``has_textonly_pdf`` is patched to return True, and the test also
    verifies that the patched function was called.
    """
    known_languages = {'fakelang1', 'fakelang2'}
    with patch('ocrmypdf._exec.tesseract.has_textonly_pdf', return_value=True) as mock:
        opts, plugin_manager = make_opts_pm(language='fakelang1+fakelang2')
        vd._check_options(opts, plugin_manager, known_languages)
        mock.assert_called()
|