Merge branch 'feature/jbig2thresh' into v15

This commit is contained in:
James R. Barlow 2023-09-21 00:07:05 -07:00
commit 0388c23ae7
No known key found for this signature in database
GPG Key ID: E54A300D567E1260
6 changed files with 30 additions and 7 deletions

View File

@ -38,6 +38,7 @@ __ocrmypdf_arguments()
--jpeg-quality (JPEG quality [0..100])
--png-quality (PNG quality [0..100])
--jbig2-lossy (enable lossy JBIG2 (see docs))
--jbig2-threshold (set JBIG2 threshold (see docs))
--pages (apply OCR to only the specified pages)
--max-image-mpixels (image decompression bomb threshold)
--pdf-renderer (select PDF renderer options)

View File

@ -84,6 +84,7 @@ complete -c ocrmypdf -x -l skip-big -d "skip OCR on pages larger than this many
complete -c ocrmypdf -x -l jpeg-quality -d "JPEG quality [0..100]"
complete -c ocrmypdf -x -l png-quality -d "PNG quality [0..100]"
complete -c ocrmypdf -x -l jbig2-lossy -d "enable lossy JBIG2 (see docs)"
complete -c ocrmypdf -x -l jbig2-threshold -d "JBIG2 compression threshold (see docs)"
complete -c ocrmypdf -x -l max-image-mpixels -d "image decompression bomb threshold"
complete -c ocrmypdf -x -l pages -d "apply OCR to only the specified pages"
complete -c ocrmypdf -x -l tesseract-config -d "set custom tesseract config file"

View File

@ -23,15 +23,17 @@ def available():
return True
def convert_group(*, cwd, infiles, out_prefix):
def convert_group(*, cwd, infiles, out_prefix, threshold):
args = [
'jbig2',
'-b',
out_prefix,
'-s', # symbol mode (lossy)
'--symbol-mode', # symbol mode (lossy)
'-t',
str(threshold), # threshold
# '-r', # refinement mode (lossless symbol mode, currently disabled in
# jbig2)
'-p',
'--pdf',
]
args.extend(infiles)
proc = run(args, cwd=cwd, stdout=PIPE, stderr=PIPE)
@ -40,11 +42,13 @@ def convert_group(*, cwd, infiles, out_prefix):
def convert_group_mp(args):
return convert_group(cwd=args[0], infiles=args[1], out_prefix=args[2])
return convert_group(
cwd=args[0], infiles=args[1], out_prefix=args[2], threshold=args[3]
)
def convert_single(*, cwd, infile, outfile):
args = ['jbig2', '-p', infile]
def convert_single(*, cwd, infile, outfile, threshold):
args = ['jbig2', '--pdf', '-t', str(threshold), infile]
with open(outfile, 'wb') as fstdout:
proc = run(args, cwd=cwd, stdout=fstdout, stderr=PIPE)
proc.check_returncode()
@ -52,4 +56,6 @@ def convert_single(*, cwd, infile, outfile):
def convert_single_mp(args):
return convert_single(cwd=args[0], infile=args[1], outfile=args[2])
return convert_single(
cwd=args[0], infile=args[1], outfile=args[2], threshold=args[3]
)

View File

@ -86,6 +86,16 @@ def add_options(parser):
# Adjust number of pages to consider at once for JBIG2 compression
help=argparse.SUPPRESS,
)
optimizing.add_argument(
'--jbig2-threshold',
type=numeric(float, 0.4, 0.9),
default=0.85,
metavar='T',
help=(
"Adjust JBIG2 symbol code classification threshold "
"(default 0.85), range 0.4 to 0.9."
),
)
@hookimpl

View File

@ -367,6 +367,7 @@ def _produce_jbig2_images(
fspath(root), # =cwd
(img_name(root, xref, ext) for xref, ext in xref_exts), # =infiles
prefix, # =out_prefix
options.jbig2_threshold,
)
def jbig2_single_args(root, groups: dict[int, list[XrefExt]]):
@ -379,6 +380,7 @@ def _produce_jbig2_images(
fspath(root),
img_name(root, xref, ext),
root / f'{prefix}.{n:04d}',
options.jbig2_threshold,
)
if options.jbig2_page_group_size > 1:
@ -737,6 +739,7 @@ def main(infile, outfile, level, jobs=1):
self.png_quality = png_quality
self.jbig2_page_group_size = 0
self.jbig2_lossy = jb2lossy
self.jbig2_threshold = 0.85
self.quiet = True
self.progress_bar = False

View File

@ -94,6 +94,8 @@ def test_jbig2_lossy(lossy, resources, outpdf):
'20',
'--plugin',
'tests/plugins/tesseract_noop.py',
'--jbig2-threshold',
'0.7',
]
if lossy:
args.append('--jbig2-lossy')