mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-06-26 23:49:59 +00:00
Merge branch 'feature/jbig2thresh' into v15
This commit is contained in:
commit
0388c23ae7
@ -38,6 +38,7 @@ __ocrmypdf_arguments()
|
||||
--jpeg-quality (JPEG quality [0..100])
|
||||
--png-quality (PNG quality [0..100])
|
||||
--jbig2-lossy (enable lossy JBIG2 (see docs))
|
||||
--jbig2-threshold (set JBIG2 threshold (see docs))
|
||||
--pages (apply OCR to only the specified pages)
|
||||
--max-image-mpixels (image decompression bomb threshold)
|
||||
--pdf-renderer (select PDF renderer options)
|
||||
|
@ -84,6 +84,7 @@ complete -c ocrmypdf -x -l skip-big -d "skip OCR on pages larger than this many
|
||||
complete -c ocrmypdf -x -l jpeg-quality -d "JPEG quality [0..100]"
|
||||
complete -c ocrmypdf -x -l png-quality -d "PNG quality [0..100]"
|
||||
complete -c ocrmypdf -x -l jbig2-lossy -d "enable lossy JBIG2 (see docs)"
|
||||
complete -c ocrmypdf -x -l jbig2-threshold -d "JBIG2 compression threshold (see docs)"
|
||||
complete -c ocrmypdf -x -l max-image-mpixels -d "image decompression bomb threshold"
|
||||
complete -c ocrmypdf -x -l pages -d "apply OCR to only the specified pages"
|
||||
complete -c ocrmypdf -x -l tesseract-config -d "set custom tesseract config file"
|
||||
|
@ -23,15 +23,17 @@ def available():
|
||||
return True
|
||||
|
||||
|
||||
def convert_group(*, cwd, infiles, out_prefix):
|
||||
def convert_group(*, cwd, infiles, out_prefix, threshold):
|
||||
args = [
|
||||
'jbig2',
|
||||
'-b',
|
||||
out_prefix,
|
||||
'-s', # symbol mode (lossy)
|
||||
'--symbol-mode', # symbol mode (lossy)
|
||||
'-t',
|
||||
str(threshold), # threshold
|
||||
# '-r', # refinement mode (lossless symbol mode, currently disabled in
|
||||
# jbig2)
|
||||
'-p',
|
||||
'--pdf',
|
||||
]
|
||||
args.extend(infiles)
|
||||
proc = run(args, cwd=cwd, stdout=PIPE, stderr=PIPE)
|
||||
@ -40,11 +42,13 @@ def convert_group(*, cwd, infiles, out_prefix):
|
||||
|
||||
|
||||
def convert_group_mp(args):
|
||||
return convert_group(cwd=args[0], infiles=args[1], out_prefix=args[2])
|
||||
return convert_group(
|
||||
cwd=args[0], infiles=args[1], out_prefix=args[2], threshold=args[3]
|
||||
)
|
||||
|
||||
|
||||
def convert_single(*, cwd, infile, outfile):
|
||||
args = ['jbig2', '-p', infile]
|
||||
def convert_single(*, cwd, infile, outfile, threshold):
|
||||
args = ['jbig2', '--pdf', '-t', str(threshold), infile]
|
||||
with open(outfile, 'wb') as fstdout:
|
||||
proc = run(args, cwd=cwd, stdout=fstdout, stderr=PIPE)
|
||||
proc.check_returncode()
|
||||
@ -52,4 +56,6 @@ def convert_single(*, cwd, infile, outfile):
|
||||
|
||||
|
||||
def convert_single_mp(args):
|
||||
return convert_single(cwd=args[0], infile=args[1], outfile=args[2])
|
||||
return convert_single(
|
||||
cwd=args[0], infile=args[1], outfile=args[2], threshold=args[3]
|
||||
)
|
||||
|
@ -86,6 +86,16 @@ def add_options(parser):
|
||||
# Adjust number of pages to consider at once for JBIG2 compression
|
||||
help=argparse.SUPPRESS,
|
||||
)
|
||||
optimizing.add_argument(
|
||||
'--jbig2-threshold',
|
||||
type=numeric(float, 0.4, 0.9),
|
||||
default=0.85,
|
||||
metavar='T',
|
||||
help=(
|
||||
"Adjust JBIG2 symbol code classification threshold "
|
||||
"(default 0.85), range 0.4 to 0.9."
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@hookimpl
|
||||
|
@ -367,6 +367,7 @@ def _produce_jbig2_images(
|
||||
fspath(root), # =cwd
|
||||
(img_name(root, xref, ext) for xref, ext in xref_exts), # =infiles
|
||||
prefix, # =out_prefix
|
||||
options.jbig2_threshold,
|
||||
)
|
||||
|
||||
def jbig2_single_args(root, groups: dict[int, list[XrefExt]]):
|
||||
@ -379,6 +380,7 @@ def _produce_jbig2_images(
|
||||
fspath(root),
|
||||
img_name(root, xref, ext),
|
||||
root / f'{prefix}.{n:04d}',
|
||||
options.jbig2_threshold,
|
||||
)
|
||||
|
||||
if options.jbig2_page_group_size > 1:
|
||||
@ -737,6 +739,7 @@ def main(infile, outfile, level, jobs=1):
|
||||
self.png_quality = png_quality
|
||||
self.jbig2_page_group_size = 0
|
||||
self.jbig2_lossy = jb2lossy
|
||||
self.jbig2_threshold = 0.85
|
||||
self.quiet = True
|
||||
self.progress_bar = False
|
||||
|
||||
|
@ -94,6 +94,8 @@ def test_jbig2_lossy(lossy, resources, outpdf):
|
||||
'20',
|
||||
'--plugin',
|
||||
'tests/plugins/tesseract_noop.py',
|
||||
'--jbig2-threshold',
|
||||
'0.7',
|
||||
]
|
||||
if lossy:
|
||||
args.append('--jbig2-lossy')
|
||||
|
Loading…
x
Reference in New Issue
Block a user