mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-12-07 12:31:25 +00:00
Refactor JBIG2 path for non-CCITT monochrome images
This commit is contained in:
parent
6171de41bf
commit
08bf651ef2
@ -39,16 +39,20 @@ JPEG_QUALITY = 75
|
|||||||
PNG_QUALITY = (65, 75)
|
PNG_QUALITY = (65, 75)
|
||||||
|
|
||||||
|
|
||||||
|
def img_name(root, xref, ext):
|
||||||
|
return str(root / '{:08d}{}'.format(xref, ext))
|
||||||
|
|
||||||
|
|
||||||
def png_name(root, xref):
|
def png_name(root, xref):
|
||||||
return str(root / '{:08d}.png'.format(xref))
|
return img_name(root, xref, '.png')
|
||||||
|
|
||||||
|
|
||||||
def jpg_name(root, xref):
|
def jpg_name(root, xref):
|
||||||
return str(root / '{:08d}.jpg'.format(xref))
|
return img_name(root, xref, '.jpg')
|
||||||
|
|
||||||
|
|
||||||
def tif_name(root, xref):
|
def tif_name(root, xref):
|
||||||
return str(root / '{:08d}.tif'.format(xref))
|
return img_name(root, xref, '.tif')
|
||||||
|
|
||||||
|
|
||||||
def extract_image(*, doc, pike, root, log, image, xref, jbig2s,
|
def extract_image(*, doc, pike, root, log, image, xref, jbig2s,
|
||||||
@ -75,11 +79,14 @@ def extract_image(*, doc, pike, root, log, image, xref, jbig2s,
|
|||||||
if pim.bits_per_component == 1 \
|
if pim.bits_per_component == 1 \
|
||||||
and filtdp != '/JBIG2Decode' \
|
and filtdp != '/JBIG2Decode' \
|
||||||
and jbig2enc.available():
|
and jbig2enc.available():
|
||||||
with Path(tif_name(root, xref)).open('wb') as f:
|
try:
|
||||||
result = pim.write_stream(f)
|
imgname = Path(root / '{:08d}'.format(xref))
|
||||||
if not result:
|
with imgname.open('wb') as f:
|
||||||
|
ext = pim.extract(f)
|
||||||
|
imgname.rename(imgname.with_suffix(ext))
|
||||||
|
except pikepdf.UnsupportedImageTypeError:
|
||||||
return False
|
return False
|
||||||
jbig2s.append(xref)
|
jbig2s.append((xref, ext))
|
||||||
elif filtdp[0] == '/DCTDecode' \
|
elif filtdp[0] == '/DCTDecode' \
|
||||||
and options.optimize >= 2:
|
and options.optimize >= 2:
|
||||||
# This is a simple heuristic derived from some training data, that has
|
# This is a simple heuristic derived from some training data, that has
|
||||||
@ -98,9 +105,12 @@ def extract_image(*, doc, pike, root, log, image, xref, jbig2s,
|
|||||||
# iccbytes = icc.read_bytes()
|
# iccbytes = icc.read_bytes()
|
||||||
# with Image.open(stream) as im:
|
# with Image.open(stream) as im:
|
||||||
# im.save(jpg_name(root, xref), icc_profile=iccbytes)
|
# im.save(jpg_name(root, xref), icc_profile=iccbytes)
|
||||||
with Path(jpg_name(root, xref)).open('wb') as f:
|
try:
|
||||||
result = pim.write_stream(f)
|
imgname = Path(root / '{:08d}'.format(xref))
|
||||||
if not result:
|
with imgname.open('wb') as f:
|
||||||
|
ext = pim.extract(f)
|
||||||
|
imgname.rename(imgname.with_suffix(ext))
|
||||||
|
except pikepdf.UnsupportedImageTypeError:
|
||||||
return False
|
return False
|
||||||
jpegs.append(xref)
|
jpegs.append(xref)
|
||||||
elif pim.indexed \
|
elif pim.indexed \
|
||||||
@ -186,12 +196,12 @@ def convert_to_jbig2(pike, jbig2_groups, root, log, options):
|
|||||||
with concurrent.futures.ThreadPoolExecutor(
|
with concurrent.futures.ThreadPoolExecutor(
|
||||||
max_workers=options.jobs) as executor:
|
max_workers=options.jobs) as executor:
|
||||||
futures = []
|
futures = []
|
||||||
for group, xrefs in jbig2_groups.items():
|
for group, xref_exts in jbig2_groups.items():
|
||||||
prefix = 'group{:08d}'.format(group)
|
prefix = 'group{:08d}'.format(group)
|
||||||
future = executor.submit(
|
future = executor.submit(
|
||||||
jbig2enc.convert_group,
|
jbig2enc.convert_group,
|
||||||
cwd=str(root),
|
cwd=str(root),
|
||||||
infiles=(png_name(root, xref) for xref in xrefs),
|
infiles=(img_name(root, xref, ext) for xref, ext in xref_exts),
|
||||||
out_prefix=prefix
|
out_prefix=prefix
|
||||||
)
|
)
|
||||||
futures.append(future)
|
futures.append(future)
|
||||||
@ -199,12 +209,13 @@ def convert_to_jbig2(pike, jbig2_groups, root, log, options):
|
|||||||
proc = future.result()
|
proc = future.result()
|
||||||
log.debug(proc.stderr.decode())
|
log.debug(proc.stderr.decode())
|
||||||
|
|
||||||
for group, xrefs in jbig2_groups.items():
|
for group, xref_exts in jbig2_groups.items():
|
||||||
prefix = 'group{:08d}'.format(group)
|
prefix = 'group{:08d}'.format(group)
|
||||||
jbig2_globals_data = (root / (prefix + '.sym')).read_bytes()
|
jbig2_globals_data = (root / (prefix + '.sym')).read_bytes()
|
||||||
jbig2_globals = pikepdf.Stream(pike, jbig2_globals_data)
|
jbig2_globals = pikepdf.Stream(pike, jbig2_globals_data)
|
||||||
|
|
||||||
for n, xref in enumerate(xrefs):
|
for n, xref_ext in enumerate(xref_exts):
|
||||||
|
xref, ext = xref_ext
|
||||||
jbig2_im_file = root / (prefix + '.{:04d}'.format(n))
|
jbig2_im_file = root / (prefix + '.{:04d}'.format(n))
|
||||||
jbig2_im_data = jbig2_im_file.read_bytes()
|
jbig2_im_data = jbig2_im_file.read_bytes()
|
||||||
im_obj = pike._get_object_id(xref, 0)
|
im_obj = pike._get_object_id(xref, 0)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user