mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-11-02 10:50:29 +00:00
115 lines
3.6 KiB
Python
115 lines
3.6 KiB
Python
#!/usr/bin/env python3
|
|
#
|
|
|
|
from subprocess import Popen, PIPE
|
|
import PyPDF2 as pypdf
|
|
from decimal import Decimal, getcontext
|
|
|
|
|
|
FRIENDLY_COLORSPACE = {
|
|
'/DeviceGray': 'gray',
|
|
'/CalGray': 'gray',
|
|
'/DeviceRGB': 'rgb',
|
|
'/CalRGB': 'rgb',
|
|
'/DeviceCMYK': 'cmyk',
|
|
'/Lab': 'lab',
|
|
'/ICCBased': 'icc',
|
|
'/Indexed': 'index',
|
|
'/Separation': 'sep',
|
|
'/DeviceN': 'devn',
|
|
'/Pattern': '-'
|
|
}
|
|
|
|
FRIENDLY_ENCODING = {
|
|
'/CCITTFaxDecode': 'ccitt',
|
|
'/DCTDecode': 'jpeg',
|
|
'/JPXDecode': 'jpx',
|
|
'/JBIG2Decode': 'jbig2',
|
|
}
|
|
|
|
FRIENDLY_COMP = {
|
|
'gray': 1,
|
|
'rgb': 3,
|
|
'cmyk': 4,
|
|
'lab': 3,
|
|
}
|
|
|
|
|
|
def _pdf_get_pageinfo(infile, page: int):
|
|
pageinfo = {}
|
|
pageinfo['pageno'] = page
|
|
pageinfo['images'] = []
|
|
|
|
p_pdftotext = Popen(['pdftotext', '-f', str(page), '-l', str(page),
|
|
'-raw', '-nopgbrk', infile, '-'],
|
|
close_fds=True, stdout=PIPE, stderr=PIPE,
|
|
universal_newlines=True)
|
|
text, _ = p_pdftotext.communicate()
|
|
if len(text.strip()) > 0:
|
|
pageinfo['has_text'] = True
|
|
else:
|
|
pageinfo['has_text'] = False
|
|
|
|
pdf = pypdf.PdfFileReader(infile)
|
|
page = pdf.pages[page - 1]
|
|
width_pt = page['/MediaBox'][2] - page['/MediaBox'][0]
|
|
height_pt = page['/MediaBox'][3] - page['/MediaBox'][1]
|
|
pageinfo['width_inches'] = width_pt / Decimal(72.0)
|
|
pageinfo['height_inches'] = height_pt / Decimal(72.0)
|
|
|
|
if '/XObject' not in page['/Resources']:
|
|
# Missing /XObject means no images or possibly corrupt PDF
|
|
return pageinfo
|
|
|
|
for xobj in page['/Resources']['/XObject']:
|
|
# PyPDF2 returns the keys as an iterator
|
|
pdfimage = page['/Resources']['/XObject'][xobj]
|
|
if pdfimage['/Subtype'] != '/Image':
|
|
continue
|
|
if '/ImageMask' in pdfimage:
|
|
if pdfimage['/ImageMask']:
|
|
continue
|
|
image = {}
|
|
image['width'] = pdfimage['/Width']
|
|
image['height'] = pdfimage['/Height']
|
|
image['bpc'] = pdfimage['/BitsPerComponent']
|
|
if '/Filter' in pdfimage:
|
|
filter_ = pdfimage['/Filter']
|
|
if isinstance(filter_, pypdf.generic.ArrayObject):
|
|
filter_ = filter_[0]
|
|
image['enc'] = FRIENDLY_ENCODING.get(filter_, 'image')
|
|
else:
|
|
image['enc'] = 'image'
|
|
if '/ColorSpace' in pdfimage:
|
|
cs = pdfimage['/ColorSpace']
|
|
if isinstance(cs, pypdf.generic.ArrayObject):
|
|
cs = cs[0]
|
|
image['color'] = FRIENDLY_COLORSPACE.get(cs, '-')
|
|
else:
|
|
image['color'] = 'jpx' if image['enc'] == 'jpx' else '?'
|
|
|
|
image['comp'] = FRIENDLY_COMP.get(image['color'], '?')
|
|
image['dpi_w'] = image['width'] / pageinfo['width_inches']
|
|
image['dpi_h'] = image['height'] / pageinfo['height_inches']
|
|
image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** Decimal(0.5)
|
|
pageinfo['images'].append(image)
|
|
|
|
if pageinfo['images']:
|
|
xres = max(image['dpi_w'] for image in pageinfo['images'])
|
|
yres = max(image['dpi_h'] for image in pageinfo['images'])
|
|
pageinfo['xres'], pageinfo['yres'] = xres, yres
|
|
pageinfo['width_pixels'] = \
|
|
int(round(xres * pageinfo['width_inches']))
|
|
pageinfo['height_pixels'] = \
|
|
int(round(yres * pageinfo['height_inches']))
|
|
rx, ry = pageinfo['xres'], pageinfo['yres']
|
|
pageinfo['xres_render'], pageinfo['yres_render'] = rx, ry
|
|
|
|
return pageinfo
|
|
|
|
|
|
def pdf_get_all_pageinfo(infile):
|
|
pdf = pypdf.PdfFileReader(infile)
|
|
getcontext().prec = 6
|
|
return [_pdf_get_pageinfo(infile, n) for n in range(pdf.numPages)]
|