Adjust page orientation parsing to deal with a change in Tesseract 3.04.01

This commit is contained in:
James R. Barlow 2016-02-20 01:20:44 -08:00
parent 9b79b4a7c8
commit 71fbda8bf6
2 changed files with 22 additions and 9 deletions

View File

@ -524,9 +524,9 @@ def orient_page(
direction = {
0: '',
90: '',
90: '',
180: '',
270: ''
270: ''
}
log.info(
@ -544,8 +544,8 @@ def orient_page(
reader = pypdf.PdfFileReader(page_pdf)
page = reader.pages[0]
# Rotate opposite of orientation
rotated_page = page.rotateClockwise(orient_conf.angle)
# angle is a clockwise angle, so rotating ccw will correct the error
rotated_page = page.rotateCounterClockwise(orient_conf.angle)
writer.addPage(rotated_page)
with open(output_file, 'wb') as out:
writer.write(out)
@ -788,11 +788,12 @@ def add_text_layer(
page_text = pdf_text.getPage(0)
# The text page always will be oriented up
# The text page always will be oriented up by this stage
# but if lossless_reconstruction, pdf_image may have a rotation applied
# we can't just merge the pages, because a page can only have one /Rotate
# tag, so the differential rotation must be corrected.
# Also, pdf_image may not have its mediabox nailed to (0, 0)
# We have to eliminate the /Rotate tag (because it applies to the whole
# page) and rotate the image layer to match the text page
# Also, pdf_image may not have its mediabox nailed to (0, 0), so may need
# translation
page_image = pdf_image.getPage(0)
rotation = page_image.get('/Rotate', 0)

View File

@ -107,8 +107,20 @@ def get_orientation(input_file, language: list, timeout: float, log):
if len(parts) == 2:
osd[parts[0].strip()] = parts[1].strip()
angle = int(osd.get('Orientation in degrees', 0))
if 'Orientation' in osd:
# Tesseract < 3.04.01
# reports "Orientation in degrees" as a counterclockwise angle.
# We work with clockwise angles, so convert it
assert 'Rotate' not in osd
angle = -angle % 360
else:
# Tesseract == 3.04.01, hopefully also Tesseract > 3.04.01
# reports "Orientation in degrees" as a clockwise angle
assert 'Rotate' in osd
oc = OrientationConfidence(
angle=int(osd.get('Orientation in degrees', 0)),
angle=angle,
confidence=float(osd.get('Orientation confidence', 0)))
return oc