From 71fbda8bf68fffa6c1d6c4cb7bb6f86f20a1ab9c Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sat, 20 Feb 2016 01:20:44 -0800 Subject: [PATCH] Adjust page orientation parsing to deal with change in Tess 3.04.01 --- ocrmypdf/main.py | 17 +++++++++-------- ocrmypdf/tesseract.py | 14 +++++++++++++- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/ocrmypdf/main.py b/ocrmypdf/main.py index 2dd53ccf..a1121261 100755 --- a/ocrmypdf/main.py +++ b/ocrmypdf/main.py @@ -524,9 +524,9 @@ def orient_page( direction = { 0: '⇧', - 90: '⇦', + 90: '⇨', 180: '⇩', - 270: '⇨' + 270: '⇦' } log.info( @@ -544,8 +544,8 @@ def orient_page( reader = pypdf.PdfFileReader(page_pdf) page = reader.pages[0] - # Rotate opposite of orientation - rotated_page = page.rotateClockwise(orient_conf.angle) + # angle is a clockwise angle, so rotating ccw will correct the error + rotated_page = page.rotateCounterClockwise(orient_conf.angle) writer.addPage(rotated_page) with open(output_file, 'wb') as out: writer.write(out) @@ -788,11 +788,12 @@ def add_text_layer( page_text = pdf_text.getPage(0) - # The text page always will be oriented up + # The text page always will be oriented up by this stage # but if lossless_reconstruction, pdf_image may have a rotation applied - # we can't just merge the pages, because a page can only have one /Rotate - # tag, so the differential rotation must be corrected. - # Also, pdf_image may not have its mediabox nailed to (0, 0) + # We have to eliminate the /Rotate tag (because it applies to the whole + # page) and rotate the image layer to match the text page + # Also, pdf_image may not have its mediabox nailed to (0, 0), so may need + # translation page_image = pdf_image.getPage(0) rotation = page_image.get('/Rotate', 0) diff --git a/ocrmypdf/tesseract.py b/ocrmypdf/tesseract.py index 0a205621..ab1a417b 100644 --- a/ocrmypdf/tesseract.py +++ b/ocrmypdf/tesseract.py @@ -107,8 +107,20 @@ def get_orientation(input_file, language: list, timeout: float, log): if len(parts) == 2: osd[parts[0].strip()] = parts[1].strip() + angle = int(osd.get('Orientation in degrees', 0)) + if 'Orientation' in osd: + # Tesseract < 3.04.01 + # reports "Orientation in degrees" as a counterclockwise angle + # We keep it clockwise + assert 'Rotate' not in osd + angle = -angle % 360 + else: + # Tesseract == 3.04.01, hopefully also Tesseract > 3.04.01 + # reports "Orientation in degrees" as a clockwise angle + assert 'Rotate' in osd + oc = OrientationConfidence( - angle=int(osd.get('Orientation in degrees', 0)), + angle=angle, confidence=float(osd.get('Orientation confidence', 0))) return oc