From 4b51b521e24f8b42df6c188e5caea1b8c44fe147 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Sun, 7 Feb 2016 03:27:33 -0800
Subject: [PATCH] Implement autorotate (provided lossless reconstruction is
 disabled)

Works for a single page file, probably

Arguably rotation is not quite lossless, and autorotation and lossless
reconstruction could be made mutually exclusive anyway, so this may be all
that is needed. Some debugging changes (lossless=False, text debugging=True)
were deliberately not checked in.

PyPDF seems to get merging wrong when one of the pages is rotated.
---
 ocrmypdf/main.py |  78 ++++++++++--
 pipeline.svg     | 309 ++++++++++++++++++++++++++---------------------
 2 files changed, 240 insertions(+), 147 deletions(-)

diff --git a/ocrmypdf/main.py b/ocrmypdf/main.py
index fbd13769..b316257d 100755
--- a/ocrmypdf/main.py
+++ b/ocrmypdf/main.py
@@ -440,7 +440,69 @@ def split_pages(
 
 @transform(
     input=split_pages,
-    filter=suffix('.ocr.page.pdf'),
+    filter=suffix('.page.pdf'),
+    output='.preview.png',
+    output_dir=work_folder,
+    extras=[_log, _pdfinfo, _pdfinfo_lock])
+def rasterize_preview(
+        input_file,
+        output_file,
+        log,
+        pdfinfo,
+        pdfinfo_lock):
+    """Render the split page to a grayscale PNG used for orientation checks."""
+    resolution = 200  # same value is passed for both axes
+    ghostscript.rasterize_pdf(
+        input_file=input_file, output_file=output_file,
+        xres=resolution, yres=resolution,
+        raster_device='pnggray',
+        log=log)
+
+
+@collate(
+    input=[split_pages, rasterize_preview],
+    filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.png)"),
+    output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
+    extras=[_log, _pdfinfo, _pdfinfo_lock])
+def orient_page(
+        infiles,
+        output_file,
+        log,
+        pdfinfo,
+        pdfinfo_lock):
+    """Write the page to output_file, rotated by Tesseract's detected angle."""
+    page_pdf = next(ii for ii in infiles if ii.endswith('.page.pdf'))
+    preview = next(ii for ii in infiles if ii.endswith('.preview.png'))
+
+    orient_conf = tesseract.get_orientation(
+        preview,
+        language=options.language,
+        timeout=options.tesseract_timeout,
+        log=log)
+    log.debug('Orientation: {}'.format(orient_conf))
+
+    if orient_conf.angle == 0:
+        re_symlink(page_pdf, output_file)
+    else:
+        if orient_conf.confidence < 15:
+            log.warning(
+                'Low orientation confidence {:.1f}'.format(
+                    orient_conf.confidence))
+
+        writer = pypdf.PdfFileWriter()
+        reader = pypdf.PdfFileReader(page_pdf)
+        page = reader.pages[0]  # only the first page is carried over
+        # NOTE(review): assumes rotating clockwise by the reported angle undoes
+        # the detected orientation - confirm against tesseract.get_orientation
+        rotated_page = page.rotateClockwise(orient_conf.angle)
+        writer.addPage(rotated_page)
+        with open(output_file, 'wb') as out:
+            writer.write(out)
+
+
+@transform(
+    input=orient_page,
+    filter=suffix('.ocr.oriented.pdf'),
     output='.page.png',
     output_dir=work_folder,
     extras=[_log, _pdfinfo, _pdfinfo_lock])
@@ -571,8 +633,8 @@ def select_image_for_pdf(
 
 @active_if(options.pdf_renderer == 'hocr')
 @collate(
-    input=[select_image_for_pdf, split_pages],
-    filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.page\.pdf)"),
+    input=[select_image_for_pdf, orient_page],
+    filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
     output=os.path.join(work_folder, r'\1.image-layer.pdf'),
     extras=[_log, _pdfinfo, _pdfinfo_lock])
 def select_image_layer(
@@ -582,7 +644,7 @@ def select_image_layer(
         pdfinfo,
         pdfinfo_lock):
 
-    page_pdf = next(ii for ii in infiles if ii.endswith('.page.pdf'))
+    page_pdf = next(ii for ii in infiles if ii.endswith('.ocr.oriented.pdf'))
     image = next(ii for ii in infiles if ii.endswith('.image'))
 
     if lossless_reconstruction:
@@ -679,8 +741,8 @@ def add_text_layer(
 
 @active_if(options.pdf_renderer == 'tesseract')
 @collate(
-    input=[preprocess_clean, split_pages],
-    filter=regex(r".*/(\d{6})(?:\.pp-clean\.png|\.page\.pdf)"),
+    input=[preprocess_clean, orient_page],
+    filter=regex(r".*/(\d{6})(?:\.pp-clean\.png|\.ocr\.oriented\.pdf)"),
     output=os.path.join(work_folder, r'\1.rendered.pdf'),
     extras=[_log, _pdfinfo, _pdfinfo_lock])
 def tesseract_ocr_and_render_pdf(
@@ -754,8 +816,8 @@ def generate_postscript_stub(
 
 
 @transform(
-    input=split_pages,
-    filter=suffix('.skip.page.pdf'),
+    input=orient_page,
+    filter=suffix('.skip.oriented.pdf'),
     output='.done.pdf',
     output_dir=work_folder,
     extras=[_log])
diff --git a/pipeline.svg b/pipeline.svg
index 3c639f86..09d2acc4 100644
--- a/pipeline.svg
+++ b/pipeline.svg
@@ -4,261 +4,292 @@
 <!-- Generated by graphviz version 2.38.0 (20140413.2041)
  -->
 <!-- Title: Pipeline: Pages: 1 -->
-<svg width="1132pt" height="708pt"
- viewBox="0.00 0.00 1132.00 708.08" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 704.083)">
+<svg width="1132pt" height="824pt"
+ viewBox="0.00 0.00 1132.00 824.08" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 820.083)">
 <title>Pipeline:</title>
-<polygon fill="white" stroke="none" points="-4,4 -4,-704.083 1128,-704.083 1128,4 -4,4"/>
+<polygon fill="white" stroke="none" points="-4,4 -4,-820.083 1128,-820.083 1128,4 -4,4"/>
 <g id="clust1" class="cluster"><title>clustertasks</title>
-<polygon fill="none" stroke="black" points="8,-8 8,-692.083 1116,-692.083 1116,-8 8,-8"/>
-<text text-anchor="middle" x="562" y="-664.083" font-family="Times,serif" font-size="30.00" fill="#ff3232">Pipeline:</text>
+<polygon fill="none" stroke="black" points="8,-8 8,-808.083 1116,-808.083 1116,-8 8,-8"/>
+<text text-anchor="middle" x="562" y="-780.083" font-family="Times,serif" font-size="30.00" fill="#ff3232">Pipeline:</text>
 </g>
 <!-- t0 -->
 <g id="node1" class="node"><title>t0</title>
-<polygon fill="#efa03b" stroke="black" points="936.535,-646.083 713.465,-646.083 709.465,-642.083 709.465,-610.083 932.535,-610.083 936.535,-614.083 936.535,-646.083"/>
-<polyline fill="none" stroke="black" points="932.535,-642.083 709.465,-642.083 "/>
-<polyline fill="none" stroke="black" points="932.535,-642.083 932.535,-610.083 "/>
-<polyline fill="none" stroke="black" points="932.535,-642.083 936.535,-646.083 "/>
-<text text-anchor="middle" x="823" y="-622.083" font-family="Times,serif" font-size="20.00">repair_pdf</text>
+<polygon fill="#efa03b" stroke="black" points="936.535,-762.083 713.465,-762.083 709.465,-758.083 709.465,-726.083 932.535,-726.083 936.535,-730.083 936.535,-762.083"/>
+<polyline fill="none" stroke="black" points="932.535,-758.083 709.465,-758.083 "/>
+<polyline fill="none" stroke="black" points="932.535,-758.083 932.535,-726.083 "/>
+<polyline fill="none" stroke="black" points="932.535,-758.083 936.535,-762.083 "/>
+<text text-anchor="middle" x="823" y="-738.083" font-family="Times,serif" font-size="20.00">repair_pdf</text>
 </g>
 <!-- t1 -->
 <g id="node2" class="node"><title>t1</title>
-<polygon fill="#efa03b" stroke="black" points="914.112,-567.155 710,-584.057 505.888,-567.155 506.078,-539.806 913.922,-539.806 914.112,-567.155"/>
-<polygon fill="none" stroke="black" points="918.134,-570.834 710,-588.069 501.866,-570.834 502.11,-535.808 917.89,-535.808 918.134,-570.834"/>
-<text text-anchor="middle" x="710" y="-553.596" font-family="Times,serif" font-size="20.00">split_pages</text>
+<polygon fill="#efa03b" stroke="black" points="914.112,-683.155 710,-700.057 505.888,-683.155 506.078,-655.806 913.922,-655.806 914.112,-683.155"/>
+<polygon fill="none" stroke="black" points="918.134,-686.834 710,-704.069 501.866,-686.834 502.11,-651.808 917.89,-651.808 918.134,-686.834"/>
+<text text-anchor="middle" x="710" y="-669.596" font-family="Times,serif" font-size="20.00">split_pages</text>
 </g>
 <!-- t0&#45;&gt;t1 -->
 <g id="edge1" class="edge"><title>t0&#45;&gt;t1</title>
-<path fill="none" stroke="#0044a0" d="M793.9,-609.961C783.582,-603.89 771.656,-596.873 760.092,-590.069"/>
-<polygon fill="#0044a0" stroke="#0044a0" points="761.747,-586.982 751.353,-584.927 758.197,-593.015 761.747,-586.982"/>
+<path fill="none" stroke="#0044a0" d="M793.9,-725.961C783.582,-719.89 771.656,-712.873 760.092,-706.069"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="761.747,-702.982 751.353,-700.927 758.197,-709.015 761.747,-702.982"/>
 </g>
-<!-- t12 -->
-<g id="node14" class="node"><title>t12</title>
-<polygon fill="#efa03b" stroke="black" points="1108.08,-509.109 769.918,-509.109 765.918,-505.109 765.918,-473.109 1104.08,-473.109 1108.08,-477.109 1108.08,-509.109"/>
-<polyline fill="none" stroke="black" points="1104.08,-505.109 765.918,-505.109 "/>
-<polyline fill="none" stroke="black" points="1104.08,-505.109 1104.08,-473.109 "/>
-<polyline fill="none" stroke="black" points="1104.08,-505.109 1108.08,-509.109 "/>
-<text text-anchor="middle" x="937" y="-485.109" font-family="Times,serif" font-size="20.00">generate_postscript_stub</text>
+<!-- t14 -->
+<g id="node16" class="node"><title>t14</title>
+<polygon fill="#efa03b" stroke="black" points="1108.08,-625.109 769.918,-625.109 765.918,-621.109 765.918,-589.109 1104.08,-589.109 1108.08,-593.109 1108.08,-625.109"/>
+<polyline fill="none" stroke="black" points="1104.08,-621.109 765.918,-621.109 "/>
+<polyline fill="none" stroke="black" points="1104.08,-621.109 1104.08,-589.109 "/>
+<polyline fill="none" stroke="black" points="1104.08,-621.109 1108.08,-625.109 "/>
+<text text-anchor="middle" x="937" y="-601.109" font-family="Times,serif" font-size="20.00">generate_postscript_stub</text>
 </g>
-<!-- t0&#45;&gt;t12 -->
-<g id="edge19" class="edge"><title>t0&#45;&gt;t12</title>
-<path fill="none" stroke="#0044a0" d="M899.32,-610.004C909.979,-604.592 919.748,-597.466 927,-588.083 941.916,-568.78 943.099,-540.385 941.345,-519.486"/>
-<polygon fill="#0044a0" stroke="#0044a0" points="944.8,-518.881 940.218,-509.328 937.843,-519.653 944.8,-518.881"/>
+<!-- t0&#45;&gt;t14 -->
+<g id="edge22" class="edge"><title>t0&#45;&gt;t14</title>
+<path fill="none" stroke="#0044a0" d="M899.32,-726.004C909.979,-720.592 919.748,-713.466 927,-704.083 941.916,-684.78 943.099,-656.385 941.345,-635.486"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="944.8,-634.881 940.218,-625.328 937.843,-635.653 944.8,-634.881"/>
 </g>
 <!-- t2 -->
 <g id="node3" class="node"><title>t2</title>
-<polygon fill="#efa03b" stroke="black" points="592.299,-509.109 241.701,-509.109 237.701,-505.109 237.701,-473.109 588.299,-473.109 592.299,-477.109 592.299,-509.109"/>
-<polyline fill="none" stroke="black" points="588.299,-505.109 237.701,-505.109 "/>
-<polyline fill="none" stroke="black" points="588.299,-505.109 588.299,-473.109 "/>
-<polyline fill="none" stroke="black" points="588.299,-505.109 592.299,-509.109 "/>
-<text text-anchor="middle" x="415" y="-485.109" font-family="Times,serif" font-size="20.00">rasterize_with_ghostscript</text>
+<polygon fill="#efa03b" stroke="black" points="708.883,-625.109 427.117,-625.109 423.117,-621.109 423.117,-589.109 704.883,-589.109 708.883,-593.109 708.883,-625.109"/>
+<polyline fill="none" stroke="black" points="704.883,-621.109 423.117,-621.109 "/>
+<polyline fill="none" stroke="black" points="704.883,-621.109 704.883,-589.109 "/>
+<polyline fill="none" stroke="black" points="704.883,-621.109 708.883,-625.109 "/>
+<text text-anchor="middle" x="566" y="-601.109" font-family="Times,serif" font-size="20.00">rasterize_preview</text>
 </g>
 <!-- t1&#45;&gt;t2 -->
 <g id="edge2" class="edge"><title>t1&#45;&gt;t2</title>
-<path fill="none" stroke="#0044a0" d="M608.89,-535.808C573.672,-527.87 534.524,-519.048 500.679,-511.42"/>
-<polygon fill="#0044a0" stroke="#0044a0" points="501.345,-507.982 490.82,-509.198 499.806,-514.811 501.345,-507.982"/>
+<path fill="none" stroke="#0044a0" d="M660.443,-651.715C644.813,-644.498 627.615,-636.558 612.226,-629.452"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="613.454,-626.165 602.908,-625.15 610.52,-632.52 613.454,-626.165"/>
 </g>
-<!-- t7 -->
-<g id="node9" class="node"><title>t7</title>
+<!-- t3 -->
+<g id="node4" class="node"><title>t3</title>
+<polygon fill="#efa03b" stroke="black" points="766.148,-567.109 531.852,-567.109 527.852,-563.109 527.852,-531.109 762.148,-531.109 766.148,-535.109 766.148,-567.109"/>
+<polyline fill="none" stroke="black" points="762.148,-563.109 527.852,-563.109 "/>
+<polyline fill="none" stroke="black" points="762.148,-563.109 762.148,-531.109 "/>
+<polyline fill="none" stroke="black" points="762.148,-563.109 766.148,-567.109 "/>
+<text text-anchor="middle" x="647" y="-543.109" font-family="Times,serif" font-size="20.00">orient_page</text>
+</g>
+<!-- t1&#45;&gt;t3 -->
+<g id="edge4" class="edge"><title>t1&#45;&gt;t3</title>
+<path fill="none" stroke="#0044a0" d="M719.184,-651.557C724.864,-633.356 729.158,-607.968 718,-589.109 714.258,-582.785 709.086,-577.418 703.224,-572.88"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="705.107,-569.928 694.871,-567.195 701.168,-575.715 705.107,-569.928"/>
+</g>
+<!-- t2&#45;&gt;t3 -->
+<g id="edge3" class="edge"><title>t2&#45;&gt;t3</title>
+<path fill="none" stroke="#0044a0" d="M590.743,-589.003C598.123,-583.901 606.33,-578.227 614.143,-572.825"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="616.174,-575.677 622.409,-567.111 612.193,-569.919 616.174,-575.677"/>
+</g>
+<!-- t4 -->
+<g id="node5" class="node"><title>t4</title>
+<polygon fill="#efa03b" stroke="black" points="660.299,-509.109 309.701,-509.109 305.701,-505.109 305.701,-473.109 656.299,-473.109 660.299,-477.109 660.299,-509.109"/>
+<polyline fill="none" stroke="black" points="656.299,-505.109 305.701,-505.109 "/>
+<polyline fill="none" stroke="black" points="656.299,-505.109 656.299,-473.109 "/>
+<polyline fill="none" stroke="black" points="656.299,-505.109 660.299,-509.109 "/>
+<text text-anchor="middle" x="483" y="-485.109" font-family="Times,serif" font-size="20.00">rasterize_with_ghostscript</text>
+</g>
+<!-- t3&#45;&gt;t4 -->
+<g id="edge5" class="edge"><title>t3&#45;&gt;t4</title>
+<path fill="none" stroke="#0044a0" d="M596.903,-531.003C579.736,-525.141 560.358,-518.524 542.518,-512.433"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="543.384,-509.03 532.79,-509.111 541.122,-515.654 543.384,-509.03"/>
+</g>
+<!-- t9 -->
+<g id="node11" class="node"><title>t9</title>
 <polygon fill="#efa03b" stroke="black" points="314.109,-277.109 19.8906,-277.109 15.8906,-273.109 15.8906,-241.109 310.109,-241.109 314.109,-245.109 314.109,-277.109"/>
 <polyline fill="none" stroke="black" points="310.109,-273.109 15.8906,-273.109 "/>
 <polyline fill="none" stroke="black" points="310.109,-273.109 310.109,-241.109 "/>
 <polyline fill="none" stroke="black" points="310.109,-273.109 314.109,-277.109 "/>
 <text text-anchor="middle" x="165" y="-253.109" font-family="Times,serif" font-size="20.00">select_image_layer</text>
 </g>
-<!-- t1&#45;&gt;t7 -->
-<g id="edge11" class="edge"><title>t1&#45;&gt;t7</title>
-<path fill="none" stroke="#0044a0" d="M501.985,-548.028C416.651,-540.897 317.351,-528.985 229,-509.109 131.492,-487.174 17,-534.054 17,-434.109 17,-434.109 17,-434.109 17,-374.109 17,-340.481 4.97908,-324.525 27,-299.109 32.8004,-292.415 39.6539,-286.828 47.1559,-282.17"/>
+<!-- t3&#45;&gt;t9 -->
+<g id="edge14" class="edge"><title>t3&#45;&gt;t9</title>
+<path fill="none" stroke="#0044a0" d="M527.761,-538.463C345.118,-522.06 17,-485.447 17,-434.109 17,-434.109 17,-434.109 17,-374.109 17,-340.481 4.97908,-324.525 27,-299.109 32.8004,-292.415 39.6539,-286.828 47.1559,-282.17"/>
 <polygon fill="#0044a0" stroke="#0044a0" points="49.1993,-285.038 56.2361,-277.118 45.7959,-278.921 49.1993,-285.038"/>
 </g>
+<!-- t15 -->
+<g id="node14" class="node"><title>t15</title>
+<polygon fill="#efa03b" stroke="black" points="1014.34,-451.109 793.662,-451.109 789.662,-447.109 789.662,-415.109 1010.34,-415.109 1014.34,-419.109 1014.34,-451.109"/>
+<polyline fill="none" stroke="black" points="1010.34,-447.109 789.662,-447.109 "/>
+<polyline fill="none" stroke="black" points="1010.34,-447.109 1010.34,-415.109 "/>
+<polyline fill="none" stroke="black" points="1010.34,-447.109 1014.34,-451.109 "/>
+<text text-anchor="middle" x="902" y="-427.109" font-family="Times,serif" font-size="20.00">skip_page</text>
+</g>
+<!-- t3&#45;&gt;t15 -->
+<g id="edge19" class="edge"><title>t3&#45;&gt;t15</title>
+<path fill="none" stroke="#0044a0" d="M685.176,-531.042C730.244,-510.894 805.236,-477.369 854.145,-455.503"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="855.862,-458.57 863.563,-451.293 853.005,-452.179 855.862,-458.57"/>
+</g>
 <!-- t13 -->
-<g id="node12" class="node"><title>t13</title>
-<polygon fill="#efa03b" stroke="black" points="1029.34,-451.109 808.662,-451.109 804.662,-447.109 804.662,-415.109 1025.34,-415.109 1029.34,-419.109 1029.34,-451.109"/>
-<polyline fill="none" stroke="black" points="1025.34,-447.109 804.662,-447.109 "/>
-<polyline fill="none" stroke="black" points="1025.34,-447.109 1025.34,-415.109 "/>
-<polyline fill="none" stroke="black" points="1025.34,-447.109 1029.34,-451.109 "/>
-<text text-anchor="middle" x="917" y="-427.109" font-family="Times,serif" font-size="20.00">skip_page</text>
-</g>
-<!-- t1&#45;&gt;t13 -->
-<g id="edge16" class="edge"><title>t1&#45;&gt;t13</title>
-<path fill="none" stroke="#0044a0" d="M716.849,-535.484C723.825,-515.965 736.562,-488.763 757,-473.109 768.342,-464.422 781.352,-457.641 794.947,-452.352"/>
-<polygon fill="#0044a0" stroke="#0044a0" points="796.223,-455.612 804.44,-448.923 793.845,-449.029 796.223,-455.612"/>
-</g>
-<!-- t11 -->
-<g id="node13" class="node"><title>t11</title>
+<g id="node15" class="node"><title>t13</title>
 <polygon fill="#efa03b" stroke="black" points="1068.24,-335.109 687.76,-335.109 683.76,-331.109 683.76,-299.109 1064.24,-299.109 1068.24,-303.109 1068.24,-335.109"/>
 <polyline fill="none" stroke="black" points="1064.24,-331.109 683.76,-331.109 "/>
 <polyline fill="none" stroke="black" points="1064.24,-331.109 1064.24,-299.109 "/>
 <polyline fill="none" stroke="black" points="1064.24,-331.109 1068.24,-335.109 "/>
 <text text-anchor="middle" x="876" y="-311.109" font-family="Times,serif" font-size="20.00">tesseract_ocr_and_render_pdf</text>
 </g>
-<!-- t1&#45;&gt;t11 -->
-<g id="edge18" class="edge"><title>t1&#45;&gt;t11</title>
-<path fill="none" stroke="#0044a0" d="M710.717,-535.657C712.098,-517.744 715.879,-492.693 726,-473.109 754.651,-417.673 809.56,-368.731 844.403,-341.318"/>
-<polygon fill="#0044a0" stroke="#0044a0" points="846.607,-344.038 852.372,-335.148 842.322,-338.503 846.607,-344.038"/>
+<!-- t3&#45;&gt;t13 -->
+<g id="edge21" class="edge"><title>t3&#45;&gt;t13</title>
+<path fill="none" stroke="#0044a0" d="M664.212,-530.822C704.512,-490.347 805.091,-389.328 851.647,-342.569"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="854.325,-344.84 858.9,-335.284 849.364,-339.901 854.325,-344.84"/>
 </g>
-<!-- t3 -->
-<g id="node4" class="node"><title>t3</title>
-<polygon fill="#efa03b" stroke="black" points="564.742,-451.109 269.258,-451.109 265.258,-447.109 265.258,-415.109 560.742,-415.109 564.742,-419.109 564.742,-451.109"/>
-<polyline fill="none" stroke="black" points="560.742,-447.109 265.258,-447.109 "/>
-<polyline fill="none" stroke="black" points="560.742,-447.109 560.742,-415.109 "/>
-<polyline fill="none" stroke="black" points="560.742,-447.109 564.742,-451.109 "/>
-<text text-anchor="middle" x="415" y="-427.109" font-family="Times,serif" font-size="20.00">preprocess_deskew</text>
+<!-- t5 -->
+<g id="node6" class="node"><title>t5</title>
+<polygon fill="#efa03b" stroke="black" points="632.742,-451.109 337.258,-451.109 333.258,-447.109 333.258,-415.109 628.742,-415.109 632.742,-419.109 632.742,-451.109"/>
+<polyline fill="none" stroke="black" points="628.742,-447.109 333.258,-447.109 "/>
+<polyline fill="none" stroke="black" points="628.742,-447.109 628.742,-415.109 "/>
+<polyline fill="none" stroke="black" points="628.742,-447.109 632.742,-451.109 "/>
+<text text-anchor="middle" x="483" y="-427.109" font-family="Times,serif" font-size="20.00">preprocess_deskew</text>
 </g>
-<!-- t2&#45;&gt;t3 -->
-<g id="edge3" class="edge"><title>t2&#45;&gt;t3</title>
-<path fill="none" stroke="#0044a0" d="M415,-473.003C415,-469.312 415,-465.322 415,-461.352"/>
-<polygon fill="#0044a0" stroke="#0044a0" points="418.5,-461.111 415,-451.111 411.5,-461.111 418.5,-461.111"/>
+<!-- t4&#45;&gt;t5 -->
+<g id="edge6" class="edge"><title>t4&#45;&gt;t5</title>
+<path fill="none" stroke="#0044a0" d="M483,-473.003C483,-469.312 483,-465.322 483,-461.352"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="486.5,-461.111 483,-451.111 479.5,-461.111 486.5,-461.111"/>
 </g>
-<!-- t6 -->
-<g id="node8" class="node"><title>t6</title>
+<!-- t8 -->
+<g id="node10" class="node"><title>t8</title>
 <polygon fill="#efa03b" stroke="black" points="354.119,-335.109 39.8808,-335.109 35.8808,-331.109 35.8808,-299.109 350.119,-299.109 354.119,-303.109 354.119,-335.109"/>
 <polyline fill="none" stroke="black" points="350.119,-331.109 35.8808,-331.109 "/>
 <polyline fill="none" stroke="black" points="350.119,-331.109 350.119,-299.109 "/>
 <polyline fill="none" stroke="black" points="350.119,-331.109 354.119,-335.109 "/>
 <text text-anchor="middle" x="195" y="-311.109" font-family="Times,serif" font-size="20.00">select_image_for_pdf</text>
 </g>
-<!-- t2&#45;&gt;t6 -->
-<g id="edge9" class="edge"><title>t2&#45;&gt;t6</title>
-<path fill="none" stroke="#0044a0" d="M294.315,-473.06C280.461,-467.585 267.283,-460.434 256,-451.109 223.245,-424.039 207.251,-375.559 200.099,-345.207"/>
-<polygon fill="#0044a0" stroke="#0044a0" points="203.486,-344.31 197.925,-335.292 196.648,-345.809 203.486,-344.31"/>
+<!-- t4&#45;&gt;t8 -->
+<g id="edge12" class="edge"><title>t4&#45;&gt;t8</title>
+<path fill="none" stroke="#0044a0" d="M377.567,-473.091C359.004,-467.659 340.397,-460.511 324,-451.109 276.505,-423.877 235.143,-374.068 212.753,-343.738"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="215.39,-341.412 206.69,-335.366 209.721,-345.518 215.39,-341.412"/>
 </g>
-<!-- t4 -->
-<g id="node5" class="node"><title>t4</title>
-<polygon fill="#efa03b" stroke="black" points="587.95,-393.109 310.05,-393.109 306.05,-389.109 306.05,-357.109 583.95,-357.109 587.95,-361.109 587.95,-393.109"/>
-<polyline fill="none" stroke="black" points="583.95,-389.109 306.05,-389.109 "/>
-<polyline fill="none" stroke="black" points="583.95,-389.109 583.95,-357.109 "/>
-<polyline fill="none" stroke="black" points="583.95,-389.109 587.95,-393.109 "/>
-<text text-anchor="middle" x="447" y="-369.109" font-family="Times,serif" font-size="20.00">preprocess_clean</text>
+<!-- t6 -->
+<g id="node7" class="node"><title>t6</title>
+<polygon fill="#efa03b" stroke="black" points="641.95,-393.109 364.05,-393.109 360.05,-389.109 360.05,-357.109 637.95,-357.109 641.95,-361.109 641.95,-393.109"/>
+<polyline fill="none" stroke="black" points="637.95,-389.109 360.05,-389.109 "/>
+<polyline fill="none" stroke="black" points="637.95,-389.109 637.95,-357.109 "/>
+<polyline fill="none" stroke="black" points="637.95,-389.109 641.95,-393.109 "/>
+<text text-anchor="middle" x="501" y="-369.109" font-family="Times,serif" font-size="20.00">preprocess_clean</text>
 </g>
-<!-- t3&#45;&gt;t4 -->
-<g id="edge4" class="edge"><title>t3&#45;&gt;t4</title>
-<path fill="none" stroke="#0044a0" d="M424.775,-415.003C427.132,-410.878 429.703,-406.379 432.232,-401.952"/>
-<polygon fill="#0044a0" stroke="#0044a0" points="435.362,-403.53 437.285,-393.111 429.285,-400.057 435.362,-403.53"/>
+<!-- t5&#45;&gt;t6 -->
+<g id="edge7" class="edge"><title>t5&#45;&gt;t6</title>
+<path fill="none" stroke="#0044a0" d="M488.498,-415.003C489.755,-411.095 491.119,-406.852 492.468,-402.652"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="495.807,-403.702 495.535,-393.111 489.143,-401.56 495.807,-403.702"/>
 </g>
-<!-- t3&#45;&gt;t6 -->
-<g id="edge8" class="edge"><title>t3&#45;&gt;t6</title>
-<path fill="none" stroke="#0044a0" d="M351.041,-415.02C333.086,-409.151 313.871,-401.82 297,-393.109 269.828,-379.08 242.065,-358.149 222.377,-341.96"/>
-<polygon fill="#0044a0" stroke="#0044a0" points="224.378,-339.071 214.46,-335.347 219.891,-344.444 224.378,-339.071"/>
+<!-- t5&#45;&gt;t8 -->
+<g id="edge11" class="edge"><title>t5&#45;&gt;t8</title>
+<path fill="none" stroke="#0044a0" d="M417.515,-415.016C396.072,-408.823 372.295,-401.313 351,-393.109 310.847,-377.641 266.808,-356.013 235.791,-339.955"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="237.087,-336.684 226.602,-335.163 233.85,-342.89 237.087,-336.684"/>
 </g>
-<!-- t5 -->
-<g id="node6" class="node"><title>t5</title>
+<!-- t7 -->
+<g id="node8" class="node"><title>t7</title>
 <polygon fill="#efa03b" stroke="black" points="665.666,-335.109 376.334,-335.109 372.334,-331.109 372.334,-299.109 661.666,-299.109 665.666,-303.109 665.666,-335.109"/>
 <polyline fill="none" stroke="black" points="661.666,-331.109 372.334,-331.109 "/>
 <polyline fill="none" stroke="black" points="661.666,-331.109 661.666,-299.109 "/>
 <polyline fill="none" stroke="black" points="661.666,-331.109 665.666,-335.109 "/>
 <text text-anchor="middle" x="519" y="-311.109" font-family="Times,serif" font-size="20.00">ocr_tesseract_hocr</text>
 </g>
-<!-- t4&#45;&gt;t5 -->
-<g id="edge5" class="edge"><title>t4&#45;&gt;t5</title>
-<path fill="none" stroke="#0044a0" d="M468.994,-357.003C475.274,-352.118 482.229,-346.709 488.905,-341.516"/>
-<polygon fill="#0044a0" stroke="#0044a0" points="491.396,-344.013 497.141,-335.111 487.099,-338.487 491.396,-344.013"/>
+<!-- t6&#45;&gt;t7 -->
+<g id="edge8" class="edge"><title>t6&#45;&gt;t7</title>
+<path fill="none" stroke="#0044a0" d="M506.498,-357.003C507.755,-353.095 509.119,-348.852 510.468,-344.652"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="513.807,-345.702 513.535,-335.111 507.143,-343.56 513.807,-345.702"/>
 </g>
-<!-- t4&#45;&gt;t6 -->
-<g id="edge7" class="edge"><title>t4&#45;&gt;t6</title>
-<path fill="none" stroke="#0044a0" d="M370.365,-357.079C342.386,-350.862 310.55,-343.787 281.752,-337.388"/>
-<polygon fill="#0044a0" stroke="#0044a0" points="282.225,-333.907 271.704,-335.155 280.707,-340.741 282.225,-333.907"/>
+<!-- t6&#45;&gt;t8 -->
+<g id="edge10" class="edge"><title>t6&#45;&gt;t8</title>
+<path fill="none" stroke="#0044a0" d="M407.944,-357.079C373.227,-350.726 333.62,-343.478 298.057,-336.969"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="298.608,-333.512 288.141,-335.155 297.348,-340.398 298.608,-333.512"/>
 </g>
-<!-- t4&#45;&gt;t11 -->
-<g id="edge17" class="edge"><title>t4&#45;&gt;t11</title>
-<path fill="none" stroke="#0044a0" d="M577.461,-357.079C627.38,-350.563 684.512,-343.106 735.342,-336.47"/>
-<polygon fill="#0044a0" stroke="#0044a0" points="735.957,-339.92 745.42,-335.155 735.051,-332.979 735.957,-339.92"/>
+<!-- t6&#45;&gt;t13 -->
+<g id="edge20" class="edge"><title>t6&#45;&gt;t13</title>
+<path fill="none" stroke="#0044a0" d="M615.04,-357.079C658.311,-350.618 707.783,-343.23 751.935,-336.636"/>
+<polygon fill="#0044a0" stroke="#0044a0" points="752.483,-340.093 761.857,-335.155 751.449,-333.17 752.483,-340.093"/>
 </g>
-<!-- t8 -->
-<g id="node7" class="node"><title>t8</title>
+<!-- t10 -->
+<g id="node9" class="node"><title>t10</title>
 <polygon fill="#efa03b" stroke="black" points="986.109,-277.109 701.891,-277.109 697.891,-273.109 697.891,-241.109 982.109,-241.109 986.109,-245.109 986.109,-277.109"/>
 <polyline fill="none" stroke="black" points="982.109,-273.109 697.891,-273.109 "/>
 <polyline fill="none" stroke="black" points="982.109,-273.109 982.109,-241.109 "/>
 <polyline fill="none" stroke="black" points="982.109,-273.109 986.109,-277.109 "/>
 <text text-anchor="middle" x="842" y="-253.109" font-family="Times,serif" font-size="20.00">render_hocr_page</text>
 </g>
-<!-- t5&#45;&gt;t8 -->
-<g id="edge6" class="edge"><title>t5&#45;&gt;t8</title>
+<!-- t7&#45;&gt;t10 -->
+<g id="edge9" class="edge"><title>t7&#45;&gt;t10</title>
 <path fill="none" stroke="#0044a0" d="M617.226,-299.079C654.028,-292.699 696.036,-285.416 733.699,-278.886"/>
 <polygon fill="#0044a0" stroke="#0044a0" points="734.429,-282.312 743.685,-277.155 733.234,-275.415 734.429,-282.312"/>
 </g>
-<!-- t9 -->
-<g id="node11" class="node"><title>t9</title>
+<!-- t11 -->
+<g id="node13" class="node"><title>t11</title>
 <polygon fill="#efa03b" stroke="black" points="679.486,-277.109 336.514,-277.109 332.514,-273.109 332.514,-241.109 675.486,-241.109 679.486,-245.109 679.486,-277.109"/>
 <polyline fill="none" stroke="black" points="675.486,-273.109 332.514,-273.109 "/>
 <polyline fill="none" stroke="black" points="675.486,-273.109 675.486,-241.109 "/>
 <polyline fill="none" stroke="black" points="675.486,-273.109 679.486,-277.109 "/>
 <text text-anchor="middle" x="506" y="-253.109" font-family="Times,serif" font-size="20.00">render_hocr_debug_page</text>
 </g>
-<!-- t5&#45;&gt;t9 -->
-<g id="edge15" class="edge"><title>t5&#45;&gt;t9</title>
+<!-- t7&#45;&gt;t11 -->
+<g id="edge18" class="edge"><title>t7&#45;&gt;t11</title>
 <path fill="none" stroke="#0044a0" d="M515.029,-299.003C514.147,-295.204 513.191,-291.087 512.243,-287.002"/>
 <polygon fill="#0044a0" stroke="#0044a0" points="515.617,-286.06 509.947,-277.111 508.799,-287.643 515.617,-286.06"/>
 </g>
-<!-- t10 -->
-<g id="node10" class="node"><title>t10</title>
+<!-- t12 -->
+<g id="node12" class="node"><title>t12</title>
 <polygon fill="#efa03b" stroke="black" points="973.082,-219.109 714.918,-219.109 710.918,-215.109 710.918,-183.109 969.082,-183.109 973.082,-187.109 973.082,-219.109"/>
 <polyline fill="none" stroke="black" points="969.082,-215.109 710.918,-215.109 "/>
 <polyline fill="none" stroke="black" points="969.082,-215.109 969.082,-183.109 "/>
 <polyline fill="none" stroke="black" points="969.082,-215.109 973.082,-219.109 "/>
 <text text-anchor="middle" x="842" y="-195.109" font-family="Times,serif" font-size="20.00">add_text_layer</text>
 </g>
-<!-- t8&#45;&gt;t10 -->
-<g id="edge12" class="edge"><title>t8&#45;&gt;t10</title>
+<!-- t10&#45;&gt;t12 -->
+<g id="edge15" class="edge"><title>t10&#45;&gt;t12</title>
 <path fill="none" stroke="#0044a0" d="M842,-241.003C842,-237.312 842,-233.322 842,-229.352"/>
 <polygon fill="#0044a0" stroke="#0044a0" points="845.5,-229.111 842,-219.111 838.5,-229.111 845.5,-229.111"/>
 </g>
-<!-- t6&#45;&gt;t7 -->
-<g id="edge10" class="edge"><title>t6&#45;&gt;t7</title>
+<!-- t8&#45;&gt;t9 -->
+<g id="edge13" class="edge"><title>t8&#45;&gt;t9</title>
 <path fill="none" stroke="#0044a0" d="M185.836,-299.003C183.626,-294.878 181.216,-290.379 178.845,-285.952"/>
 <polygon fill="#0044a0" stroke="#0044a0" points="181.915,-284.273 174.108,-277.111 175.745,-287.578 181.915,-284.273"/>
 </g>
-<!-- t6&#45;&gt;t9 -->
-<g id="edge14" class="edge"><title>t6&#45;&gt;t9</title>
+<!-- t8&#45;&gt;t11 -->
+<g id="edge17" class="edge"><title>t8&#45;&gt;t11</title>
 <path fill="none" stroke="#0044a0" d="M289.577,-299.079C324.86,-292.726 365.115,-285.478 401.259,-278.969"/>
 <polygon fill="#0044a0" stroke="#0044a0" points="402.116,-282.372 411.337,-277.155 400.875,-275.482 402.116,-282.372"/>
 </g>
-<!-- t7&#45;&gt;t10 -->
-<g id="edge13" class="edge"><title>t7&#45;&gt;t10</title>
+<!-- t9&#45;&gt;t12 -->
+<g id="edge16" class="edge"><title>t9&#45;&gt;t12</title>
 <path fill="none" stroke="#0044a0" d="M314.366,-242.002C317.606,-241.697 320.821,-241.399 324,-241.109 451.228,-229.521 596.262,-218.816 700.44,-211.565"/>
 <polygon fill="#0044a0" stroke="#0044a0" points="700.976,-215.036 710.71,-210.852 700.491,-208.053 700.976,-215.036"/>
 </g>
-<!-- t14 -->
-<g id="node15" class="node"><title>t14</title>
+<!-- t16 -->
+<g id="node17" class="node"><title>t16</title>
 <polygon fill="#efa03b" stroke="black" points="774.472,-105.333 939,-78.005 1103.53,-105.333 1103.37,-149.551 774.625,-149.551 774.472,-105.333"/>
 <polygon fill="none" stroke="black" points="770.46,-101.94 939,-73.9453 1107.54,-101.94 1107.36,-153.556 770.639,-153.556 770.46,-101.94"/>
 <text text-anchor="middle" x="939" y="-111.555" font-family="Times,serif" font-size="20.00">merge_pages</text>
 </g>
-<!-- t10&#45;&gt;t14 -->
-<g id="edge23" class="edge"><title>t10&#45;&gt;t14</title>
+<!-- t12&#45;&gt;t16 -->
+<g id="edge26" class="edge"><title>t12&#45;&gt;t16</title>
 <path fill="none" stroke="#0044a0" d="M862.571,-182.814C870.479,-176.165 879.888,-168.254 889.356,-160.293"/>
 <polygon fill="#0044a0" stroke="#0044a0" points="891.89,-162.736 897.292,-153.622 887.385,-157.378 891.89,-162.736"/>
 </g>
-<!-- t9&#45;&gt;t14 -->
-<g id="edge24" class="edge"><title>t9&#45;&gt;t14</title>
+<!-- t11&#45;&gt;t16 -->
+<g id="edge27" class="edge"><title>t11&#45;&gt;t16</title>
 <path fill="none" stroke="#0044a0" d="M546.87,-241.103C586.226,-225.054 647.626,-200.869 702,-183.109 730.59,-173.771 761.415,-164.679 791.074,-156.405"/>
 <polygon fill="#0044a0" stroke="#0044a0" points="792.38,-159.675 801.082,-153.632 790.511,-152.93 792.38,-159.675"/>
 </g>
-<!-- t13&#45;&gt;t14 -->
-<g id="edge20" class="edge"><title>t13&#45;&gt;t14</title>
-<path fill="none" stroke="#0044a0" d="M986.636,-414.956C1033.58,-398.748 1087,-369.103 1087,-318.109 1087,-318.109 1087,-318.109 1087,-258.109 1087,-215.948 1054.81,-182.761 1020.46,-159.346"/>
+<!-- t15&#45;&gt;t16 -->
+<g id="edge23" class="edge"><title>t15&#45;&gt;t16</title>
+<path fill="none" stroke="#0044a0" d="M983.413,-415.045C1033.05,-399.337 1087,-370.266 1087,-318.109 1087,-318.109 1087,-318.109 1087,-258.109 1087,-215.948 1054.81,-182.761 1020.46,-159.346"/>
 <polygon fill="#0044a0" stroke="#0044a0" points="1021.98,-156.158 1011.7,-153.604 1018.15,-162.013 1021.98,-156.158"/>
 </g>
-<!-- t11&#45;&gt;t14 -->
-<g id="edge22" class="edge"><title>t11&#45;&gt;t14</title>
+<!-- t13&#45;&gt;t16 -->
+<g id="edge25" class="edge"><title>t13&#45;&gt;t16</title>
 <path fill="none" stroke="#0044a0" d="M969.056,-299.033C979.177,-293.579 988.239,-286.44 995,-277.109 1020.03,-242.568 998.243,-195.803 974.929,-162.021"/>
 <polygon fill="#0044a0" stroke="#0044a0" points="977.661,-159.825 968.996,-153.728 971.968,-163.897 977.661,-159.825"/>
 </g>
-<!-- t12&#45;&gt;t14 -->
-<g id="edge21" class="edge"><title>t12&#45;&gt;t14</title>
-<path fill="none" stroke="#0044a0" d="M1006.64,-472.956C1053.58,-456.748 1107,-427.103 1107,-376.109 1107,-376.109 1107,-376.109 1107,-258.109 1107,-220.742 1095.53,-209.426 1069,-183.109 1059.8,-173.988 1049.06,-165.927 1037.76,-158.869"/>
+<!-- t14&#45;&gt;t16 -->
+<g id="edge24" class="edge"><title>t14&#45;&gt;t16</title>
+<path fill="none" stroke="#0044a0" d="M1006.64,-588.956C1053.58,-572.748 1107,-543.103 1107,-492.109 1107,-492.109 1107,-492.109 1107,-258.109 1107,-220.742 1095.53,-209.426 1069,-183.109 1059.8,-173.988 1049.06,-165.927 1037.76,-158.869"/>
 <polygon fill="#0044a0" stroke="#0044a0" points="1039.41,-155.777 1029.03,-153.665 1035.83,-161.791 1039.41,-155.777"/>
 </g>
-<!-- t15 -->
-<g id="node16" class="node"><title>t15</title>
+<!-- t17 -->
+<g id="node18" class="node"><title>t17</title>
 <polygon fill="#efa03b" stroke="black" points="1053.18,-52 828.822,-52 824.822,-48 824.822,-16 1049.18,-16 1053.18,-20 1053.18,-52"/>
 <polyline fill="none" stroke="black" points="1049.18,-48 824.822,-48 "/>
 <polyline fill="none" stroke="black" points="1049.18,-48 1049.18,-16 "/>
 <polyline fill="none" stroke="black" points="1049.18,-48 1053.18,-52 "/>
 <text text-anchor="middle" x="939" y="-28" font-family="Times,serif" font-size="20.00">copy_final</text>
 </g>
-<!-- t14&#45;&gt;t15 -->
-<g id="edge25" class="edge"><title>t14&#45;&gt;t15</title>
+<!-- t16&#45;&gt;t17 -->
+<g id="edge28" class="edge"><title>t16&#45;&gt;t17</title>
 <path fill="none" stroke="#0044a0" d="M939,-73.8665C939,-69.8921 939,-65.942 939,-62.1676"/>
 <polygon fill="#0044a0" stroke="#0044a0" points="942.5,-62.1213 939,-52.1214 935.5,-62.1214 942.5,-62.1213"/>
 </g>