Trailing whitespace

2025-12-30 00:31:59 +00:00 · 2018-06-23 01:51:19 -07:00 · 2018-06-23 01:51:19 -07:00 · 807c8b0726
commit 807c8b0726
parent 6333ec928c
2 changed files with 20 additions and 20 deletions
--- a/src/ocrmypdf/hocrtransform.py
+++ b/src/ocrmypdf/hocrtransform.py
@@ -54,7 +54,7 @@ class HocrTransform():

    box_pattern = re.compile(r'bbox((\s+\d+){4})')
    baseline_pattern = re.compile(r'''
-        baseline \s+ 
+        baseline \s+
        ([\-\+]?\d*\.?\d*) \s+  # +/- decimal float
        ([\-\+]?\d+)            # +/- int''', re.VERBOSE)
    ligatures = str.maketrans({
@@ -138,7 +138,7 @@ class HocrTransform():
            matches = cls.baseline_pattern.search(element.attrib['title'])
            if matches:
                return float(matches.group(1)), int(matches.group(2))
-        return (0, 0) 
+        return (0, 0)

    def pt_from_pixel(self, pxl):
        """
@@ -192,12 +192,12 @@ class HocrTransform():
                pdf.rect(
                    pt.x1, self.height - pt.y2, pt.x2 - pt.x1, pt.y2 - pt.y1,
                    fill=1)
-                    
+
        found_lines = False
        for line in self.hocr.findall(
                ".//%sspan[@class='%s']" % (self.xmlns, "ocr_line")):
            found_lines = True
-            self._do_line(pdf, line, "ocrx_word", fontname, invisibleText, 
+            self._do_line(pdf, line, "ocrx_word", fontname, invisibleText,
                          interwordSpaces, showBoundingboxes)

        if not found_lines:
@@ -220,7 +220,7 @@ class HocrTransform():
        return x * poly[0] + poly[1]


-    def _do_line(self, pdf, line, elemclass, fontname, invisibleText, 
+    def _do_line(self, pdf, line, elemclass, fontname, invisibleText,
                 interwordSpaces, showBoundingboxes):
        pxl_line_coords = self.element_coordinates(line)
        line_box = self.pt_from_pixel(pxl_line_coords)
@@ -254,10 +254,10 @@ class HocrTransform():
            pdf.setLineWidth(0.5)
            # negate slope because it is defined as a rise/run in pixel
            # coordinates and page coordinates have the y axis flipped
-            pdf.line(line_box.x1, 
+            pdf.line(line_box.x1,
                     baseline_y2,
                     line_box.x2,
-                     self.polyval((-slope, baseline_y2), 
+                     self.polyval((-slope, baseline_y2),
                                  line_box.x2 - line_box.x1))
            # light green for bounding box of word/line
            pdf.setDash(6, 3)
@@ -281,16 +281,16 @@ class HocrTransform():
            box = self.pt_from_pixel(pxl_coords)
            if interwordSpaces:
                # if  `--interword-spaces` is true, append a space
-                # to the end of each text element to allow simpler PDF viewers 
-                # such as PDF.js to better recognize words in search and copy 
+                # to the end of each text element to allow simpler PDF viewers
+                # such as PDF.js to better recognize words in search and copy
                # and paste. Do not remove space from last word in line, even
                # though it would look better, because it will interfere with
                # naive text extraction. \n does not work either.
                elemtxt += ' '
                box = Rect._make((
-                    box.x1, 
+                    box.x1,
                    line_box.y1,
-                    box.x2 + pdf.stringWidth(' ', fontname, line_height), 
+                    box.x2 + pdf.stringWidth(' ', fontname, line_height),
                    line_box.y2))
            box_width = box.x2 - box.x1
            font_width = pdf.stringWidth(elemtxt, fontname, fontsize)
@@ -311,14 +311,14 @@ class HocrTransform():
            # content stream while this issues a "offset" (Td) command.
            # .moveCursor() is relative to start of the text line, where the
            # "text line" means whatever reportlab defines it as. Do not use
-            # use .getCursor(), since moveCursor() rather unintuitively plans 
+            # use .getCursor(), since moveCursor() rather unintuitively plans
            # its moves relative to .getStartOfLine().
            # For skewed lines, in the text transform we set up a rotated
-            # coordinate system, so we don't have to account for the 
+            # coordinate system, so we don't have to account for the
            # incremental offset. Surprisingly most PDF viewers can handle this.
            cursor = text.getStartOfLine()
            dx = box.x1 - cursor[0]
-            dy = baseline_y2 - cursor[1]            
+            dy = baseline_y2 - cursor[1]
            text.moveCursor(dx, dy)

            # If reportlab tells us this word is 0 units wide, our best seems
--- a/tests/spoof/tesseract_cache.py
+++ b/tests/spoof/tesseract_cache.py
@@ -121,11 +121,11 @@ def main():

    argv_slug = '__'.join(slugs())
    argv_slug = argv_slug.replace('/', '___')
-    
-    cache_folder = Path(CACHE_ROOT) / Path(source).stem / argv_slug
-    cache_folder.mkdir(parents=True, exist_ok=True)        

-    print("Tesseract cache folder {} - ".format(cache_folder), end='', 
+    cache_folder = Path(CACHE_ROOT) / Path(source).stem / argv_slug
+    cache_folder.mkdir(parents=True, exist_ok=True)
+
+    print("Tesseract cache folder {} - ".format(cache_folder), end='',
          file=sys.stderr)

    if (cache_folder / 'stderr.bin').exists() and not cache_disabled:
@@ -141,7 +141,7 @@ def main():
            for configfile in args.configfiles:
                # cp cache -> output
                tessfile = args.outputbase + '.' + configfile
-                shutil.copy(str(cache_folder / configfile) + '.bin', 
+                shutil.copy(str(cache_folder / configfile) + '.bin',
                            tessfile)
        sys.exit(0)

@@ -184,7 +184,7 @@ def main():
    manifest['sourcefile'] = str(Path(source).relative_to(TESTS_ROOT))
    def clean_sys_argv():
        for arg in sys.argv[1:]:
-            yield re.sub(r'.*/com.github.ocrmypdf[^/]+[/](.*)', 
+            yield re.sub(r'.*/com.github.ocrmypdf[^/]+[/](.*)',
                         r'$TMPDIR/\1', arg)
    manifest['args'] = list(clean_sys_argv())
    with (Path(CACHE_ROOT) / 'manifest.jsonl').open('a') as f: