Trailing whitespace

This commit is contained in:
James R. Barlow 2018-06-23 01:51:19 -07:00
parent 6333ec928c
commit 807c8b0726
2 changed files with 20 additions and 20 deletions

View File

@ -54,7 +54,7 @@ class HocrTransform():
box_pattern = re.compile(r'bbox((\s+\d+){4})')
baseline_pattern = re.compile(r'''
baseline \s+
baseline \s+
([\-\+]?\d*\.?\d*) \s+ # +/- decimal float
([\-\+]?\d+) # +/- int''', re.VERBOSE)
ligatures = str.maketrans({
@ -138,7 +138,7 @@ class HocrTransform():
matches = cls.baseline_pattern.search(element.attrib['title'])
if matches:
return float(matches.group(1)), int(matches.group(2))
return (0, 0)
return (0, 0)
def pt_from_pixel(self, pxl):
"""
@ -192,12 +192,12 @@ class HocrTransform():
pdf.rect(
pt.x1, self.height - pt.y2, pt.x2 - pt.x1, pt.y2 - pt.y1,
fill=1)
found_lines = False
for line in self.hocr.findall(
".//%sspan[@class='%s']" % (self.xmlns, "ocr_line")):
found_lines = True
self._do_line(pdf, line, "ocrx_word", fontname, invisibleText,
self._do_line(pdf, line, "ocrx_word", fontname, invisibleText,
interwordSpaces, showBoundingboxes)
if not found_lines:
@ -220,7 +220,7 @@ class HocrTransform():
return x * poly[0] + poly[1]
def _do_line(self, pdf, line, elemclass, fontname, invisibleText,
def _do_line(self, pdf, line, elemclass, fontname, invisibleText,
interwordSpaces, showBoundingboxes):
pxl_line_coords = self.element_coordinates(line)
line_box = self.pt_from_pixel(pxl_line_coords)
@ -254,10 +254,10 @@ class HocrTransform():
pdf.setLineWidth(0.5)
# negate slope because it is defined as a rise/run in pixel
# coordinates and page coordinates have the y axis flipped
pdf.line(line_box.x1,
pdf.line(line_box.x1,
baseline_y2,
line_box.x2,
self.polyval((-slope, baseline_y2),
self.polyval((-slope, baseline_y2),
line_box.x2 - line_box.x1))
# light green for bounding box of word/line
pdf.setDash(6, 3)
@ -281,16 +281,16 @@ class HocrTransform():
box = self.pt_from_pixel(pxl_coords)
if interwordSpaces:
# if `--interword-spaces` is true, append a space
# to the end of each text element to allow simpler PDF viewers
# such as PDF.js to better recognize words in search and copy
# to the end of each text element to allow simpler PDF viewers
# such as PDF.js to better recognize words in search and copy
# and paste. Do not remove space from last word in line, even
# though it would look better, because it will interfere with
# naive text extraction. \n does not work either.
elemtxt += ' '
box = Rect._make((
box.x1,
box.x1,
line_box.y1,
box.x2 + pdf.stringWidth(' ', fontname, line_height),
box.x2 + pdf.stringWidth(' ', fontname, line_height),
line_box.y2))
box_width = box.x2 - box.x1
font_width = pdf.stringWidth(elemtxt, fontname, fontsize)
@ -311,14 +311,14 @@ class HocrTransform():
# content stream while this issues a "offset" (Td) command.
# .moveCursor() is relative to start of the text line, where the
# "text line" means whatever reportlab defines it as. Do not
# use .getCursor(), since moveCursor() rather unintuitively plans
# use .getCursor(), since moveCursor() rather unintuitively plans
# its moves relative to .getStartOfLine().
# For skewed lines, in the text transform we set up a rotated
# coordinate system, so we don't have to account for the
# coordinate system, so we don't have to account for the
# incremental offset. Surprisingly most PDF viewers can handle this.
cursor = text.getStartOfLine()
dx = box.x1 - cursor[0]
dy = baseline_y2 - cursor[1]
dy = baseline_y2 - cursor[1]
text.moveCursor(dx, dy)
# If reportlab tells us this word is 0 units wide, our best seems

View File

@ -121,11 +121,11 @@ def main():
argv_slug = '__'.join(slugs())
argv_slug = argv_slug.replace('/', '___')
cache_folder = Path(CACHE_ROOT) / Path(source).stem / argv_slug
cache_folder.mkdir(parents=True, exist_ok=True)
print("Tesseract cache folder {} - ".format(cache_folder), end='',
cache_folder = Path(CACHE_ROOT) / Path(source).stem / argv_slug
cache_folder.mkdir(parents=True, exist_ok=True)
print("Tesseract cache folder {} - ".format(cache_folder), end='',
file=sys.stderr)
if (cache_folder / 'stderr.bin').exists() and not cache_disabled:
@ -141,7 +141,7 @@ def main():
for configfile in args.configfiles:
# cp cache -> output
tessfile = args.outputbase + '.' + configfile
shutil.copy(str(cache_folder / configfile) + '.bin',
shutil.copy(str(cache_folder / configfile) + '.bin',
tessfile)
sys.exit(0)
@ -184,7 +184,7 @@ def main():
manifest['sourcefile'] = str(Path(source).relative_to(TESTS_ROOT))
def clean_sys_argv():
for arg in sys.argv[1:]:
yield re.sub(r'.*/com.github.ocrmypdf[^/]+[/](.*)',
yield re.sub(r'.*/com.github.ocrmypdf[^/]+[/](.*)',
r'$TMPDIR/\1', arg)
manifest['args'] = list(clean_sys_argv())
with (Path(CACHE_ROOT) / 'manifest.jsonl').open('a') as f: