mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-12-30 00:31:59 +00:00
Trailing whitespace
This commit is contained in:
parent
6333ec928c
commit
807c8b0726
@ -54,7 +54,7 @@ class HocrTransform():
|
||||
|
||||
box_pattern = re.compile(r'bbox((\s+\d+){4})')
|
||||
baseline_pattern = re.compile(r'''
|
||||
baseline \s+
|
||||
baseline \s+
|
||||
([\-\+]?\d*\.?\d*) \s+ # +/- decimal float
|
||||
([\-\+]?\d+) # +/- int''', re.VERBOSE)
|
||||
ligatures = str.maketrans({
|
||||
@ -138,7 +138,7 @@ class HocrTransform():
|
||||
matches = cls.baseline_pattern.search(element.attrib['title'])
|
||||
if matches:
|
||||
return float(matches.group(1)), int(matches.group(2))
|
||||
return (0, 0)
|
||||
return (0, 0)
|
||||
|
||||
def pt_from_pixel(self, pxl):
|
||||
"""
|
||||
@ -192,12 +192,12 @@ class HocrTransform():
|
||||
pdf.rect(
|
||||
pt.x1, self.height - pt.y2, pt.x2 - pt.x1, pt.y2 - pt.y1,
|
||||
fill=1)
|
||||
|
||||
|
||||
found_lines = False
|
||||
for line in self.hocr.findall(
|
||||
".//%sspan[@class='%s']" % (self.xmlns, "ocr_line")):
|
||||
found_lines = True
|
||||
self._do_line(pdf, line, "ocrx_word", fontname, invisibleText,
|
||||
self._do_line(pdf, line, "ocrx_word", fontname, invisibleText,
|
||||
interwordSpaces, showBoundingboxes)
|
||||
|
||||
if not found_lines:
|
||||
@ -220,7 +220,7 @@ class HocrTransform():
|
||||
return x * poly[0] + poly[1]
|
||||
|
||||
|
||||
def _do_line(self, pdf, line, elemclass, fontname, invisibleText,
|
||||
def _do_line(self, pdf, line, elemclass, fontname, invisibleText,
|
||||
interwordSpaces, showBoundingboxes):
|
||||
pxl_line_coords = self.element_coordinates(line)
|
||||
line_box = self.pt_from_pixel(pxl_line_coords)
|
||||
@ -254,10 +254,10 @@ class HocrTransform():
|
||||
pdf.setLineWidth(0.5)
|
||||
# negate slope because it is defined as a rise/run in pixel
|
||||
# coordinates and page coordinates have the y axis flipped
|
||||
pdf.line(line_box.x1,
|
||||
pdf.line(line_box.x1,
|
||||
baseline_y2,
|
||||
line_box.x2,
|
||||
self.polyval((-slope, baseline_y2),
|
||||
self.polyval((-slope, baseline_y2),
|
||||
line_box.x2 - line_box.x1))
|
||||
# light green for bounding box of word/line
|
||||
pdf.setDash(6, 3)
|
||||
@ -281,16 +281,16 @@ class HocrTransform():
|
||||
box = self.pt_from_pixel(pxl_coords)
|
||||
if interwordSpaces:
|
||||
# if `--interword-spaces` is true, append a space
|
||||
# to the end of each text element to allow simpler PDF viewers
|
||||
# such as PDF.js to better recognize words in search and copy
|
||||
# to the end of each text element to allow simpler PDF viewers
|
||||
# such as PDF.js to better recognize words in search and copy
|
||||
# and paste. Do not remove space from last word in line, even
|
||||
# though it would look better, because it will interfere with
|
||||
# naive text extraction. \n does not work either.
|
||||
elemtxt += ' '
|
||||
box = Rect._make((
|
||||
box.x1,
|
||||
box.x1,
|
||||
line_box.y1,
|
||||
box.x2 + pdf.stringWidth(' ', fontname, line_height),
|
||||
box.x2 + pdf.stringWidth(' ', fontname, line_height),
|
||||
line_box.y2))
|
||||
box_width = box.x2 - box.x1
|
||||
font_width = pdf.stringWidth(elemtxt, fontname, fontsize)
|
||||
@ -311,14 +311,14 @@ class HocrTransform():
|
||||
# content stream while this issues a "offset" (Td) command.
|
||||
# .moveCursor() is relative to start of the text line, where the
|
||||
# "text line" means whatever reportlab defines it as. Do not use
|
||||
# use .getCursor(), since moveCursor() rather unintuitively plans
|
||||
# use .getCursor(), since moveCursor() rather unintuitively plans
|
||||
# its moves relative to .getStartOfLine().
|
||||
# For skewed lines, in the text transform we set up a rotated
|
||||
# coordinate system, so we don't have to account for the
|
||||
# coordinate system, so we don't have to account for the
|
||||
# incremental offset. Surprisingly most PDF viewers can handle this.
|
||||
cursor = text.getStartOfLine()
|
||||
dx = box.x1 - cursor[0]
|
||||
dy = baseline_y2 - cursor[1]
|
||||
dy = baseline_y2 - cursor[1]
|
||||
text.moveCursor(dx, dy)
|
||||
|
||||
# If reportlab tells us this word is 0 units wide, our best seems
|
||||
|
||||
@ -121,11 +121,11 @@ def main():
|
||||
|
||||
argv_slug = '__'.join(slugs())
|
||||
argv_slug = argv_slug.replace('/', '___')
|
||||
|
||||
cache_folder = Path(CACHE_ROOT) / Path(source).stem / argv_slug
|
||||
cache_folder.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
print("Tesseract cache folder {} - ".format(cache_folder), end='',
|
||||
cache_folder = Path(CACHE_ROOT) / Path(source).stem / argv_slug
|
||||
cache_folder.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
print("Tesseract cache folder {} - ".format(cache_folder), end='',
|
||||
file=sys.stderr)
|
||||
|
||||
if (cache_folder / 'stderr.bin').exists() and not cache_disabled:
|
||||
@ -141,7 +141,7 @@ def main():
|
||||
for configfile in args.configfiles:
|
||||
# cp cache -> output
|
||||
tessfile = args.outputbase + '.' + configfile
|
||||
shutil.copy(str(cache_folder / configfile) + '.bin',
|
||||
shutil.copy(str(cache_folder / configfile) + '.bin',
|
||||
tessfile)
|
||||
sys.exit(0)
|
||||
|
||||
@ -184,7 +184,7 @@ def main():
|
||||
manifest['sourcefile'] = str(Path(source).relative_to(TESTS_ROOT))
|
||||
def clean_sys_argv():
|
||||
for arg in sys.argv[1:]:
|
||||
yield re.sub(r'.*/com.github.ocrmypdf[^/]+[/](.*)',
|
||||
yield re.sub(r'.*/com.github.ocrmypdf[^/]+[/](.*)',
|
||||
r'$TMPDIR/\1', arg)
|
||||
manifest['args'] = list(clean_sys_argv())
|
||||
with (Path(CACHE_ROOT) / 'manifest.jsonl').open('a') as f:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user