2013-11-27 22:34:21 +01:00
|
|
|
#!/usr/bin/python
|
2013-04-26 11:49:21 +02:00
|
|
|
##############################################################################
|
|
|
|
|
# Copyright (c) 2013: fritz-hh from Github (https://github.com/fritz-hh)
|
2013-04-10 16:19:46 +03:00
|
|
|
#
|
2013-04-26 11:49:21 +02:00
|
|
|
# Copyright (c) 2010: Jonathan Brinley from Github (https://github.com/jbrinley/HocrConverter)
|
2013-04-10 16:19:46 +03:00
|
|
|
# Initial version by Jonathan Brinley, jonathanbrinley@gmail.com
|
2013-04-26 11:49:21 +02:00
|
|
|
##############################################################################
|
2013-04-09 19:00:26 +02:00
|
|
|
from reportlab.pdfgen.canvas import Canvas
|
|
|
|
|
from reportlab.lib.units import inch
|
|
|
|
|
from lxml import etree as ElementTree
|
2013-11-27 22:34:21 +01:00
|
|
|
from PIL import Image
|
|
|
|
|
import re, sys
|
2013-04-11 20:24:37 +02:00
|
|
|
import argparse
|
2013-04-09 19:00:26 +02:00
|
|
|
|
2013-04-09 21:35:22 +02:00
|
|
|
class hocrTransform():
|
2013-04-09 19:00:26 +02:00
|
|
|
"""
|
2013-04-09 21:51:39 +02:00
|
|
|
A class for converting documents from the hOCR format.
|
2013-04-09 19:00:26 +02:00
|
|
|
For details of the hOCR format, see:
|
|
|
|
|
http://docs.google.com/View?docid=dfxcv4vc_67g844kf
|
|
|
|
|
"""
|
2013-04-11 20:24:37 +02:00
|
|
|
def __init__(self, hocrFileName, dpi):
|
2013-04-09 21:35:22 +02:00
|
|
|
self.dpi = dpi
|
2013-04-09 19:00:26 +02:00
|
|
|
self.boxPattern = re.compile('bbox((\s+\d+){4})')
|
2013-04-09 21:51:39 +02:00
|
|
|
|
|
|
|
|
self.hocr = ElementTree.ElementTree()
|
|
|
|
|
self.hocr.parse(hocrFileName)
|
|
|
|
|
|
|
|
|
|
# if the hOCR file has a namespace, ElementTree requires its use to find elements
|
|
|
|
|
matches = re.match('({.*})html', self.hocr.getroot().tag)
|
|
|
|
|
self.xmlns = ''
|
|
|
|
|
if matches:
|
|
|
|
|
self.xmlns = matches.group(1)
|
2013-04-10 16:33:03 +03:00
|
|
|
|
|
|
|
|
# get dimension in pt (not pixel!!!!) of the OCRed image
|
|
|
|
|
for div in self.hocr.findall(".//%sdiv[@class='ocr_page']"%(self.xmlns)):
|
|
|
|
|
coords = self.element_coordinates(div)
|
|
|
|
|
self.width = self.px2pt(coords[2]-coords[0])
|
|
|
|
|
self.height = self.px2pt(coords[3]-coords[1])
|
|
|
|
|
break # there shouldn't be more than one, and if there is, we don't want it
|
2013-04-28 14:56:14 +02:00
|
|
|
|
|
|
|
|
# no width and heigh definition in the ocr_image element of the hocr file
|
2013-04-10 16:33:03 +03:00
|
|
|
if self.width is None:
|
2013-11-27 22:34:21 +01:00
|
|
|
print("No page dimension found in the hocr file")
|
2013-04-28 14:56:14 +02:00
|
|
|
sys.exit(1)
|
2013-04-09 21:35:22 +02:00
|
|
|
|
2013-04-09 19:00:26 +02:00
|
|
|
def __str__(self):
|
|
|
|
|
"""
|
|
|
|
|
Return the textual content of the HTML body
|
|
|
|
|
"""
|
|
|
|
|
if self.hocr is None:
|
|
|
|
|
return ''
|
|
|
|
|
body = self.hocr.find(".//%sbody"%(self.xmlns))
|
|
|
|
|
if body:
|
|
|
|
|
return self._get_element_text(body).encode('utf-8') # XML gives unicode
|
|
|
|
|
else:
|
|
|
|
|
return ''
|
|
|
|
|
|
|
|
|
|
def _get_element_text(self, element):
|
|
|
|
|
"""
|
|
|
|
|
Return the textual content of the element and its children
|
|
|
|
|
"""
|
|
|
|
|
text = ''
|
|
|
|
|
if element.text is not None:
|
|
|
|
|
text = text + element.text
|
|
|
|
|
for child in element.getchildren():
|
|
|
|
|
text = text + self._get_element_text(child)
|
|
|
|
|
if element.tail is not None:
|
|
|
|
|
text = text + element.tail
|
|
|
|
|
return text
|
|
|
|
|
|
|
|
|
|
def element_coordinates(self, element):
|
|
|
|
|
"""
|
|
|
|
|
Returns a tuple containing the coordinates of the bounding box around
|
|
|
|
|
an element
|
|
|
|
|
"""
|
|
|
|
|
out = (0,0,0,0)
|
|
|
|
|
if 'title' in element.attrib:
|
|
|
|
|
matches = self.boxPattern.search(element.attrib['title'])
|
|
|
|
|
if matches:
|
|
|
|
|
coords = matches.group(1).split()
|
|
|
|
|
out = (int(coords[0]),int(coords[1]),int(coords[2]),int(coords[3]))
|
|
|
|
|
return out
|
|
|
|
|
|
2013-04-09 21:51:39 +02:00
|
|
|
def px2pt(self, pxl):
|
2013-04-09 21:35:22 +02:00
|
|
|
"""
|
|
|
|
|
Returns the length in pt given length in pxl
|
|
|
|
|
"""
|
|
|
|
|
return float(pxl)/self.dpi*inch
|
|
|
|
|
|
2013-04-26 11:49:21 +02:00
|
|
|
def to_pdf(self, outFileName, imageFileName, showBoundingboxes, fontname="Helvetica"):
|
2013-04-09 19:00:26 +02:00
|
|
|
"""
|
|
|
|
|
Creates a PDF file with an image superimposed on top of the text.
|
|
|
|
|
Text is positioned according to the bounding box of the lines in
|
|
|
|
|
the hOCR file.
|
|
|
|
|
The image need not be identical to the image used to create the hOCR file.
|
2013-04-09 21:35:22 +02:00
|
|
|
It can have a lower resolution, different color mode, etc.
|
2013-04-09 19:00:26 +02:00
|
|
|
"""
|
|
|
|
|
# create the PDF file
|
2013-04-10 16:33:03 +03:00
|
|
|
pdf = Canvas(outFileName, pagesize=(self.width, self.height), pageCompression=1) # page size in points (1/72 in.)
|
2013-04-26 11:49:21 +02:00
|
|
|
|
2013-04-22 22:48:41 +02:00
|
|
|
# draw bounding box for each paragraph
|
|
|
|
|
pdf.setStrokeColorRGB(0,1,1) # light blue for bounding box of paragraph
|
|
|
|
|
pdf.setFillColorRGB(0,1,1) # light blue for bounding box of paragraph
|
|
|
|
|
pdf.setLineWidth(0) # no line for bounding box
|
|
|
|
|
for elem in self.hocr.findall(".//%sp[@class='%s']" % (self.xmlns, "ocr_par")):
|
|
|
|
|
|
|
|
|
|
elemtxt=self._get_element_text(elem).rstrip()
|
|
|
|
|
if len(elemtxt) == 0:
|
|
|
|
|
continue
|
2013-04-09 19:00:26 +02:00
|
|
|
|
2013-04-22 22:48:41 +02:00
|
|
|
coords = self.element_coordinates(elem)
|
|
|
|
|
x1=self.px2pt(coords[0])
|
|
|
|
|
y1=self.px2pt(coords[1])
|
|
|
|
|
x2=self.px2pt(coords[2])
|
|
|
|
|
y2=self.px2pt(coords[3])
|
|
|
|
|
|
|
|
|
|
# draw the bbox border
|
|
|
|
|
if showBoundingboxes == True:
|
|
|
|
|
pdf.rect(x1, self.height-y2, x2-x1, y2-y1, fill=1)
|
|
|
|
|
|
|
|
|
|
|
2013-04-09 19:00:26 +02:00
|
|
|
# check if element with class 'ocrx_word' are available
|
|
|
|
|
# otherwise use 'ocr_line' as fallback
|
|
|
|
|
elemclass="ocr_line"
|
|
|
|
|
if self.hocr.find(".//%sspan[@class='ocrx_word']" %(self.xmlns)) is not None:
|
|
|
|
|
elemclass="ocrx_word"
|
|
|
|
|
|
2013-04-09 21:35:22 +02:00
|
|
|
# itterate all text elements
|
2013-04-22 22:48:41 +02:00
|
|
|
pdf.setStrokeColorRGB(1,0,0) # light green for bounding box of word/line
|
|
|
|
|
pdf.setLineWidth(0.5) # bounding box line width
|
|
|
|
|
pdf.setDash(6,3) # bounding box is dashed
|
|
|
|
|
pdf.setFillColorRGB(0,0,0) # text in black
|
2013-04-09 19:00:26 +02:00
|
|
|
for elem in self.hocr.findall(".//%sspan[@class='%s']" % (self.xmlns, elemclass)):
|
|
|
|
|
|
|
|
|
|
elemtxt=self._get_element_text(elem).rstrip()
|
|
|
|
|
if len(elemtxt) == 0:
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
coords = self.element_coordinates(elem)
|
2013-04-09 21:51:39 +02:00
|
|
|
x1=self.px2pt(coords[0])
|
|
|
|
|
y1=self.px2pt(coords[1])
|
|
|
|
|
x2=self.px2pt(coords[2])
|
|
|
|
|
y2=self.px2pt(coords[3])
|
2013-04-09 21:35:22 +02:00
|
|
|
|
2013-04-09 19:00:26 +02:00
|
|
|
# draw the bbox border
|
2013-04-11 20:24:37 +02:00
|
|
|
if showBoundingboxes == True:
|
|
|
|
|
pdf.rect(x1, self.height-y2, x2-x1, y2-y1, fill=0)
|
2013-04-09 19:00:26 +02:00
|
|
|
|
|
|
|
|
text = pdf.beginText()
|
2013-04-09 21:51:39 +02:00
|
|
|
fontsize=self.px2pt(coords[3]-coords[1])
|
2013-04-09 19:00:26 +02:00
|
|
|
text.setFont(fontname, fontsize)
|
|
|
|
|
|
|
|
|
|
# set cursor to bottom left corner of bbox (adjust for dpi)
|
2013-04-10 16:33:03 +03:00
|
|
|
text.setTextOrigin(x1, self.height-y2)
|
2013-04-09 19:00:26 +02:00
|
|
|
|
|
|
|
|
# scale the width of the text to fill the width of the bbox
|
2013-04-09 21:35:22 +02:00
|
|
|
text.setHorizScale(100*(x2-x1)/pdf.stringWidth(elemtxt, fontname, fontsize))
|
2013-04-09 19:00:26 +02:00
|
|
|
|
|
|
|
|
# write the text to the page
|
|
|
|
|
text.textLine(elemtxt)
|
|
|
|
|
pdf.drawText(text)
|
|
|
|
|
|
2013-04-22 22:48:41 +02:00
|
|
|
# put the image on the page, scaled to fill the page
|
|
|
|
|
if imageFileName != None:
|
|
|
|
|
im = Image.open(imageFileName)
|
|
|
|
|
pdf.drawInlineImage(im, 0, 0, width=self.width, height=self.height)
|
|
|
|
|
|
2013-04-09 19:00:26 +02:00
|
|
|
# finish up the page and save it
|
|
|
|
|
pdf.showPage()
|
|
|
|
|
pdf.save()
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
2013-04-11 20:24:37 +02:00
|
|
|
parser = argparse.ArgumentParser(description='Convert hocr file to PDF')
|
|
|
|
|
parser.add_argument('-b', '--boundingboxes', action="store_true", default=False, help='Show bounding boxes borders')
|
|
|
|
|
parser.add_argument('-r', '--resolution', type=int, default=300, help='Resolution of the image that was OCRed')
|
|
|
|
|
parser.add_argument('-i', '--image', default=None, help='Path to the image to be placed above the text')
|
|
|
|
|
parser.add_argument('hocrfile', help='Path to the hocr file to be parsed')
|
|
|
|
|
parser.add_argument('outputfile', help='Path to the PDF file to be generated')
|
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
|
|
|
hocr = hocrTransform(args.hocrfile, args.resolution)
|
2013-04-22 22:48:41 +02:00
|
|
|
hocr.to_pdf(args.outputfile, args.image, args.boundingboxes)
|
2013-04-11 20:24:37 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|