OCRmyPDF/src/tesseract.py

41 lines
952 B
Python
Raw Normal View History

2015-07-23 17:06:00 -07:00
#!/usr/bin/env python3
2015-07-23 18:38:59 -07:00
from subprocess import Popen, PIPE, CalledProcessError
2015-07-23 17:06:00 -07:00
import sys
2015-07-23 18:38:59 -07:00
import os
import re
2015-07-23 17:06:00 -07:00
2015-07-23 18:38:59 -07:00
def _version():
    """Return the version string reported by ``tesseract --version``.

    The banner line has the form ``tesseract <version>``; everything after
    the first whitespace character on that line is returned.  Depending on
    the Tesseract release the banner is written to stderr or to stdout, so
    both streams are inspected (stderr first, which is the only stream this
    function used to read).

    Raises:
        OSError: if the ``tesseract`` executable cannot be started.
        subprocess.TimeoutExpired: if it does not finish within 5 seconds.
        ValueError: if neither stream begins with a version banner.
    """
    from subprocess import TimeoutExpired

    args_tess = [
        'tesseract',
        '--version',
    ]
    # The context manager closes both pipes and reaps the child on exit.
    with Popen(args_tess, close_fds=True, universal_newlines=True,
               stdout=PIPE, stderr=PIPE) as p_tess:
        try:
            out, err = p_tess.communicate(timeout=5)
        except TimeoutExpired:
            # communicate() does not kill the child on timeout; do it here
            # so leaving the ``with`` block does not wait on a hung process.
            p_tess.kill()
            raise

    for stream in (err, out):
        banner = re.match(r'tesseract\s(.+)', stream)
        if banner:
            return banner.group(1)

    raise ValueError(
        "Could not parse output of 'tesseract --version': %r" % (err or out))
2015-07-23 17:06:00 -07:00
2015-07-23 18:38:59 -07:00
def _languages():
    """Return the set of language names listed by ``tesseract --list-langs``.

    The first line of the listing is skipped (it is treated as a header, as
    the original implementation did); each remaining non-blank line is
    stripped of surrounding whitespace and added to the result.  Depending
    on the Tesseract release the listing is written to stdout or to stderr,
    so stdout is used when it has content and stderr otherwise.

    Raises:
        OSError: if the ``tesseract`` executable cannot be started.
        subprocess.TimeoutExpired: if it does not finish within 5 seconds.
    """
    from subprocess import TimeoutExpired

    args_tess = [
        'tesseract',
        '--list-langs',
    ]
    # The context manager closes both pipes and reaps the child on exit.
    with Popen(args_tess, close_fds=True, universal_newlines=True,
               stdout=PIPE, stderr=PIPE) as p_tess:
        try:
            out, err = p_tess.communicate(timeout=5)
        except TimeoutExpired:
            # communicate() does not kill the child on timeout; do it here
            # so leaving the ``with`` block does not wait on a hung process.
            p_tess.kill()
            raise

    # NOTE(review): preferring stdout assumes that a release which lists
    # languages on stderr leaves stdout empty -- confirm against the oldest
    # supported Tesseract.
    listing = out if out.strip() else err

    names = (line.strip() for line in listing.splitlines()[1:])
    # Blank lines would otherwise show up as an empty-string "language".
    return {name for name in names if name}
# Probe the tesseract installation once, at import time.  On success the
# module exposes VERSION (the string returned by _version()) and LANGUAGES
# (the set returned by _languages()); on any failure the process exits with
# status 1.
try:
    VERSION = _version()
    LANGUAGES = _languages()
except Exception as e:
    # Report the underlying error on stderr together with the summary line,
    # so diagnostics never end up mixed into the program's stdout.
    print(e, file=sys.stderr)
    print("Could not find tesseract executable", file=sys.stderr)
    sys.exit(1)