mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-11-02 10:50:29 +00:00
Drop libxml2 dependency
It seems that Python's built-in XML parser (xml.etree.ElementTree) is good enough to do the job, so lxml (and with it libxml2) is no longer needed.
This commit is contained in:
parent
53c88093ad
commit
2dff3e07ce
@@ -96,7 +96,6 @@ Install dependencies::
|
||||
sudo apt-get install \
|
||||
zlib1g-dev \
|
||||
libjpeg-dev \
|
||||
libxml2 \
|
||||
tesseract-ocr \
|
||||
qpdf \
|
||||
unpaper \
|
||||
|
||||
@@ -47,6 +47,7 @@ Changes
|
||||
- MuPDF_ tools
|
||||
- shell scripts
|
||||
- Java and JHOVE_
|
||||
- libxml2
|
||||
|
||||
- Some new external dependencies are required or optional, compared to v2.x:
|
||||
|
||||
@@ -66,6 +67,10 @@ Changes
|
||||
Release candidates
|
||||
------------------
|
||||
|
||||
- rc6:
|
||||
|
||||
- dropped libxml2 (Python lxml) since Python 3's internal XML parser is sufficient
|
||||
|
||||
- rc5:
|
||||
|
||||
- dropped Java and JHOVE in favour of qpdf
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
##############################################################################
|
||||
from reportlab.pdfgen.canvas import Canvas
|
||||
from reportlab.lib.units import inch
|
||||
from lxml import etree as ElementTree
|
||||
from xml.etree import ElementTree
|
||||
from PIL import Image
|
||||
from collections import namedtuple
|
||||
import re
|
||||
@@ -35,8 +35,7 @@ class HocrTransform():
|
||||
self.dpi = dpi
|
||||
self.boxPattern = re.compile(r'bbox((\s+\d+){4})')
|
||||
|
||||
self.hocr = ElementTree.ElementTree()
|
||||
self.hocr.parse(hocrFileName)
|
||||
self.hocr = ElementTree.parse(hocrFileName)
|
||||
|
||||
# if the hOCR file has a namespace, ElementTree requires its use to
|
||||
# find elements
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user