mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-11-26 23:16:48 +00:00
Drop libxml2 dependency
It seems that Python's internal XML parser is good enough to do the job.
This commit is contained in:
parent
53c88093ad
commit
2dff3e07ce
@@ -96,7 +96,6 @@ Install dependencies::
|
|||||||
sudo apt-get install \
|
sudo apt-get install \
|
||||||
zlib1g-dev \
|
zlib1g-dev \
|
||||||
libjpeg-dev \
|
libjpeg-dev \
|
||||||
libxml2 \
|
|
||||||
tesseract-ocr \
|
tesseract-ocr \
|
||||||
qpdf \
|
qpdf \
|
||||||
unpaper \
|
unpaper \
|
||||||
|
|||||||
@@ -47,6 +47,7 @@ Changes
|
|||||||
- MuPDF_ tools
|
- MuPDF_ tools
|
||||||
- shell scripts
|
- shell scripts
|
||||||
- Java and JHOVE_
|
- Java and JHOVE_
|
||||||
|
- libxml2
|
||||||
|
|
||||||
- Some new external dependencies are required or optional, compared to v2.x:
|
- Some new external dependencies are required or optional, compared to v2.x:
|
||||||
|
|
||||||
@@ -66,6 +67,10 @@ Changes
|
|||||||
Release candidates
|
Release candidates
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
|
- rc6:
|
||||||
|
|
||||||
|
- dropped libxml2 (Python lxml) since Python 3's internal XML parser is sufficient
|
||||||
|
|
||||||
- rc5:
|
- rc5:
|
||||||
|
|
||||||
- dropped Java and JHOVE in favour of qpdf
|
- dropped Java and JHOVE in favour of qpdf
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
##############################################################################
|
##############################################################################
|
||||||
from reportlab.pdfgen.canvas import Canvas
|
from reportlab.pdfgen.canvas import Canvas
|
||||||
from reportlab.lib.units import inch
|
from reportlab.lib.units import inch
|
||||||
from lxml import etree as ElementTree
|
from xml.etree import ElementTree
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
import re
|
import re
|
||||||
@@ -35,8 +35,7 @@ class HocrTransform():
|
|||||||
self.dpi = dpi
|
self.dpi = dpi
|
||||||
self.boxPattern = re.compile(r'bbox((\s+\d+){4})')
|
self.boxPattern = re.compile(r'bbox((\s+\d+){4})')
|
||||||
|
|
||||||
self.hocr = ElementTree.ElementTree()
|
self.hocr = ElementTree.parse(hocrFileName)
|
||||||
self.hocr.parse(hocrFileName)
|
|
||||||
|
|
||||||
# if the hOCR file has a namespace, ElementTree requires its use to
|
# if the hOCR file has a namespace, ElementTree requires its use to
|
||||||
# find elements
|
# find elements
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user