mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-01-05 11:41:19 +00:00
pdfinfo: improve the regex
This commit is contained in:
parent
8b0496d35e
commit
216d60ea2c
@ -36,7 +36,16 @@ Encoding = Enum('Encoding',
|
||||
'ccitt jpeg jpeg2000 jbig2 asciihex ascii85 lzw flate ' + \
|
||||
'runlength')
|
||||
|
||||
# NOTE(review): this looks like the pre-change pattern that the verbose
# definition further down replaces -- confirm before relying on it.
# [^\/]+ cannot cross a '/', so a tag whose attribute value contains '/'
# (for example c="/") is not matched by this form.
regex_remove_char_tags = re.compile(br"<char[^\/]+\/>")
|
||||
# Forgive me, for I have sinned: this uses a regular expression to parse XML.
# The XML in this case is generated by Ghostscript and is self-consistent
# enough to be parseable this way.
#
# The pattern matches one self-closing <char .../> element.  Attribute text
# is consumed one byte at a time up to the first '>', with a dedicated
# alternative for the quoted value ">" so that a literal '>' character inside
# an attribute does not end the match early.  The pattern is compiled with
# re.VERBOSE, so whitespace and '#' comments inside the literal are ignored.
regex_remove_char_tags = re.compile(br"""
    <char\b
    (?:   [^>]      # any single character except >
      |   \">\"     # special case: trap ">"
    )*
    />              # terminate with '/>'
""", re.VERBOSE)
|
||||
|
||||
FRIENDLY_COLORSPACE = {
|
||||
'/DeviceGray': Colorspace.gray,
|
||||
|
||||
@ -148,3 +148,24 @@ def test_pickle(resources):
|
||||
filename = resources / 'formxobject.pdf'
|
||||
pdf = pdfinfo.PdfInfo(filename)
|
||||
pickle.dumps(pdf)
|
||||
|
||||
|
||||
def test_regex():
    """Check which tags ``pdfinfo.regex_remove_char_tags`` matches.

    Self-closing ``<char .../>`` tags must match from the start of the input,
    including ones whose ``c`` attribute holds ``/`` or ``>``.  ``<span>``,
    ``</span>`` and ``</page>`` tags must not match.
    """
    rx = pdfinfo.regex_remove_char_tags

    must_match = [
        b'<char bbox="0 108 0 108" c="/"/>',
        b'<char bbox="0 108 0 108" c=">"/>',
        b'<char bbox="0 108 0 108" c="X"/>',
    ]
    must_not_match = [
        b'<span stuff="c">',
        b'<span>',
        b'</span>',
        b'</page>',
    ]

    # The sample is passed as the assertion message so that a failure
    # identifies which input broke the expectation.
    for s in must_match:
        assert rx.match(s), s
    for s in must_not_match:
        assert not rx.match(s), s
|
||||
Loading…
x
Reference in New Issue
Block a user