pdfinfo: improve the regex

This commit is contained in:
James R. Barlow 2018-07-04 00:59:32 -07:00
parent 8b0496d35e
commit 216d60ea2c
2 changed files with 31 additions and 1 deletions

View File

@ -36,7 +36,16 @@ Encoding = Enum('Encoding',
'ccitt jpeg jpeg2000 jbig2 asciihex ascii85 lzw flate ' + \
'runlength')
# NOTE(review): earlier, simpler form of the pattern. The same name is rebound
# by the verbose pattern a few lines below, so this object is replaced right
# away. Because of [^\/]+ it cannot match a tag that contains "/" before the
# closing "/>" (for example c="/").
regex_remove_char_tags = re.compile(br"<char[^\/]+\/>")
# Forgive me for I have sinned:
# I am using regular expressions to parse XML. However, the XML in this case,
# generated by Ghostscript, is self-consistent enough to be parseable.
#
# Bytes pattern for a single self-closing <char .../> element:
#   * "<char" followed by a word boundary (so a longer tag name that merely
#     starts with "char" is not accepted),
#   * then any run of bytes other than ">"; the three-byte sequence
#     quote, ">", quote is accepted as well, so an attribute whose value is
#     a literal ">" does not end the match early,
#   * then the closing "/>".
# re.VERBOSE makes the whitespace and "#" comments inside the pattern inert.
# Presumably used to strip these elements from the Ghostscript output before
# further parsing -- confirm against the call site.
regex_remove_char_tags = re.compile(br"""
<char\b
(?: [^>] # anything single character but >
| \">\" # special case: trap ">"
)*
/> # terminate with '/>'
""", re.VERBOSE)
FRIENDLY_COLORSPACE = {
'/DeviceGray': Colorspace.gray,

View File

@ -148,3 +148,24 @@ def test_pickle(resources):
filename = resources / 'formxobject.pdf'
pdf = pdfinfo.PdfInfo(filename)
pickle.dumps(pdf)
def test_regex():
    """Check which byte strings ``regex_remove_char_tags`` matches from the start.

    Self-closing ``<char .../>`` elements must match, including ones whose
    ``c`` attribute holds ``/`` or ``>``; other tags must not match.
    """
    pattern = pdfinfo.regex_remove_char_tags

    # Samples the pattern has to accept when anchored at the start.
    accepted = (
        b'<char bbox="0 108 0 108" c="/"/>',
        b'<char bbox="0 108 0 108" c=">"/>',
        b'<char bbox="0 108 0 108" c="X"/>',
    )
    # Samples that are not <char .../> elements and must be rejected.
    rejected = (
        b'<span stuff="c">',
        b'<span>',
        b'</span>',
        b'</page>',
    )

    for sample in accepted:
        assert pattern.match(sample) is not None
    for sample in rejected:
        assert pattern.match(sample) is None