From 216d60ea2c2f227a6772fe5eedbbbd8d761cb103 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Wed, 4 Jul 2018 00:59:32 -0700 Subject: [PATCH] pdfinfo: improve the regex --- src/ocrmypdf/pdfinfo.py | 11 ++++++++++- tests/{test_pageinfo.py => test_pdfinfo.py} | 21 +++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) rename tests/{test_pageinfo.py => test_pdfinfo.py} (90%) diff --git a/src/ocrmypdf/pdfinfo.py b/src/ocrmypdf/pdfinfo.py index d46f2175..4e36207c 100644 --- a/src/ocrmypdf/pdfinfo.py +++ b/src/ocrmypdf/pdfinfo.py @@ -36,7 +36,16 @@ Encoding = Enum('Encoding', 'ccitt jpeg jpeg2000 jbig2 asciihex ascii85 lzw flate ' + \ 'runlength') -regex_remove_char_tags = re.compile(br"") +# Forgive me for I have sinned +# I am using regular expressions to parse XML. However the XML in this case, +# generated by Ghostscript, is self-consistent enough to be parseable. +regex_remove_char_tags = re.compile(br""" + ] # anything single character but > + | \">\" # special case: trap ">" + )* + /> # terminate with '/>' +""", re.VERBOSE) FRIENDLY_COLORSPACE = { '/DeviceGray': Colorspace.gray, diff --git a/tests/test_pageinfo.py b/tests/test_pdfinfo.py similarity index 90% rename from tests/test_pageinfo.py rename to tests/test_pdfinfo.py index c5e2386b..ca4bcc43 100644 --- a/tests/test_pageinfo.py +++ b/tests/test_pdfinfo.py @@ -148,3 +148,24 @@ def test_pickle(resources): filename = resources / 'formxobject.pdf' pdf = pdfinfo.PdfInfo(filename) pickle.dumps(pdf) + + +def test_regex(): + rx = pdfinfo.regex_remove_char_tags + + must_match = [ + b'', + b'', + b'', + ] + must_not_match = [ + b'', + b'', + b'', + b'' + ] + + for s in must_match: + assert rx.match(s) + for s in must_not_match: + assert not rx.match(s)