diff --git a/src/ocrmypdf/pdfinfo.py b/src/ocrmypdf/pdfinfo.py
index d46f2175..4e36207c 100644
--- a/src/ocrmypdf/pdfinfo.py
+++ b/src/ocrmypdf/pdfinfo.py
@@ -36,7 +36,16 @@ Encoding = Enum('Encoding',
'ccitt jpeg jpeg2000 jbig2 asciihex ascii85 lzw flate ' + \
'runlength')
-regex_remove_char_tags = re.compile(br"")
+# Forgive me, for I have sinned: this matches one self-closing <char .../>
+# element with a regular expression instead of an XML parser. The XML here is
+# generated by Ghostscript and is self-consistent enough to be parseable.
+regex_remove_char_tags = re.compile(br"""
+    <char\b (?:   # opening of a char element, then any run of:
+        [^>]      # anything single character but >
+      | \">\"     # special case: trap ">"
+    )*
+    />            # terminate with '/>'
+""", re.VERBOSE)
FRIENDLY_COLORSPACE = {
'/DeviceGray': Colorspace.gray,
diff --git a/tests/test_pageinfo.py b/tests/test_pdfinfo.py
similarity index 90%
rename from tests/test_pageinfo.py
rename to tests/test_pdfinfo.py
index c5e2386b..ca4bcc43 100644
--- a/tests/test_pageinfo.py
+++ b/tests/test_pdfinfo.py
@@ -148,3 +148,24 @@ def test_pickle(resources):
filename = resources / 'formxobject.pdf'
pdf = pdfinfo.PdfInfo(filename)
pickle.dumps(pdf)
+
+
+def test_regex():
+    """regex_remove_char_tags must match only <char .../> tags."""
+    rx = pdfinfo.regex_remove_char_tags
+    must_match = [
+        b'<char bbox="0 108 0 108" c="/"/>',
+        b'<char bbox="0 108 0 108" c=">"/>',  # quoted '>' inside an attribute
+        b'<char bbox="0 108 0 108" c="X"/>',
+    ]
+    must_not_match = [
+        b'<span stuff="c">',
+        b'<span>',
+        b'</span>',
+        b'</page>'
+    ]
+
+    for s in must_match:
+        assert rx.match(s)
+    for s in must_not_match:
+        assert not rx.match(s)