pdfinfo: improve the regex

2026-01-05 11:41:19 +00:00 · 2018-07-04 00:59:32 -07:00 · 2018-07-04 00:59:32 -07:00 · 216d60ea2c
commit 216d60ea2c
parent 8b0496d35e
2 changed files with 31 additions and 1 deletion
--- a/src/ocrmypdf/pdfinfo.py
+++ b/src/ocrmypdf/pdfinfo.py
@@ -36,7 +36,16 @@ Encoding = Enum('Encoding',
                'ccitt jpeg jpeg2000 jbig2 asciihex ascii85 lzw flate ' + \
                'runlength')

-regex_remove_char_tags = re.compile(br"<char[^\/]+\/>")
+# Forgive me for I have sinned
+# I am using regular expressions to parse XML. However, the XML in this case,
+# generated by Ghostscript, is self-consistent enough to be parseable.
+regex_remove_char_tags = re.compile(br"""
+    <char\b
+    (?:   [^>]   # anything single character but >
+        | \">\"  # special case: trap ">"
+    )*
+    />           # terminate with '/>'
+""", re.VERBOSE)

 FRIENDLY_COLORSPACE = {
    '/DeviceGray': Colorspace.gray,
--- a/tests/test_pageinfo.py
+++ b/tests/test_pageinfo.py
@@ -148,3 +148,24 @@ def test_pickle(resources):
    filename = resources / 'formxobject.pdf'
    pdf = pdfinfo.PdfInfo(filename)
    pickle.dumps(pdf)
+
+
+def test_regex():
+    rx = pdfinfo.regex_remove_char_tags
+
+    must_match = [
+        b'<char bbox="0 108 0 108" c="/"/>',
+        b'<char bbox="0 108 0 108" c=">"/>',
+        b'<char bbox="0 108 0 108" c="X"/>',
+    ]
+    must_not_match = [
+        b'<span stuff="c">',
+        b'<span>',
+        b'</span>',
+        b'</page>'
+    ]
+
+    for s in must_match:
+        assert rx.match(s)
+    for s in must_not_match:
+        assert not rx.match(s)