From 6d3a7d634ee26f110a21e6fa98bf8115f1cb9996 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Wed, 2 Apr 2025 21:14:14 +0000 Subject: [PATCH 1/3] Adding autorender if katex into synthetic pipeline --- olmocr/bench/katex/auto-render.min.js | 1 + olmocr/bench/synth/mine_html_templates.py | 61 ++++++++++++++++++++--- 2 files changed, 55 insertions(+), 7 deletions(-) create mode 100644 olmocr/bench/katex/auto-render.min.js diff --git a/olmocr/bench/katex/auto-render.min.js b/olmocr/bench/katex/auto-render.min.js new file mode 100644 index 0000000..32a7dd8 --- /dev/null +++ b/olmocr/bench/katex/auto-render.min.js @@ -0,0 +1 @@ +!function(e,t){"object"==typeof exports&&"object"==typeof module?module.exports=t(require("katex")):"function"==typeof define&&define.amd?define(["katex"],t):"object"==typeof exports?exports.renderMathInElement=t(require("katex")):e.renderMathInElement=t(e.katex)}("undefined"!=typeof self?self:this,(function(e){return function(){"use strict";var t={757:function(t){t.exports=e}},n={};function r(e){var o=n[e];if(void 0!==o)return o.exports;var i=n[e]={exports:{}};return t[e](i,i.exports,r),i.exports}r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,{a:t}),t},r.d=function(e,t){for(var n in t)r.o(t,n)&&!r.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)};var o={};r.d(o,{default:function(){return p}});var i=r(757),a=r.n(i);const l=function(e,t,n){let r=n,o=0;const i=e.length;for(;re.left.replace(/[-/\\^$*+?.()|[\]{}]/g,"\\$&"))).join("|")+")");for(;n=e.search(o),-1!==n;){n>0&&(r.push({type:"text",data:e.slice(0,n)}),e=e.slice(n));const o=t.findIndex((t=>e.startsWith(t.left)));if(n=l(t[o].right,e,t[o].left.length),-1===n)break;const i=e.slice(0,n+t[o].right.length),a=s.test(i)?i:e.slice(t[o].left.length,n);r.push({type:"math",data:a,rawData:i,display:t[o].display}),e=e.slice(n+t[o].right.length)}return""!==e&&r.push({type:"text",data:e}),r};const c=function(e,t){const n=d(e,t.delimiters);if(1===n.length&&"text"===n[0].type)return null;const r=document.createDocumentFragment();for(let e=0;e-1===e.indexOf(" "+t+" ")))&&f(r,t)}}};var p=function(e,t){if(!e)throw new Error("No element provided to render");const n={};for(const e in t)t.hasOwnProperty(e)&&(n[e]=t[e]);n.delimiters=n.delimiters||[{left:"$$",right:"$$",display:!0},{left:"\\(",right:"\\)",display:!1},{left:"\\begin{equation}",right:"\\end{equation}",display:!0},{left:"\\begin{align}",right:"\\end{align}",display:!0},{left:"\\begin{alignat}",right:"\\end{alignat}",display:!0},{left:"\\begin{gather}",right:"\\end{gather}",display:!0},{left:"\\begin{CD}",right:"\\end{CD}",display:!0},{left:"\\[",right:"\\]",display:!0}],n.ignoredTags=n.ignoredTags||["script","noscript","style","textarea","pre","code","option"],n.ignoredClasses=n.ignoredClasses||[],n.errorCallback=n.errorCallback||console.error,n.macros=n.macros||{},f(e,n)};return o=o.default}()})); \ No newline at end of file diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py index 3e0158a..f546246 100644 --- a/olmocr/bench/synth/mine_html_templates.py +++ b/olmocr/bench/synth/mine_html_templates.py @@ -206,6 +206,31 @@ async def render_pdf_with_playwright(html_content, output_pdf_path, png_width, p # Set the HTML content await page.set_content(html_content) + # Add in katex and setup auto rendering + katex_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "katex") + 
katex_css_path = os.path.join(katex_dir, "katex.min.css") + katex_js_path = os.path.join(katex_dir, "katex.min.js") + katex_autorender_js_path = os.path.join(katex_dir, "auto-render.min.js") + + await page.add_style_tag(path=katex_css_path) + await page.add_script_tag(path=katex_js_path) + await page.add_script_tag(path=katex_autorender_js_path) + + await page.evaluate(""" + document.addEventListener("DOMContentLoaded", function() { + renderMathInElement(document.body, { + // customised options + // • auto-render specific keys, e.g.: + delimiters: [ + {left: '\\(', right: '\\)', display: true}, + {left: '\\[', right: '\\]', display: true} + ], + // • rendering keys, e.g.: + throwOnError : false + }); + }); + """) + # Save as PDF with formatting options await page.pdf( path=output_pdf_path, @@ -260,6 +285,28 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb for div in soup.find_all("div", class_="page-footer"): div.name = "footer" + + # Remove elements in the body that appear before the header or after the footer + body = soup.find('body') + if body: + header = soup.find('header') + footer = soup.find('footer') + + if header: + # Remove elements before the header + current = body.contents[0] + while current and current != header: + next_elem = current.next_sibling + current.extract() + current = next_elem + + if footer: + # Remove elements after the footer + current = footer.next_sibling + while current: + next_elem = current.next_sibling + current.extract() + current = next_elem # Step 1: Process headers, footers, and page numbers for TextAbsenceTests headers = soup.find_all("header") @@ -385,25 +432,25 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb # Check cell up if row_idx > 0: up_text = str(table_array[row_idx - 1, col_idx]).strip() - if up_text: + if up_text and "\n" not in up_text: test_data["up"] = up_text # Check cell down if row_idx < table_array.shape[0] - 1: down_text = str(table_array[row_idx + 1, col_idx]).strip() - if down_text: + if down_text and "\n" not in down_text: test_data["down"] = down_text # Check cell left if col_idx > 0: left_text = str(table_array[row_idx, col_idx - 1]).strip() - if left_text: + if left_text and "\n" not in left_text: test_data["left"] = left_text # Check cell right if col_idx < table_array.shape[1] - 1: right_text = str(table_array[row_idx, col_idx + 1]).strip() - if right_text: + if right_text and "\n" not in right_text: test_data["right"] = right_text # Check for top heading using header information @@ -413,7 +460,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb if col_headers: # Use the first header as the top heading _, top_heading = col_headers[0] - if top_heading: + if top_heading and "\n" not in top_heading: test_data["top_heading"] = top_heading # Check for left heading using header information @@ -423,7 +470,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb if row_headers: # Use the first header as the left heading _, left_heading = row_headers[0] - if left_heading: + if left_heading and "\n" not in left_heading: test_data["left_heading"] = left_heading # Only add the test if we have at least one relation @@ -529,7 +576,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb tests = [t for t in tests if t["type"] != "absent" or t["text"] not in full_text] # Remove any tests where the text has no alpha numeric characters - tests = [t for t in tests if "text" not in t 
or len([c for c in t["text"] if c.isalnum()])] + tests = [t for t in tests if "text" not in t or len([c for c in t["text"] if c.isalnum()])] return tests From 360b1be07cc6ff21143b2bc877c48b78a21d5995 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Wed, 2 Apr 2025 21:24:00 +0000 Subject: [PATCH 2/3] Better filtering of tests --- olmocr/bench/synth/mine_html_templates.py | 65 ++++++++++++++-- olmocr/bench/synth/test_mine.py | 94 +++++++++++++++++++++++ 2 files changed, 151 insertions(+), 8 deletions(-) diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py index f546246..ca537eb 100644 --- a/olmocr/bench/synth/mine_html_templates.py +++ b/olmocr/bench/synth/mine_html_templates.py @@ -453,8 +453,11 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb if right_text and "\n" not in right_text: test_data["right"] = right_text - # Check for top heading using header information - if col_idx in table_data.col_headers: + # Check if current cell is a heading cell + is_header_cell = row_idx in table_data.header_rows or col_idx in table_data.header_cols + + # Check for top heading using header information (skip if current cell is a heading) + if not is_header_cell and col_idx in table_data.col_headers: # Get the headers for this column col_headers = table_data.col_headers[col_idx] if col_headers: @@ -463,8 +466,8 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb if top_heading and "\n" not in top_heading: test_data["top_heading"] = top_heading - # Check for left heading using header information - if row_idx in table_data.row_headers: + # Check for left heading using header information (skip if current cell is a heading) + if not is_header_cell and row_idx in table_data.row_headers: # Get the headers for this row row_headers = table_data.row_headers[row_idx] if row_headers: @@ -575,10 +578,56 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb # If they do, filter them out tests = [t for t in tests if t["type"] != "absent" or t["text"] not in full_text] - # Remove any tests where the text has no alpha numeric characters - tests = [t for t in tests if "text" not in t or len([c for c in t["text"] if c.isalnum()])] - - return tests + # Remove any tests where text-based fields have no alphanumeric characters or contain LaTeX + text_fields = ["text", "cell", "before", "after", "up", "down", "left", "right", "top_heading", "left_heading"] + + def contains_alphanumeric(value): + return any(c.isalnum() for c in value) if isinstance(value, str) else False + + def contains_latex(value): + if not isinstance(value, str): + return False + # Check for LaTeX delimiters + latex_patterns = [r'\(', r'\)', r'\[', r'\]'] + return any(pattern in value for pattern in latex_patterns) + + filtered_tests = [] + for test in tests: + # Check all text fields in the test for alphanumeric content and LaTeX + all_valid = True + for field in text_fields: + if field in test: + # Skip test if field has no alphanumeric characters + if not contains_alphanumeric(test[field]): + all_valid = False + break + # Skip test if field contains LaTeX delimiters + if contains_latex(test[field]): + all_valid = False + break + if all_valid: + filtered_tests.append(test) + + tests = filtered_tests + + # Remove duplicate tests (identical on everything but the id field) + unique_tests = [] + test_signatures = set() + + for test in tests: + # Create a signature for the test by using all fields except 'id' + 
test_dict = test.copy() + test_id = test_dict.pop('id') + + # Convert dict to a sorted tuple of items for hashability + test_signature = tuple(sorted((k, str(v)) for k, v in test_dict.items())) + + # Only add the test if we haven't seen an identical one + if test_signature not in test_signatures: + test_signatures.add(test_signature) + unique_tests.append(test) + + return unique_tests def process_pdf(pdf_info, args, client): diff --git a/olmocr/bench/synth/test_mine.py b/olmocr/bench/synth/test_mine.py index 62dfff7..2e96f4e 100644 --- a/olmocr/bench/synth/test_mine.py +++ b/olmocr/bench/synth/test_mine.py @@ -757,5 +757,99 @@ class TestMineTests(unittest.TestCase): tests = generate_tests_from_html(html_content, "0", 1) + self.assertTrue(len(tests) > 10) + + def test_sup(self): + html_content = """ + + + + + + A ROSE BY ANY OTHER NAME + + + +
+
2016]
+
A ROSE BY ANY OTHER NAME
+
1083
+
+ +
+

cases were decided within a year of each other (2000 and 2001, respectively). Save the Manatee Club largely consists of a truncated version of the Consolidated-Tomoka analysis, with minor adjustments to conform the opinion to the 1999 amendments. Day Cruise, on the other hand, closely analyzes the 1999 version of section 120.52(8). However, it is Save the Manatee Club that has come to dominate Florida court opinions on rulemaking challenges and not the more detailed Day Cruise analysis.78 The following Sections will discuss the facts of the two cases, examine the differences between their analyses of section 120.52(8), and finally conclude with an opinion on which analysis is better to apply in section 120.52(8) rulemaking challenges.

+ +

A. Southwest Florida Water Management District v. Save the Manatee Club, Inc.

+ +

After the legislature amended the APA, the First DCA analyzed the statutory language of section 120.52(8) again in Southwest Florida Water Management District v. Save the Manatee Club, Inc.79 Save the Manatee Club concerned the Southwest Florida Water Management District's (the "District's") authority to create exemptions to environmental resource permitting requirements.80 South Shores Partners, Ltd. ("South Shores") applied "for a permit to develop a 720-acre tract of land in Southwest Hillsborough County."81 As part of the development project, South Shores wanted "to build a connecting waterway between the [existing] canal system [on the property] and the [Tampa] Bay."82 The Save the Manatee Club believed that the resulting increase in power boat traffic in this new waterway would "endanger the manatee and its habitat."83

+ +

The District has the authority to grant either a general permit or an environmental resource permit to a development project, depending on the type of project involved.84 When granting an environmental resource permit, the District must consider "[t]he impact a proposed development will have on wildlife" as a factor; it does not have to do so when it grants a general permit.85 The District granted South

+
+ +
+
78. As of December 14, 2015, a search of the "Citing References" on WestLaw shows that Save the Manatee Club has been cited by forty court opinions. Day Cruise, by comparison, has been cited by fifteen court opinions. These numbers do not include citations to either case in DOAH decisions.
+
79. 773 So. 2d 594 (Fla. 1st DCA 2000).
+
80. Id. at 596.
+
81. Id.
+
82. Id.
+
83. Id.
+
84. See id.
+
85. Id.
+
+ +""" + + tests = generate_tests_from_html(html_content, "0", 1) + for test in tests: print(test) From b8b780faca251f1b973f377b3f38b9fdc19f246c Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Wed, 2 Apr 2025 21:39:50 +0000 Subject: [PATCH 3/3] More mining of synthetic tests code --- olmocr/bench/synth/mine_html_templates.py | 166 ++++++++++++++++++---- olmocr/bench/synth/test_mine.py | 53 +++++++ olmocr/bench/test_tests.py | 4 +- 3 files changed, 190 insertions(+), 33 deletions(-) diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py index ca537eb..cae8686 100644 --- a/olmocr/bench/synth/mine_html_templates.py +++ b/olmocr/bench/synth/mine_html_templates.py @@ -216,20 +216,21 @@ async def render_pdf_with_playwright(html_content, output_pdf_path, png_width, p await page.add_script_tag(path=katex_js_path) await page.add_script_tag(path=katex_autorender_js_path) - await page.evaluate(""" - document.addEventListener("DOMContentLoaded", function() { - renderMathInElement(document.body, { + # Run the KaTeX auto-renderer immediately rather than waiting for DOMContentLoaded + await page.evaluate( + """ + renderMathInElement(document.body, { // customised options // • auto-render specific keys, e.g.: delimiters: [ - {left: '\\(', right: '\\)', display: true}, - {left: '\\[', right: '\\]', display: true} + {left: '\\\\(', right: '\\\\)', display: false}, + {left: '\\\\[', right: '\\\\]', display: true} ], // • rendering keys, e.g.: - throwOnError : false - }); + throwOnError: false }); - """) + """ + ) # Save as PDF with formatting options await page.pdf( @@ -274,6 +275,79 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb Returns: A list of test dictionaries that can be saved as JSONL """ + + # Helper function to convert superscripts and subscripts to Unicode + def convert_superscripts_subscripts(element): + # Map for superscript characters + superscript_map = { + "0": "⁰", + "1": "¹", + "2": "²", + "3": "³", + "4": "⁴", + "5": "⁵", + "6": "⁶", + "7": "⁷", + "8": "⁸", + "9": "⁹", + "+": "⁺", + "-": "⁻", + "=": "⁼", + "(": "⁽", + ")": "⁾", + "n": "ⁿ", + "i": "ⁱ", + } + + # Map for subscript characters + subscript_map = { + "0": "₀", + "1": "₁", + "2": "₂", + "3": "₃", + "4": "₄", + "5": "₅", + "6": "₆", + "7": "₇", + "8": "₈", + "9": "₉", + "+": "₊", + "-": "₋", + "=": "₌", + "(": "₍", + ")": "₎", + "a": "ₐ", + "e": "ₑ", + "o": "ₒ", + "x": "ₓ", + "h": "ₕ", + "k": "ₖ", + "l": "ₗ", + "m": "ₘ", + "n": "ₙ", + "p": "ₚ", + "s": "ₛ", + "t": "ₜ", + } + + # Process all superscript tags + for sup in element.find_all("sup"): + sup_text = sup.get_text() + unicode_text = "" + for char in sup_text: + unicode_text += superscript_map.get(char, char) + sup.replace_with(unicode_text) + + # Process all subscript tags + for sub in element.find_all("sub"): + sub_text = sub.get_text() + unicode_text = "" + for char in sub_text: + unicode_text += subscript_map.get(char, char) + sub.replace_with(unicode_text) + + return element + tests = [] pdf_filename = f"{pdf_id}_page{page_num}.pdf" soup = BeautifulSoup(html_content, "html.parser") @@ -285,13 +359,13 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb for div in soup.find_all("div", class_="page-footer"): div.name = "footer" - + # Remove elements in the body that appear before the header or after the footer - body = soup.find('body') + body = soup.find("body") if body: - header = soup.find('header') - footer = soup.find('footer') - + header = 
soup.find("header") + footer = soup.find("footer") + if header: # Remove elements before the header current = body.contents[0] @@ -299,7 +373,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb next_elem = current.next_sibling current.extract() current = next_elem - + if footer: # Remove elements after the footer current = footer.next_sibling @@ -317,6 +391,9 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb def create_absence_tests_from_elements(parent_element, element_type): mini_soup = BeautifulSoup(str(parent_element), "html.parser") + # Convert superscripts and subscripts in the mini soup + convert_superscripts_subscripts(mini_soup) + # Remove headers, footers, and tables from the main_soup for element in mini_soup.find_all(["h1", "h2"]): element.extract() @@ -370,7 +447,11 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb # Create TextAbsenceTests for page numbers for page_number in page_numbers: - page_number_text = page_number.get_text().strip() + # Convert any superscripts/subscripts in the page number + page_number_soup = BeautifulSoup(str(page_number), "html.parser") + convert_superscripts_subscripts(page_number_soup) + page_number_text = page_number_soup.get_text().strip() + if page_number_text: tests.append( { @@ -384,7 +465,14 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb ) # Step 2: Generate tests from tables using parse_html_tables - table_data_list = parse_html_tables(html_content) + # Convert superscripts and subscripts to Unicode equivalents in tables + table_soup = BeautifulSoup(html_content, "html.parser") + + # Convert superscripts and subscripts in the table HTML + convert_superscripts_subscripts(table_soup) + html_content_with_unicode = str(table_soup) + + table_data_list = parse_html_tables(html_content_with_unicode) for table_idx, table_data in enumerate(table_data_list): # Get the table data as a numpy array @@ -455,7 +543,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb # Check if current cell is a heading cell is_header_cell = row_idx in table_data.header_rows or col_idx in table_data.header_cols - + # Check for top heading using header information (skip if current cell is a heading) if not is_header_cell and col_idx in table_data.col_headers: # Get the headers for this column @@ -505,7 +593,6 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb else: # Shouldn't happen, but handle it gracefully passed = False - explanation = f"Table index {table_idx} out of range, only {len(tables)} tables found" # Only add tests that pass if passed: @@ -522,6 +609,9 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb for element in main_soup.find_all(["header", "footer", "table", "head"]): element.extract() + # Convert superscripts and subscripts in the main soup + convert_superscripts_subscripts(main_soup) + full_text = main_soup.get_text().strip() sentences = [] @@ -578,22 +668,32 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb # If they do, filter them out tests = [t for t in tests if t["type"] != "absent" or t["text"] not in full_text] - # Remove any tests where text-based fields have no alphanumeric characters or contain LaTeX + # Remove any tests where text-based fields have no alphanumeric characters, contain LaTeX, or contain Unicode super/subscripts text_fields = ["text", "cell", "before", 
"after", "up", "down", "left", "right", "top_heading", "left_heading"] - + def contains_alphanumeric(value): return any(c.isalnum() for c in value) if isinstance(value, str) else False - + def contains_latex(value): if not isinstance(value, str): return False # Check for LaTeX delimiters - latex_patterns = [r'\(', r'\)', r'\[', r'\]'] + latex_patterns = [r"\(", r"\)", r"\[", r"\]"] return any(pattern in value for pattern in latex_patterns) - + + def contains_unicode_super_or_subscripts(value): + if not isinstance(value, str): + return False + + # Unicode ranges for superscripts and subscripts + superscript_chars = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿⁱ" + subscript_chars = "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₕₖₗₘₙₚₛₜ" + + return any(c in superscript_chars or c in subscript_chars for c in value) + filtered_tests = [] for test in tests: - # Check all text fields in the test for alphanumeric content and LaTeX + # Check all text fields in the test for alphanumeric content, LaTeX, and Unicode super/subscripts all_valid = True for field in text_fields: if field in test: @@ -605,28 +705,32 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb if contains_latex(test[field]): all_valid = False break + # Skip test if field contains Unicode super or subscripts + if contains_unicode_super_or_subscripts(test[field]): + all_valid = False + break if all_valid: filtered_tests.append(test) - + tests = filtered_tests - + # Remove duplicate tests (identical on everything but the id field) unique_tests = [] test_signatures = set() - + for test in tests: # Create a signature for the test by using all fields except 'id' test_dict = test.copy() - test_id = test_dict.pop('id') - + test_dict.pop("id") + # Convert dict to a sorted tuple of items for hashability test_signature = tuple(sorted((k, str(v)) for k, v in test_dict.items())) - + # Only add the test if we haven't seen an identical one if test_signature not in test_signatures: test_signatures.add(test_signature) unique_tests.append(test) - + return unique_tests diff --git a/olmocr/bench/synth/test_mine.py b/olmocr/bench/synth/test_mine.py index 2e96f4e..1b30d77 100644 --- a/olmocr/bench/synth/test_mine.py +++ b/olmocr/bench/synth/test_mine.py @@ -853,3 +853,56 @@ class TestMineTests(unittest.TestCase): for test in tests: print(test) + + def test_katex_autorender(self): + """Test that KaTeX math expressions are properly auto-rendered when using the render_pdf_with_playwright function.""" + import asyncio + import os + import tempfile + + from ..synth.mine_html_templates import render_pdf_with_playwright + + # Create HTML with LaTeX expressions + html_content = """ + + + + + KaTeX Auto-Render Test + + +

Math Expressions Test

+ +

Inline math expression: \(E = mc^2\)

+ +

Block math expression:

+

\[ + \\frac{d}{dx}(x^n) = nx^{n-1} + \]

+ +

Another complex equation:

+

\[ + \int_{a}^{b} f(x) \, dx = F(b) - F(a) + \]

+ + + """ + + # Create a temporary file to store the rendered PDF + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: + output_pdf_path = tmp_file.name + + # Render the HTML to PDF + render_success = asyncio.run(render_pdf_with_playwright(html_content=html_content, output_pdf_path=output_pdf_path, png_width=800, png_height=600)) + + # Check if rendering was successful + self.assertTrue(render_success, "PDF rendering should succeed") + + # Verify PDF was created and has content + self.assertTrue(os.path.exists(output_pdf_path), "PDF file should exist") + self.assertTrue(os.path.getsize(output_pdf_path) > 0, "PDF file should have content") + + # The actual validation of KaTeX rendering would require visual inspection or text extraction, + # but at minimum we can verify the file was created successfully + + print(output_pdf_path) diff --git a/olmocr/bench/test_tests.py b/olmocr/bench/test_tests.py index f53b97b..d5f89e3 100644 --- a/olmocr/bench/test_tests.py +++ b/olmocr/bench/test_tests.py @@ -322,7 +322,7 @@ class TestTableTest(unittest.TestCase): def test_parse_markdown_tables(self): """Test markdown table parsing""" - test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2") + _test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2") tables = parse_markdown_tables(self.markdown_table) self.assertEqual(len(tables), 1) self.assertEqual(tables[0].data.shape, (3, 3)) # 3 rows, 3 columns @@ -332,7 +332,7 @@ class TestTableTest(unittest.TestCase): def test_parse_html_tables(self): """Test HTML table parsing""" - test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2") + _test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Cell A2") tables = parse_html_tables(self.html_table) self.assertEqual(len(tables), 1) self.assertEqual(tables[0].data.shape, (3, 3)) # 3 rows, 3 columns
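The core technique in the first and third commits is to inject the locally bundled KaTeX assets into the Playwright page after set_content and then call renderMathInElement directly, because DOMContentLoaded has already fired by the time the scripts are added (which is why the third commit drops the event listener and switches \( \) to display: false). Below is a minimal standalone sketch of that flow; the helper name, the bare chromium launch, and the hard-coded katex directory are illustrative only, not the repo's exact render_pdf_with_playwright code.

import asyncio
import os

from playwright.async_api import async_playwright

KATEX_DIR = "olmocr/bench/katex"  # katex.min.css, katex.min.js, auto-render.min.js live here


async def render_html_with_katex(html: str, out_pdf: str) -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.set_content(html)

        # Inject the KaTeX stylesheet, core library, and the auto-render extension.
        await page.add_style_tag(path=os.path.join(KATEX_DIR, "katex.min.css"))
        await page.add_script_tag(path=os.path.join(KATEX_DIR, "katex.min.js"))
        await page.add_script_tag(path=os.path.join(KATEX_DIR, "auto-render.min.js"))

        # Run the auto-renderer immediately; DOMContentLoaded already fired during set_content.
        await page.evaluate(
            r"""
            renderMathInElement(document.body, {
                delimiters: [
                    {left: '\\(', right: '\\)', display: false},
                    {left: '\\[', right: '\\]', display: true}
                ],
                throwOnError: false
            });
            """
        )

        # PDF export is only supported in Chromium.
        await page.pdf(path=out_pdf)
        await browser.close()


asyncio.run(render_html_with_katex(r"<p>Inline math: \(E = mc^2\)</p>", "katex_demo.pdf"))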
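Alongside the KaTeX work, the first commit trims the parsed page before any tests are mined: direct children of <body> that come before the <header> or after the <footer> are dropped, so stray markup outside the page frame cannot leak into tests. Roughly, written as a helper (the patch inlines this logic in generate_tests_from_html; the function name here is hypothetical):

from bs4 import BeautifulSoup


def trim_outside_page_frame(soup: BeautifulSoup) -> None:
    """Drop top-level body content that sits before the header or after the footer."""
    body = soup.find("body")
    if not body:
        return
    header = soup.find("header")
    footer = soup.find("footer")

    if header:
        current = body.contents[0] if body.contents else None
        while current is not None and current is not header:
            nxt = current.next_sibling
            current.extract()  # remove the element (or stray text node) preceding the header
            current = nxt

    if footer:
        current = footer.next_sibling
        while current is not None:
            nxt = current.next_sibling
            current.extract()  # remove everything trailing the footer
            current = nxt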
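The third commit normalizes <sup>/<sub> markup to Unicode super/subscript characters before text is pulled from headers, footers, page numbers, and tables, so the generated tests compare against what a reader of the rendered PDF actually sees (for example the footnote calls in the test_sup fixture). The patch builds the two character maps as dicts and loops per character; a condensed equivalent using str.translate:

from bs4 import BeautifulSoup

SUP_MAP = str.maketrans("0123456789+-=()ni", "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿⁱ")
SUB_MAP = str.maketrans("0123456789+-=()aeoxhklmnpst", "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₕₖₗₘₙₚₛₜ")


def convert_superscripts_subscripts(soup: BeautifulSoup) -> BeautifulSoup:
    """Replace <sup>/<sub> tags in place with their Unicode equivalents."""
    for sup in soup.find_all("sup"):
        sup.replace_with(sup.get_text().translate(SUP_MAP))
    for sub in soup.find_all("sub"):
        sub.replace_with(sub.get_text().translate(SUB_MAP))
    return soup


doc = BeautifulSoup("<p>x<sup>2</sup> and H<sub>2</sub>O</p>", "html.parser")
print(convert_superscripts_subscripts(doc).get_text())  # x² and H₂O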
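After mining, the second and third commits filter the candidate tests field by field: every text-bearing field must contain at least one alphanumeric character, must not still carry raw KaTeX delimiters, and must not contain the Unicode super/subscripts produced by the conversion above. A sketch of that check as a single predicate (the patch inlines three small helper functions instead of using one like this):

TEXT_FIELDS = ("text", "cell", "before", "after", "up", "down",
               "left", "right", "top_heading", "left_heading")
LATEX_DELIMITERS = (r"\(", r"\)", r"\[", r"\]")
SUPER_SUB_CHARS = set("⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿⁱ" "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₕₖₗₘₙₚₛₜ")


def is_valid_test(test: dict) -> bool:
    """Reject tests whose text fields lack alphanumerics, contain LaTeX delimiters,
    or contain Unicode super/subscript characters."""
    for field in TEXT_FIELDS:
        if field not in test:
            continue
        value = test[field]
        if not any(c.isalnum() for c in value):
            return False
        if any(delim in value for delim in LATEX_DELIMITERS):
            return False
        if any(c in SUPER_SUB_CHARS for c in value):
            return False
    return True


# Applied the same way the patch does:
# tests = [t for t in tests if is_valid_test(t)]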
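Finally, the second commit drops duplicate tests that differ only in their generated id by building an order-independent, hashable signature from the remaining fields. A minimal equivalent of that deduplication step:

def dedupe_tests(tests: list) -> list:
    """Keep only the first test among any group identical in every field except 'id'."""
    unique, seen = [], set()
    for test in tests:
        # Stringify values and sort the (key, value) pairs so dict order cannot affect the signature.
        signature = tuple(sorted((k, str(v)) for k, v in test.items() if k != "id"))
        if signature not in seen:
            seen.add(signature)
            unique.append(test)
    return unique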