diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py
index e27a55b..798de33 100644
--- a/olmocr/bench/synth/mine_html_templates.py
+++ b/olmocr/bench/synth/mine_html_templates.py
@@ -144,7 +144,7 @@ async def render_pdf_with_playwright(html_content, output_pdf_path, png_width, p
         bool: True if rendering was successful with exactly one page, False otherwise
     """
     scale_factors = [1.0, 0.9, 0.8, 0.7]  # Try these scale factors in order
-    
+
     for scale in scale_factors:
         try:
             async with async_playwright() as p:
@@ -162,7 +162,7 @@ async def render_pdf_with_playwright(html_content, output_pdf_path, png_width, p
                 )
 
                 await browser.close()
-                
+
                 # Check if the output PDF has exactly one page
                 try:
                     reader = pypdf.PdfReader(output_pdf_path)
@@ -175,11 +175,11 @@ async def render_pdf_with_playwright(html_content, output_pdf_path, png_width, p
                 except Exception as pdf_check_error:
                     print(f"Error checking PDF page count: {pdf_check_error}")
                     return False
-                    
+
         except Exception as e:
             print(f"Error rendering PDF with Playwright at scale {scale}: {str(e)}")
             # Try the next scale factor
-    
+
     print("Failed to render PDF as a single page with any scale factor")
     return False
 
@@ -209,35 +209,37 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int) -> L
     def create_absence_tests_from_elements(parent_element, element_type):
         # Find all text-containing elements within the parent
         text_elements = []
-        
+
         # First get direct text nodes within spans, divs, p, and heading tags
-        for tag in parent_element.find_all(['span', 'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
+        for tag in parent_element.find_all(["span", "div", "p", "h1", "h2", "h3", "h4", "h5", "h6"]):
             text = tag.get_text().strip()
             if text:
                 text_elements.append(text)
-        
+
         # If no elements found, use the parent's text as a fallback
         if not text_elements:
             parent_text = parent_element.get_text().strip()
             if parent_text:
                 text_elements.append(parent_text)
-        
+
         # Create tests for each text element
         for text in text_elements:
             if len(text) > 3:  # Only create tests for meaningful text
-                tests.append({
-                    "pdf": pdf_filename,
-                    "page": page_num,
-                    "id": f"{pdf_id}_{element_type}_{uuid.uuid4().hex[:8]}",
-                    "type": TestType.ABSENT.value,
-                    "text": text,
-                    "max_diffs": 5
-                })
-    
+                tests.append(
+                    {
+                        "pdf": pdf_filename,
+                        "page": page_num,
+                        "id": f"{pdf_id}_{element_type}_{uuid.uuid4().hex[:8]}",
+                        "type": TestType.ABSENT.value,
+                        "text": text,
+                        "max_diffs": 5,
+                    }
+                )
+
     # Create TextAbsenceTests for headers
     for header in headers:
         create_absence_tests_from_elements(header, "header")
-    
+
     # Create TextAbsenceTests for footers
     for footer in footers:
         create_absence_tests_from_elements(footer, "footer")
@@ -377,7 +379,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int) -> L
     if len(paragraphs) >= 2:
         # Extract all text from the main content
         all_text = " ".join([p.get_text().strip() for p in paragraphs])
-        
+
         # Use syntok to segment the text into sentences
         sentences = []
         for paragraph in process(all_text):
@@ -386,7 +388,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int) -> L
                 sentence_text = " ".join([token.value for token in sentence]).strip()
                 if sentence_text and len(sentence_text) > 10 and len(sentence_text) < 100:
                     sentences.append(sentence_text)
-        
+
         # Create TextOrderTests from pairs of sentences that are at least 3 sentences apart
         # to ensure they're from different parts of the document
         if len(sentences) >= 5:
@@ -395,19 +397,21 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int) -> L
                 # Get two random indices with sufficient distance between them
                 i = random.randint(0, len(sentences) - 4)
                 j = random.randint(i + 3, min(i + 10, len(sentences) - 1))
-                
+
                 first_sentence = sentences[i]
                 second_sentence = sentences[j]
-                
-                tests.append({
-                    "pdf": pdf_filename,
-                    "page": page_num,
-                    "id": f"{pdf_id}_order_{uuid.uuid4().hex[:8]}",
-                    "type": TestType.ORDER.value,
-                    "before": first_sentence,
-                    "after": second_sentence,
-                    "max_diffs": 10,
-                })
+
+                tests.append(
+                    {
+                        "pdf": pdf_filename,
+                        "page": page_num,
+                        "id": f"{pdf_id}_order_{uuid.uuid4().hex[:8]}",
+                        "type": TestType.ORDER.value,
+                        "before": first_sentence,
+                        "after": second_sentence,
+                        "max_diffs": 10,
+                    }
+                )
 
     return tests
 
@@ -487,7 +491,7 @@ def process_pdf(pdf_info, args, client):
 
                 # Run the async function in the synchronous context
                 render_success = asyncio.run(render_pdf_with_playwright(html_content, playwright_pdf_path, png_width, png_height))
-                
+
                 if render_success:
                     print(f"Successfully rendered with Playwright: {playwright_pdf_path}")
                 else:
@@ -501,7 +505,7 @@ def process_pdf(pdf_info, args, client):
                 print(f"Failed to render with Playwright: {e}")
                 playwright_pdf_path = None
                 render_success = False
-                
+
         # If playwright rendering failed and was required, return None to skip this test
         if not args.skip_playwright and not render_success:
             return None
diff --git a/scripts/scan_dolmadocs.py b/scripts/scan_dolmadocs.py
index 0bc9e62..af00a42 100644
--- a/scripts/scan_dolmadocs.py
+++ b/scripts/scan_dolmadocs.py
@@ -79,8 +79,8 @@ def list_result_files(s3_client, workspace_path):
         if "Contents" in page:
             all_files.extend([f"s3://{bucket}/{obj['Key']}" for obj in page["Contents"] if obj["Key"].endswith(".jsonl") or obj["Key"].endswith(".json")])
 
-        if len(all_files) > 1000:
-            break
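+        # NOTE: the previous early exit once more than 1000 files were found is
+        # disabled here, so the file list is no longer truncated at that point.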
 
     return all_files
 
@@ -243,14 +243,13 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
             }}
             
             .info-item h3 {{
-                font-size: 0.875rem;
+                font-size: 0.6rem;
                 color: var(--text-light);
                 margin-bottom: 0.25rem;
             }}
             
             .info-item p {{
-                font-size: 1rem;
-                font-weight: 500;
+                font-size: 0.6rem;
             }}
             
             .page-grid {{
@@ -317,16 +316,46 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
                 text-decoration: underline;
             }}
             
+            /* New button group styling for connected Yes/No buttons */
+            .btn-group {{
+                display: inline-flex;
+                margin-bottom: 0.5rem;
+            }}
+            
+            .btn-group .toggle-button {{
+                padding: 0.5rem 1rem;
+                border: 1px solid var(--border-color);
+                background-color: #f8fafc;
+                cursor: pointer;
+                margin: 0;
+                /* Remove individual border radius so we can set unified ones */
+                border-radius: 0;
+            }}
+            
+            .btn-group .toggle-button:first-child {{
+                border-right: none;
+                border-top-left-radius: 0.25rem;
+                border-bottom-left-radius: 0.25rem;
+            }}
+            
+            .btn-group .toggle-button:last-child {{
+                border-top-right-radius: 0.25rem;
+                border-bottom-right-radius: 0.25rem;
+            }}
+            
             .feedback {{
                 margin-top: 0.5rem;
                 padding: 0.5rem;
                 border-top: 1px solid var(--border-color);
             }}
             
-            .feedback label {{
-                margin-right: 1rem;
-                font-size: 0.875rem;
-                color: var(--text-light);
+            .feedback .toggle-group {{
+                margin-bottom: 0.5rem;
+            }}
+            
+            .toggle-button.active {{
+                background-color: var(--primary-color);
+                color: white;
             }}
             
             .feedback textarea {{
@@ -370,7 +399,43 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
         <div class="container">
             <header>
                 <h1>OLMOCR Random Samples</h1>
-                <p>A visual survey of randomly selected pages from processed documents</p>
+                <div style="display: flex; font-family: Arial, sans-serif; font-size: 14px; max-width: 1000px; margin: 0 auto;">
+                <div style="flex: 1; padding: 15px; background-color: #f5f7f9; border-radius: 8px; margin-right: 10px;">
+                    <h3 style="color: #2c3e50; margin-top: 0; border-bottom: 1px solid #ddd; padding-bottom: 8px;">PII Direct Identifiers</h3>
+                    <p style="font-style: italic; color: #555; margin-bottom: 15px;">Information that identifies a data subject without further context</p>
+                    <ul style="padding-left: 20px; line-height: 1.5; margin-top: 0;">
+                    <li>Names: Full names, first names, last names, nicknames, maiden names, birth names, aliases</li>
+                    <li>Addresses: Street addresses, postal codes, city, state, country</li>
+                    <li>Contact Information: Phone numbers, email addresses</li>
+                    <li>Government IDs: Social Security Numbers (SSNs), passport numbers, driver's license numbers, tax identification numbers</li>
+                    <li>Financial Information: Credit card numbers, bank account numbers, routing numbers</li>
+                    <li>Biometric Data: Fingerprints, retina scans, voice signatures, facial recognition data</li>
+                    <li>Date of Birth of data subject</li>
+                    <li>Place of Birth of data subject</li>
+                    <li>Gender of data subject</li>
+                    <li>Race of data subject</li>
+                    <li>Religion of data subject</li>
+                    </ul>
+                </div>
+                
+                <div style="flex: 1; padding: 15px; background-color: #f5f7f9; border-radius: 8px; margin-left: 10px;">
+                    <h3 style="color: #2c3e50; margin-top: 0; border-bottom: 1px solid #ddd; padding-bottom: 8px;">PII Indirect Identifiers</h3>
+                    <p style="font-style: italic; color: #555; margin-bottom: 15px;">Information that can be used to identify a data subject in context or in combination with other information</p>
+                    <ul style="padding-left: 20px; line-height: 1.5; margin-top: 0;">
+                    <li>IP Addresses</li>
+                    <li>Login IDs</li>
+                    <li>Geolocations</li>
+                    <li>Employment Information</li>
+                    <li>Education Information</li>
+                    <li>Medical Information</li>
+                    <li>Usernames</li>
+                    <li>Passwords</li>
+                    <li>Keys</li>
+                    <li>URLs</li>
+                    <li>Company Names</li>
+                    </ul>
+                </div>
+                </div>
             </header>
             
             <div class="info-bar">
@@ -415,7 +480,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
             # Render PDF to base64 webp
             base64_image = render_pdf_to_base64webp(temp_file_path, page_num, resolution)
 
-            # Add to HTML with feedback checkboxes and textarea.
+            # Add to HTML with the connected Yes/No button group.
             html_content += f"""
             <div class="page-container">
                 <div class="page-info">
@@ -423,14 +488,13 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
                     <p>Page {page_num}</p>
                     <p>{f'<a href="{presigned_url}" target="_blank">View Cached PDF</a>' if presigned_url else pdf_path}</p>
                     <div class="feedback" data-id="page-{i}">
-                        <label>
-                            <input type="checkbox" data-type="personal_info" onchange="saveFeedback(this)" />
-                            Personal information
-                        </label>
-                        <label>
-                            <input type="checkbox" data-type="cannot_read" onchange="saveFeedback(this)" />
-                            I cannot read this
-                        </label>
+                        <span class="btn-group">
+                            <button type="button" class="toggle-button personal-info" data-value="yes" onclick="togglePersonalInfo(this)">Yes PII</button>
+                            <button type="button" class="toggle-button personal-info" data-value="no" onclick="togglePersonalInfo(this)">No PII</button>
+                        </span>
+                        <span class="btn-group">
+                            <button type="button" class="toggle-button cannot-read" onclick="toggleCannotRead(this)">I cannot read this</button>
+                        </span>
                         <textarea placeholder="Describe any private PII in the document" onchange="saveFeedback(this)"></textarea>
                     </div>
                 </div>
@@ -451,14 +515,13 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
                     <p>Page {page_num}</p>
                     <p>{f'<a href="{presigned_url}" target="_blank">View Cached PDF</a>' if presigned_url else pdf_path}</p>
                     <div class="feedback" data-id="page-{i}">
-                        <label>
-                            <input type="checkbox" data-type="personal_info" onchange="saveFeedback(this)" />
-                            Personal information
-                        </label>
-                        <label>
-                            <input type="checkbox" data-type="cannot_read" onchange="saveFeedback(this)" />
-                            I cannot read this
-                        </label>
+                        <span class="btn-group">
+                            <button type="button" class="toggle-button personal-info" data-value="yes" onclick="togglePersonalInfo(this)">Yes PII</button>
+                            <button type="button" class="toggle-button personal-info" data-value="no" onclick="togglePersonalInfo(this)">No PII</button>
+                        </span>
+                        <span class="btn-group">
+                            <button type="button" class="toggle-button cannot-read" onclick="toggleCannotRead(this)">I cannot read this</button>
+                        </span>
                         <textarea placeholder="Describe any private PII in the document" onchange="saveFeedback(this)"></textarea>
                     </div>
                 </div>
@@ -475,43 +538,68 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
         <script>
             // Using externally injected async functions: fetchDatastore() and putDatastore()
 
-            async function saveFeedback(el) {
-                const feedbackDiv = el.closest('.feedback');
+            async function saveFeedback(source) {
+                const feedbackDiv = source.classList.contains('feedback') ? source : source.closest('.feedback');
                 const id = feedbackDiv.getAttribute('data-id');
-                const personalInfo = feedbackDiv.querySelector('input[data-type="personal_info"]').checked;
-                const cannotRead = feedbackDiv.querySelector('input[data-type="cannot_read"]').checked;
+                // Personal info answer: data-value ('yes' or 'no') of the active button, or null when neither is active
+                const personalButton = feedbackDiv.querySelector('button.personal-info.active');
+                const personalInfo = personalButton ? personalButton.getAttribute('data-value') : null;
+                // "I cannot read this" is an on/off toggle: true when the button has the active class, false if absent
+                const cannotReadButton = feedbackDiv.querySelector('button.cannot-read');
+                const cannotRead = cannotReadButton ? cannotReadButton.classList.contains('active') : false;
                 const piiDescription = feedbackDiv.querySelector('textarea').value;
 
-                // Retrieve the current datastore (or initialize as empty object)
                 const datastore = await fetchDatastore() || {};
-
-                // Update the datastore for this feedback div
                 datastore[id] = {
                     personalInfo: personalInfo,
                     cannotRead: cannotRead,
                     piiDescription: piiDescription
                 };
 
-                // Save the updated datastore back to S3
                 await putDatastore(datastore);
             }
 
+            function togglePersonalInfo(btn) {
+                const feedbackDiv = btn.closest('.feedback');
+                // Clear the active class from every personal info button in this feedback div
+                feedbackDiv.querySelectorAll('button.personal-info').forEach(function(b) {
+                    b.classList.remove('active');
+                });
+                // Activate only the clicked button, then persist (saveFeedback is async and is not awaited)
+                btn.classList.add('active');
+                saveFeedback(feedbackDiv);
+            }
+
+            function toggleCannotRead(btn) {
+                btn.classList.toggle('active');
+                const feedbackDiv = btn.closest('.feedback');
+                saveFeedback(feedbackDiv);
+            }
+
             document.addEventListener("DOMContentLoaded", async function() {
-                // Fetch the entire datastore on page load
                 const datastore = await fetchDatastore() || {};
 
-                // Populate each feedback div based on the saved datastore
                 document.querySelectorAll('.feedback').forEach(function(feedbackDiv) {
                     const id = feedbackDiv.getAttribute('data-id');
                     if (datastore[id]) {
                         const data = datastore[id];
-                        const personalCheckbox = feedbackDiv.querySelector('input[data-type="personal_info"]');
-                        const cannotReadCheckbox = feedbackDiv.querySelector('input[data-type="cannot_read"]');
-                        const textarea = feedbackDiv.querySelector('textarea');
-
-                        personalCheckbox.checked = data.personalInfo;
-                        cannotReadCheckbox.checked = data.cannotRead;
-                        textarea.value = data.piiDescription;
+                        // Restore the Yes/No selection; a legacy boolean true (old checkbox format) maps to 'yes'
+                        feedbackDiv.querySelectorAll('button.personal-info').forEach(function(btn) {
+                            if (btn.getAttribute('data-value') === (data.personalInfo === true ? 'yes' : data.personalInfo)) {
+                                btn.classList.add('active');
+                            } else {
+                                btn.classList.remove('active');
+                            }
+                        });
+                        // Set active state for "I cannot read this"
+                        const cannotReadButton = feedbackDiv.querySelector('button.cannot-read');
+                        if (data.cannotRead) {
+                            cannotReadButton.classList.add('active');
+                        } else {
+                            cannotReadButton.classList.remove('active');
+                        }
+                        // Set the textarea value
+                        feedbackDiv.querySelector('textarea').value = data.piiDescription;
                     }
                 });
             });