Organizing things for data entry

Jake Poznanski 2025-02-28 14:58:29 -08:00
parent af02c63531
commit 9f12917e10
3 changed files with 232 additions and 135 deletions

View File

@@ -1,5 +1,6 @@
 import os
 import re
+import time
 import argparse
 from difflib import SequenceMatcher
 from collections import Counter
@@ -8,7 +9,6 @@ import syntok.segmenter as segmenter
 import syntok.tokenizer as tokenizer
 import base64
-import os
 from google import genai
 from google.genai import types
@@ -18,6 +18,8 @@ from olmocr.bench.tests import TextPresenceTest, save_tests
 LABEL_WIDTH = 8  # fixed width for printing labels

 # Uses a gemini prompt to get the most likely clean sentence from a pdf page
+last_gemini_call = time.perf_counter()
 def clean_base_sentence(pdf_path: str, page_num: int, base_sentence: str) -> str:
     client = genai.Client(
         api_key=os.environ.get("GEMINI_API_KEY"),
@@ -58,8 +60,19 @@ Consider the sentence labeled "Base" above in the document image attached. What
         contents=contents,
         config=generate_content_config,
     )
-    result = response.candidates[0].content.parts[0].text
-    return result
+
+    # Basic rate limitting
+    global last_gemini_call
+    if time.perf_counter() - last_gemini_call < 6:
+        time.sleep(6 - (time.perf_counter() - last_gemini_call))
+    last_gemini_call = time.perf_counter()
+
+    # Return response
+    if response is not None and response.candidates is not None and len(response.candidates) > 0:
+        return response.candidates[0].content.parts[0].text
+    else:
+        return None

 def parse_sentences(text: str) -> list[str]:
@@ -111,11 +124,9 @@ def compare_votes_for_file(base_pdf_file: str, base_pdf_page: int, base_text: st
                 best_ratio = ratio
                 best_candidate = c_sentence  # Keep original capitalization for output

-        best_candidate = best_candidate.strip()
-
         # Append the candidate if it passes the similarity threshold (e.g., 0.7)
         if best_ratio > 0.7 and best_candidate is not None:
-            votes.append(best_candidate)
+            votes.append(best_candidate.strip())

         # Only consider variants that differ when compared case-insensitively
         variant_votes = [vote for vote in votes if vote.lower() != b_sentence.lower()]
@@ -175,7 +186,7 @@ def main():
     parser.add_argument(
         "--max-diffs",
         type=int,
-        default=3,
+        default=4,
         help="Maximum number of diffs to display per file."
     )
     parser.add_argument(
@@ -215,10 +226,9 @@ def main():
         all_tests.extend(tests)
         print("")

+        # Output test candidates for review after each file, in case there are errors
+        save_tests(all_tests, args.output)
         break

-    # Output test candidates for review
-    save_tests(all_tests, args.output)

 if __name__ == "__main__":
     main()
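Note: the rate limiting added to clean_base_sentence above just remembers when the previous Gemini call finished and sleeps until at least six seconds have elapsed. A minimal standalone sketch of that pattern (the throttle helper and its module-level state are illustrative, not part of the commit):

import time

MIN_INTERVAL = 6.0  # minimum seconds between calls, matching the value used in the diff
_last_call = 0.0    # timestamp of the most recent call

def throttle() -> None:
    """Block just long enough that successive calls are at least MIN_INTERVAL apart."""
    global _last_call
    elapsed = time.perf_counter() - _last_call
    if elapsed < MIN_INTERVAL:
        time.sleep(MIN_INTERVAL - elapsed)
    _last_call = time.perf_counter()

# Usage: call throttle() immediately before each Gemini request.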

View File

@@ -1,7 +1,7 @@
-from dataclasses import dataclass
-from typing import Tuple
 import json
+from dataclasses import dataclass, asdict
 from enum import Enum
+from typing import List, Tuple, Optional
 from fuzzysearch import find_near_matches
 from rapidfuzz import fuzz
@@ -12,189 +12,189 @@ class TestType(str, Enum):
     ABSENT = "absent"
     ORDER = "order"

+class TestChecked(str, Enum):
+    VERIFIED = "verified"
+    REJECTED = "rejected"

 class ValidationError(Exception):
-    """Exception raised for validation errors"""
+    """Exception raised for validation errors."""
     pass

-@dataclass
+@dataclass(kw_only=True)
 class BasePDFTest:
-    """Base class for all PDF test types"""
+    """
+    Base class for all PDF test types.
+
+    Attributes:
+        pdf: The PDF filename.
+        page: The page number for the test.
+        id: Unique identifier for the test.
+        type: The type of test.
+        threshold: A float between 0 and 1 representing the threshold for fuzzy matching.
+    """
     pdf: str
     page: int
     id: str
     type: str
-    threshold: float
+    threshold: float = 1.0
+    checked: Optional[TestChecked] = None

     def __post_init__(self):
-        # Validate common fields
         if not self.pdf:
             raise ValidationError("PDF filename cannot be empty")
         if not self.id:
             raise ValidationError("Test ID cannot be empty")
         if not isinstance(self.threshold, float) or not (0 <= self.threshold <= 1):
             raise ValidationError(f"Threshold must be a float between 0 and 1, got {self.threshold}")
-        # Check that type is valid
-        if self.type not in [t.value for t in TestType]:
+        if self.type not in {t.value for t in TestType}:
             raise ValidationError(f"Invalid test type: {self.type}")

     def run(self, md_content: str) -> Tuple[bool, str]:
         """
-        Run the test on the content of the provided .md file.
-        Returns a tuple (passed, explanation) where 'passed' is True if the test passes,
-        and 'explanation' is a short message explaining the failure when the test does not pass.
+        Run the test on the provided markdown content.
+
+        Args:
+            md_content: The content of the .md file.
+
+        Returns:
+            A tuple (passed, explanation) where 'passed' is True if the test passes,
+            and 'explanation' provides details when the test fails.
         """
-        raise NotImplementedError("Subclasses must implement run method")
+        raise NotImplementedError("Subclasses must implement the run method")
 @dataclass
 class TextPresenceTest(BasePDFTest):
-    """Test for text presence or absence in a PDF"""
+    """
+    Test to verify the presence or absence of specific text in a PDF.
+
+    Attributes:
+        text: The text string to search for.
+    """
     text: str

     def __post_init__(self):
         super().__post_init__()
-        # Additional validation for this specific test type
-        if self.type not in [TestType.PRESENT.value, TestType.ABSENT.value]:
+        if self.type not in {TestType.PRESENT.value, TestType.ABSENT.value}:
             raise ValidationError(f"Invalid type for TextPresenceTest: {self.type}")
         if not self.text.strip():
             raise ValidationError("Text field cannot be empty")

     def run(self, md_content: str) -> Tuple[bool, str]:
         reference_query = self.text
         threshold = self.threshold
         best_ratio = fuzz.partial_ratio(reference_query, md_content) / 100.0

         if self.type == TestType.PRESENT.value:
             if best_ratio >= threshold:
-                return (True, "")
+                return True, ""
             else:
-                return (False, f"Expected '{reference_query[:40]}...' with threshold {threshold} but best match ratio was {best_ratio:.3f}")
+                msg = (
+                    f"Expected '{reference_query[:40]}...' with threshold {threshold} "
+                    f"but best match ratio was {best_ratio:.3f}"
+                )
+                return False, msg
-        else:  # absent
+        else:  # ABSENT
             if best_ratio < threshold:
-                return (True, "")
+                return True, ""
             else:
-                return (False, f"Expected absence of '{reference_query[:40]}...' with threshold {threshold} but best match ratio was {best_ratio:.3f}")
+                msg = (
+                    f"Expected absence of '{reference_query[:40]}...' with threshold {threshold} "
+                    f"but best match ratio was {best_ratio:.3f}"
+                )
+                return False, msg
 @dataclass
 class TextOrderTest(BasePDFTest):
-    """Test for text order in a PDF"""
+    """
+    Test to verify that one text appears before another in a PDF.
+
+    Attributes:
+        before: The text expected to appear first.
+        after: The text expected to appear after the 'before' text.
+    """
     before: str
     after: str

     def __post_init__(self):
         super().__post_init__()
-        # Additional validation for this specific test type
         if self.type != TestType.ORDER.value:
             raise ValidationError(f"Invalid type for TextOrderTest: {self.type}")
         if not self.before.strip():
             raise ValidationError("Before field cannot be empty")
         if not self.after.strip():
             raise ValidationError("After field cannot be empty")

     def run(self, md_content: str) -> Tuple[bool, str]:
-        before = self.before
-        after = self.after
         threshold = self.threshold
-        max_l_dist = round((1.0 - threshold) * len(before))
-        before_matches = find_near_matches(before, md_content, max_l_dist=max_l_dist)
-        after_matches = find_near_matches(after, md_content, max_l_dist=max_l_dist)
+        max_l_dist = round((1.0 - threshold) * len(self.before))
+        before_matches = find_near_matches(self.before, md_content, max_l_dist=max_l_dist)
+        after_matches = find_near_matches(self.after, md_content, max_l_dist=max_l_dist)

         if not before_matches:
-            return (False, f"'before' search text '{before[:40]}...' not found with max_l_dist {max_l_dist}")
+            return False, f"'before' text '{self.before[:40]}...' not found with max_l_dist {max_l_dist}"
         if not after_matches:
-            return (False, f"'after' search text '{after[:40]}...' not found with max_l_dist {max_l_dist}")
+            return False, f"'after' text '{self.after[:40]}...' not found with max_l_dist {max_l_dist}"

         for before_match in before_matches:
             for after_match in after_matches:
                 if before_match.start < after_match.start:
-                    return (True, "")
+                    return True, ""

-        return (False, f"Could not find a location where '{before[:40]}...' appears before '{after[:40]}...'.")
+        return False, (
+            f"Could not find a location where '{self.before[:40]}...' appears before "
+            f"'{self.after[:40]}...'."
+        )
-def load_tests(jsonl_file: str) -> list[BasePDFTest]:
-    """Load tests from a JSONL file"""
-    tests = []
-    with open(jsonl_file, 'r') as file:
-        for line_number, line in enumerate(file, 1):
+def load_tests(jsonl_file: str) -> List[BasePDFTest]:
+    """
+    Load tests from a JSONL file.
+
+    Args:
+        jsonl_file: Path to the JSONL file containing test definitions.
+
+    Returns:
+        A list of test objects.
+    """
+    tests: List[BasePDFTest] = []
+    with open(jsonl_file, "r") as file:
+        for line_number, line in enumerate(file, start=1):
             line = line.strip()
-            if not line:  # Skip empty lines
+            if not line:
                 continue
             try:
-                # Parse the JSON object
                 data = json.loads(line)
+                test_type = data.get("type")

-                # Based on the type field, create the appropriate test object
-                if data["type"] in [TestType.PRESENT.value, TestType.ABSENT.value]:
-                    test = TextPresenceTest(
-                        pdf=data["pdf"],
-                        page=data["page"],
-                        id=data["id"],
-                        type=data["type"],
-                        threshold=data["threshold"],
-                        text=data["text"]
-                    )
-                elif data["type"] == TestType.ORDER.value:
-                    test = TextOrderTest(
-                        pdf=data["pdf"],
-                        page=data["page"],
-                        id=data["id"],
-                        type=data["type"],
-                        threshold=data["threshold"],
-                        before=data["before"],
-                        after=data["after"]
-                    )
+                if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}:
+                    test = TextPresenceTest(**data)
+                elif test_type == TestType.ORDER.value:
+                    test = TextOrderTest(**data)
                 else:
-                    raise ValidationError(f"Unknown test type: {data['type']}")
+                    raise ValidationError(f"Unknown test type: {test_type}")
                 tests.append(test)
             except json.JSONDecodeError as e:
                 print(f"Error parsing JSON on line {line_number}: {e}")
-            except ValidationError as e:
-                print(f"Validation error on line {line_number}: {e}")
-            except KeyError as e:
-                print(f"Missing required field on line {line_number}: {e}")
+            except (ValidationError, KeyError) as e:
+                print(f"Error on line {line_number}: {e}")
             except Exception as e:
                 print(f"Unexpected error on line {line_number}: {e}")
     return tests
-def save_tests(tests: list[BasePDFTest], jsonl_file: str) -> None:
-    """Save tests to a JSONL file"""
-    with open(jsonl_file, 'w') as file:
+def save_tests(tests: List[BasePDFTest], jsonl_file: str) -> None:
+    """
+    Save tests to a JSONL file using asdict for conversion.
+
+    Args:
+        tests: A list of test objects.
+        jsonl_file: Path to the output JSONL file.
+    """
+    with open(jsonl_file, "w") as file:
         for test in tests:
-            # Convert dataclass to dict
-            if isinstance(test, TextPresenceTest):
-                data = {
-                    "pdf": test.pdf,
-                    "id": test.id,
-                    "type": test.type,
-                    "threshold": test.threshold,
-                    "text": test.text
-                }
-            elif isinstance(test, TextOrderTest):
-                data = {
-                    "pdf": test.pdf,
-                    "id": test.id,
-                    "type": test.type,
-                    "threshold": test.threshold,
-                    "before": test.before,
-                    "after": test.after
-                }
-            file.write(json.dumps(data) + '\n')
+            file.write(json.dumps(asdict(test)) + "\n")
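Note: a rough usage sketch of the refactored test classes and JSONL helpers above. The filename and values are made up; because BasePDFTest is now declared with kw_only=True, fields are passed as keyword arguments, and save_tests serializes every field (including the new checked flag) via asdict:

from olmocr.bench.tests import TextPresenceTest, load_tests, save_tests

# Hypothetical test: require a sentence to be present on page 1 of example.pdf
test = TextPresenceTest(
    pdf="example.pdf",
    page=1,
    id="example_0001",
    type="present",
    threshold=0.9,
    text="The quick brown fox",
)

passed, explanation = test.run("The quick brown fox jumps over the lazy dog")
print(passed, explanation)  # True ""

# Round trip through JSONL: one json.dumps(asdict(test)) line per test
save_tests([test], "tests.jsonl")
reloaded = load_tests("tests.jsonl")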

View File

@@ -2,10 +2,14 @@
 import json
 import sys
 import os
+import re
 import argparse
+import requests
 from collections import defaultdict
+from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit
 from olmocr.data.renderpdf import render_pdf_to_base64png

 def parse_rules_file(file_path):
@@ -31,6 +35,7 @@ def parse_rules_file(file_path):
     return pdf_rules

 def get_rule_html(rule, rule_index):
     """Generate HTML representation for a rule with interactive elements."""
     rule_type = rule.get('type', 'unknown')
@@ -38,7 +43,6 @@ def get_rule_html(rule, rule_index):
     # Determine status button class based on 'checked' value
     checked_status = rule.get('checked')
-    # We won't set active class here; it'll be updated by JS upon interaction.
     thumbs_up_class = "active" if checked_status == "verified" else ""
     thumbs_down_class = "active" if checked_status == "rejected" else ""
@@ -121,6 +125,7 @@ def get_rule_html(rule, rule_index):
     </tr>
     """

 def generate_html(pdf_rules, rules_file_path):
     """Generate the HTML page with PDF renderings and interactive rules."""
     # Limit to 10 unique PDFs
@@ -380,28 +385,24 @@ def generate_html(pdf_rules, rules_file_path):
         </div>
     """

-    # Add JavaScript to manage interactivity
+    # Add JavaScript to manage interactivity and datastore integration
     html += f"""
     </div>
     <script>
-        // Store all rules data
+        // Store all rules data (initially injected from the JSON file)
         let rulesData = {rules_json};

         // Function to toggle status button
         function toggleStatus(button) {{
-            // Find the closest rule row which holds the rule index
             const ruleRow = button.closest('.rule-row');
             const ruleIndex = parseInt(ruleRow.dataset.ruleIndex);
-            // Determine which action was clicked (either 'verified' or 'rejected')
             const action = button.dataset.action;

-            // Toggle the rule's checked state: if already in that state, set to null; otherwise, set to the clicked action.
             const currentState = rulesData[ruleIndex].checked;
             const newState = (currentState === action) ? null : action;
             rulesData[ruleIndex].checked = newState;

-            // Update the UI: adjust active classes on buttons in this row
+            // Update UI for status buttons
             const buttons = ruleRow.querySelectorAll('.status-button');
             buttons.forEach(btn => {{
                 if (btn.dataset.action === newState) {{
@@ -411,6 +412,8 @@ def generate_html(pdf_rules, rules_file_path):
                 }}
             }});

+            // Upload updated data to datastore
+            uploadRulesData();
             outputJSON();
         }}
@@ -421,10 +424,11 @@ def generate_html(pdf_rules, rules_file_path):
             const field = element.dataset.field;
             const newText = element.innerText.trim();

-            // Update rules data
+            // Update the rules data
             rulesData[ruleIndex][field] = newText;

-            // Output updated JSONL to console
+            // Upload updated data to datastore
+            uploadRulesData();
             outputJSON();
         }}
@@ -437,8 +441,53 @@ def generate_html(pdf_rules, rules_file_path):
             }});
         }}

-        // Output initial JSONL when page loads
-        document.addEventListener('DOMContentLoaded', outputJSON);
+        // Function to upload rulesData to datastore using putDatastore
+        async function uploadRulesData() {{
+            try {{
+                await putDatastore(rulesData);
+                console.log("Datastore updated successfully");
+            }} catch (error) {{
+                console.error("Failed to update datastore", error);
+            }}
+        }}
+
+        // Function to update UI from rulesData (used after fetching datastore state)
+        function updateUIFromRulesData() {{
+            document.querySelectorAll('.rule-row').forEach(ruleRow => {{
+                const ruleIndex = parseInt(ruleRow.dataset.ruleIndex);
+                const rule = rulesData[ruleIndex];
+
+                // Update status buttons
+                const buttons = ruleRow.querySelectorAll('.status-button');
+                buttons.forEach(btn => {{
+                    if (btn.dataset.action === rule.checked) {{
+                        btn.classList.add('active');
+                    }} else {{
+                        btn.classList.remove('active');
+                    }}
+                }});
+
+                // Update editable text fields
+                ruleRow.querySelectorAll('.editable-text').forEach(div => {{
+                    const field = div.dataset.field;
+                    if (rule[field] !== undefined) {{
+                        div.innerText = rule[field];
+                    }}
+                }});
+            }});
+        }}
+
+        // On page load, fetch data from the datastore and update UI accordingly
+        document.addEventListener('DOMContentLoaded', async function() {{
+            try {{
+                const datastoreState = await fetchDatastore();
+                if (datastoreState.length) {{
+                    rulesData = datastoreState;
+                    updateUIFromRulesData();
+                    outputJSON();
+                }}
+            }} catch (error) {{
+                console.error("Error fetching datastore", error);
+            }}
+        }});
     </script>
     </body>
     </html>
@@ -446,6 +495,30 @@ def generate_html(pdf_rules, rules_file_path):
     return html

+def get_page_datastore(html: str):
+    """
+    Fetch the JSON datastore from the presigned URL.
+    Returns a dict. If any error or no content, returns {}.
+    """
+    match = re.search(r"const presignedGetUrl = \"(.*?)\";", html)
+    if not match:
+        return None
+
+    presigned_url = match.group(1)
+
+    try:
+        # Clean up the presigned URL (sometimes the signature may need re-encoding)
+        url_parts = urlsplit(presigned_url)
+        query_params = parse_qs(url_parts.query)
+        encoded_query = urlencode(query_params, doseq=True)
+        cleaned_url = urlunsplit((url_parts.scheme, url_parts.netloc, url_parts.path, encoded_query, url_parts.fragment))
+
+        resp = requests.get(cleaned_url)
+        resp.raise_for_status()
+        return resp.json()
+    except Exception as e:
+        print(f"Error fetching datastore from {presigned_url}: {e}")
+        return None

 def main():
     parser = argparse.ArgumentParser(description='Generate an interactive HTML visualization of PDF rules.')
     parser.add_argument('rules_file', help='Path to the rules file (JSON lines format)')
@@ -459,8 +532,21 @@ def main():
     if os.path.exists(args.output):
         print(f"Output file {args.output} already exists, attempting to reload it's datastore")

+        with open(args.output, "r") as df:
+            datastore = get_page_datastore(df.read())
+
+        if datastore is None:
+            print(f"Datastore for {args.output} is empty, please run tinyhost and verify your rules and then rerun the script")
+            sys.exit(1)
+
+        print(f"Loaded {len(datastore)} entries from datastore, updating {args.rules_file}")
+
+        with open(args.rules_file, 'w') as of:
+            for rule in datastore:
+                of.write(json.dumps(rule) + "\n")
+
+        return

     pdf_rules = parse_rules_file(args.rules_file)
     html = generate_html(pdf_rules, args.rules_file)
@@ -469,5 +555,6 @@ def main():
     print(f"Interactive HTML visualization created: {args.output}")

 if __name__ == "__main__":
     main()
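Note: a small sketch of the presigned-URL cleanup performed by get_page_datastore above. The query string is parsed and re-encoded so signature parameters survive the round trip before the GET request; the URL below is a made-up example:

from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

presigned_url = "https://example.com/datastore.json?X-Signature=a%2Bb%3D&Expires=3600"  # hypothetical

url_parts = urlsplit(presigned_url)
query_params = parse_qs(url_parts.query)               # {'X-Signature': ['a+b='], 'Expires': ['3600']}
encoded_query = urlencode(query_params, doseq=True)    # re-encodes reserved characters consistently
cleaned_url = urlunsplit((url_parts.scheme, url_parts.netloc, url_parts.path, encoded_query, url_parts.fragment))
print(cleaned_url)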