From 9f12917e10cfdb0f03a184e5ab8ed99e1c825311 Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Fri, 28 Feb 2025 14:58:29 -0800
Subject: [PATCH] Organizing things for data entry

---
 olmocr/bench/miners/mine_diffs.py |  30 ++--
 olmocr/bench/tests.py             | 220 +++++++++++++++---------------
 olmocr/bench/viewer.py            | 117 ++++++++++++++--
 3 files changed, 232 insertions(+), 135 deletions(-)

diff --git a/olmocr/bench/miners/mine_diffs.py b/olmocr/bench/miners/mine_diffs.py
index d66c054..ab693c3 100644
--- a/olmocr/bench/miners/mine_diffs.py
+++ b/olmocr/bench/miners/mine_diffs.py
@@ -1,5 +1,6 @@
 import os
 import re
+import time
 import argparse
 from difflib import SequenceMatcher
 from collections import Counter
@@ -8,7 +9,6 @@
 import syntok.segmenter as segmenter
 import syntok.tokenizer as tokenizer
 import base64
-import os
 
 from google import genai
 from google.genai import types
@@ -18,6 +18,8 @@ from olmocr.bench.tests import TextPresenceTest, save_tests
 LABEL_WIDTH = 8  # fixed width for printing labels
 
 # Uses a gemini prompt to get the most likely clean sentence from a pdf page
+last_gemini_call = time.perf_counter()
+
 def clean_base_sentence(pdf_path: str, page_num: int, base_sentence: str) -> str:
     client = genai.Client(
         api_key=os.environ.get("GEMINI_API_KEY"),
@@ -58,8 +60,19 @@ Consider the sentence labeled "Base" above in the document image attached. What
         contents=contents,
         config=generate_content_config,
     )
-    result = response.candidates[0].content.parts[0].text
-    return result
+
+    # Basic rate limiting
+    global last_gemini_call
+    if time.perf_counter() - last_gemini_call < 6:
+        time.sleep(6 - (time.perf_counter() - last_gemini_call))
+
+    last_gemini_call = time.perf_counter()
+
+    # Return the response text, if any
+    if response is not None and response.candidates is not None and len(response.candidates) > 0:
+        return response.candidates[0].content.parts[0].text
+    else:
+        return None
 
 
 def parse_sentences(text: str) -> list[str]:
@@ -111,11 +124,9 @@ def compare_votes_for_file(base_pdf_file: str, base_pdf_page: int, base_text: str
                     best_ratio = ratio
                     best_candidate = c_sentence  # Keep original capitalization for output
 
-        best_candidate = best_candidate.strip()
-
         # Append the candidate if it passes the similarity threshold (e.g., 0.7)
         if best_ratio > 0.7 and best_candidate is not None:
-            votes.append(best_candidate)
+            votes.append(best_candidate.strip())
 
     # Only consider variants that differ when compared case-insensitively
     variant_votes = [vote for vote in votes if vote.lower() != b_sentence.lower()]
@@ -175,7 +186,7 @@ def main():
     parser.add_argument(
         "--max-diffs",
         type=int,
-        default=3,
+        default=4,
         help="Maximum number of diffs to display per file."
     )
     parser.add_argument(
@@ -215,10 +226,9 @@
         all_tests.extend(tests)
         print("")
 
+        # Output test candidates for review after each file, in case there are errors
+        save_tests(all_tests, args.output)
         break
 
-    # Output test candidates for review
-    save_tests(all_tests, args.output)
-
 
 if __name__ == "__main__":
     main()

diff --git a/olmocr/bench/tests.py b/olmocr/bench/tests.py
index 284c83b..8597f66 100644
--- a/olmocr/bench/tests.py
+++ b/olmocr/bench/tests.py
@@ -1,7 +1,7 @@
-from dataclasses import dataclass
-from typing import Tuple
 import json
+from dataclasses import dataclass, asdict
 from enum import Enum
+from typing import List, Tuple, Optional
 
 from fuzzysearch import find_near_matches
 from rapidfuzz import fuzz
@@ -12,189 +12,189 @@ class TestType(str, Enum):
     ABSENT = "absent"
     ORDER = "order"
 
+class TestChecked(str, Enum):
+    VERIFIED = "verified"
+    REJECTED = "rejected"
+
 class ValidationError(Exception):
-    """Exception raised for validation errors"""
+    """Exception raised for validation errors."""
     pass
 
-@dataclass
+@dataclass(kw_only=True)
 class BasePDFTest:
-    """Base class for all PDF test types"""
+    """
+    Base class for all PDF test types.
+
+    Attributes:
+        pdf: The PDF filename.
+        page: The page number for the test.
+        id: Unique identifier for the test.
+        type: The type of test.
+        threshold: A float between 0 and 1 representing the threshold for fuzzy matching.
+    """
     pdf: str
     page: int
     id: str
     type: str
-    threshold: float
-
+    threshold: float = 1.0
+    checked: Optional[TestChecked] = None
+
     def __post_init__(self):
-        # Validate common fields
         if not self.pdf:
             raise ValidationError("PDF filename cannot be empty")
-
         if not self.id:
             raise ValidationError("Test ID cannot be empty")
-
         if not isinstance(self.threshold, float) or not (0 <= self.threshold <= 1):
             raise ValidationError(f"Threshold must be a float between 0 and 1, got {self.threshold}")
-
-        # Check that type is valid
-        if self.type not in [t.value for t in TestType]:
+        if self.type not in {t.value for t in TestType}:
             raise ValidationError(f"Invalid test type: {self.type}")
-
+
     def run(self, md_content: str) -> Tuple[bool, str]:
         """
-        Run the test on the content of the provided .md file.
-        Returns a tuple (passed, explanation) where 'passed' is True if the test passes,
-        and 'explanation' is a short message explaining the failure when the test does not pass.
+        Run the test on the provided markdown content.
+
+        Args:
+            md_content: The content of the .md file.
+
+        Returns:
+            A tuple (passed, explanation) where 'passed' is True if the test passes,
+            and 'explanation' provides details when the test fails.
         """
-        raise NotImplementedError("Subclasses must implement run method")
+        raise NotImplementedError("Subclasses must implement the run method")
 
 
 @dataclass
 class TextPresenceTest(BasePDFTest):
-    """Test for text presence or absence in a PDF"""
-    text: str
+    """
+    Test to verify the presence or absence of specific text in a PDF.
+
+    Attributes:
+        text: The text string to search for.
+ """ + text: str + def __post_init__(self): super().__post_init__() - - # Additional validation for this specific test type - if self.type not in [TestType.PRESENT.value, TestType.ABSENT.value]: + if self.type not in {TestType.PRESENT.value, TestType.ABSENT.value}: raise ValidationError(f"Invalid type for TextPresenceTest: {self.type}") - if not self.text.strip(): raise ValidationError("Text field cannot be empty") - + def run(self, md_content: str) -> Tuple[bool, str]: reference_query = self.text threshold = self.threshold best_ratio = fuzz.partial_ratio(reference_query, md_content) / 100.0 - + if self.type == TestType.PRESENT.value: if best_ratio >= threshold: - return (True, "") + return True, "" else: - return (False, f"Expected '{reference_query[:40]}...' with threshold {threshold} but best match ratio was {best_ratio:.3f}") - else: # absent + msg = ( + f"Expected '{reference_query[:40]}...' with threshold {threshold} " + f"but best match ratio was {best_ratio:.3f}" + ) + return False, msg + else: # ABSENT if best_ratio < threshold: - return (True, "") + return True, "" else: - return (False, f"Expected absence of '{reference_query[:40]}...' with threshold {threshold} but best match ratio was {best_ratio:.3f}") + msg = ( + f"Expected absence of '{reference_query[:40]}...' with threshold {threshold} " + f"but best match ratio was {best_ratio:.3f}" + ) + return False, msg @dataclass class TextOrderTest(BasePDFTest): - """Test for text order in a PDF""" + """ + Test to verify that one text appears before another in a PDF. + + Attributes: + before: The text expected to appear first. + after: The text expected to appear after the 'before' text. + """ before: str after: str - + def __post_init__(self): super().__post_init__() - - # Additional validation for this specific test type if self.type != TestType.ORDER.value: raise ValidationError(f"Invalid type for TextOrderTest: {self.type}") - if not self.before.strip(): raise ValidationError("Before field cannot be empty") - if not self.after.strip(): raise ValidationError("After field cannot be empty") - + def run(self, md_content: str) -> Tuple[bool, str]: - before = self.before - after = self.after threshold = self.threshold - max_l_dist = round((1.0 - threshold) * len(before)) - - before_matches = find_near_matches(before, md_content, max_l_dist=max_l_dist) - after_matches = find_near_matches(after, md_content, max_l_dist=max_l_dist) - + max_l_dist = round((1.0 - threshold) * len(self.before)) + before_matches = find_near_matches(self.before, md_content, max_l_dist=max_l_dist) + after_matches = find_near_matches(self.after, md_content, max_l_dist=max_l_dist) + if not before_matches: - return (False, f"'before' search text '{before[:40]}...' not found with max_l_dist {max_l_dist}") + return False, f"'before' text '{self.before[:40]}...' not found with max_l_dist {max_l_dist}" if not after_matches: - return (False, f"'after' search text '{after[:40]}...' not found with max_l_dist {max_l_dist}") - + return False, f"'after' text '{self.after[:40]}...' not found with max_l_dist {max_l_dist}" + for before_match in before_matches: for after_match in after_matches: if before_match.start < after_match.start: - return (True, "") - - return (False, f"Could not find a location where '{before[:40]}...' appears before '{after[:40]}...'.") + return True, "" + return False, ( + f"Could not find a location where '{self.before[:40]}...' appears before " + f"'{self.after[:40]}...'." 
+        )
 
 
-def load_tests(jsonl_file: str) -> list[BasePDFTest]:
-    """Load tests from a JSONL file"""
-    tests = []
+def load_tests(jsonl_file: str) -> List[BasePDFTest]:
+    """
+    Load tests from a JSONL file.
 
-    with open(jsonl_file, 'r') as file:
-        for line_number, line in enumerate(file, 1):
+    Args:
+        jsonl_file: Path to the JSONL file containing test definitions.
+
+    Returns:
+        A list of test objects.
+    """
+    tests: List[BasePDFTest] = []
+    with open(jsonl_file, "r") as file:
+        for line_number, line in enumerate(file, start=1):
             line = line.strip()
-            if not line:  # Skip empty lines
+            if not line:
                 continue
-
+
             try:
-                # Parse the JSON object
                 data = json.loads(line)
-
-                # Based on the type field, create the appropriate test object
-                if data["type"] in [TestType.PRESENT.value, TestType.ABSENT.value]:
-                    test = TextPresenceTest(
-                        pdf=data["pdf"],
-                        page=data["page"],
-                        id=data["id"],
-                        type=data["type"],
-                        threshold=data["threshold"],
-                        text=data["text"]
-                    )
-                elif data["type"] == TestType.ORDER.value:
-                    test = TextOrderTest(
-                        pdf=data["pdf"],
-                        page=data["page"],
-                        id=data["id"],
-                        type=data["type"],
-                        threshold=data["threshold"],
-                        before=data["before"],
-                        after=data["after"]
-                    )
+                test_type = data.get("type")
+                if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}:
+                    test = TextPresenceTest(**data)
+                elif test_type == TestType.ORDER.value:
+                    test = TextOrderTest(**data)
                 else:
-                    raise ValidationError(f"Unknown test type: {data['type']}")
-
+                    raise ValidationError(f"Unknown test type: {test_type}")
+
                 tests.append(test)
-
             except json.JSONDecodeError as e:
                 print(f"Error parsing JSON on line {line_number}: {e}")
-            except ValidationError as e:
-                print(f"Validation error on line {line_number}: {e}")
-            except KeyError as e:
-                print(f"Missing required field on line {line_number}: {e}")
+            except (ValidationError, KeyError) as e:
+                print(f"Error on line {line_number}: {e}")
             except Exception as e:
                 print(f"Unexpected error on line {line_number}: {e}")
-
+
     return tests
 
 
-def save_tests(tests: list[BasePDFTest], jsonl_file: str) -> None:
-    """Save tests to a JSONL file"""
-    with open(jsonl_file, 'w') as file:
+def save_tests(tests: List[BasePDFTest], jsonl_file: str) -> None:
+    """
+    Save tests to a JSONL file using asdict for conversion.
+
+    Args:
+        tests: A list of test objects.
+        jsonl_file: Path to the output JSONL file.
+ """ + with open(jsonl_file, "w") as file: for test in tests: - # Convert dataclass to dict - if isinstance(test, TextPresenceTest): - data = { - "pdf": test.pdf, - "id": test.id, - "type": test.type, - "threshold": test.threshold, - "text": test.text - } - elif isinstance(test, TextOrderTest): - data = { - "pdf": test.pdf, - "id": test.id, - "type": test.type, - "threshold": test.threshold, - "before": test.before, - "after": test.after - } - file.write(json.dumps(data) + '\n') \ No newline at end of file + file.write(json.dumps(asdict(test)) + "\n") diff --git a/olmocr/bench/viewer.py b/olmocr/bench/viewer.py index 75ce842..4ad458a 100644 --- a/olmocr/bench/viewer.py +++ b/olmocr/bench/viewer.py @@ -2,10 +2,14 @@ import json import sys import os +import re import argparse -from collections import defaultdict -from olmocr.data.renderpdf import render_pdf_to_base64png +import requests +from collections import defaultdict +from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit + +from olmocr.data.renderpdf import render_pdf_to_base64png def parse_rules_file(file_path): @@ -31,6 +35,7 @@ def parse_rules_file(file_path): return pdf_rules + def get_rule_html(rule, rule_index): """Generate HTML representation for a rule with interactive elements.""" rule_type = rule.get('type', 'unknown') @@ -38,7 +43,6 @@ def get_rule_html(rule, rule_index): # Determine status button class based on 'checked' value checked_status = rule.get('checked') - # We won't set active class here; it'll be updated by JS upon interaction. thumbs_up_class = "active" if checked_status == "verified" else "" thumbs_down_class = "active" if checked_status == "rejected" else "" @@ -121,6 +125,7 @@ def get_rule_html(rule, rule_index): """ + def generate_html(pdf_rules, rules_file_path): """Generate the HTML page with PDF renderings and interactive rules.""" # Limit to 10 unique PDFs @@ -380,28 +385,24 @@ def generate_html(pdf_rules, rules_file_path): """ - # Add JavaScript to manage interactivity + # Add JavaScript to manage interactivity and datastore integration html += f""" @@ -446,6 +495,30 @@ def generate_html(pdf_rules, rules_file_path): return html +def get_page_datastore(html: str): + """ + Fetch the JSON datastore from the presigned URL. + Returns a dict. If any error or no content, returns {}. 
+ """ + match = re.search(r"const presignedGetUrl = \"(.*?)\";", html) + if not match: + return None + presigned_url = match.group(1) + + try: + # Clean up the presigned URL (sometimes the signature may need re-encoding) + url_parts = urlsplit(presigned_url) + query_params = parse_qs(url_parts.query) + encoded_query = urlencode(query_params, doseq=True) + cleaned_url = urlunsplit((url_parts.scheme, url_parts.netloc, url_parts.path, encoded_query, url_parts.fragment)) + + resp = requests.get(cleaned_url) + resp.raise_for_status() + return resp.json() + except Exception as e: + print(f"Error fetching datastore from {presigned_url}: {e}") + return None + def main(): parser = argparse.ArgumentParser(description='Generate an interactive HTML visualization of PDF rules.') parser.add_argument('rules_file', help='Path to the rules file (JSON lines format)') @@ -459,8 +532,21 @@ def main(): if os.path.exists(args.output): print(f"Output file {args.output} already exists, attempting to reload it's datastore") + with open(args.output, "r") as df: + datastore = get_page_datastore(df.read()) + + if datastore is None: + print(f"Datastore for {args.output} is empty, please run tinyhost and verify your rules and then rerun the script") + sys.exit(1) + + print(f"Loaded {len(datastore)} entries from datastore, updating {args.rules_file}") + + with open(args.rules_file, 'w') as of: + for rule in datastore: + of.write(json.dumps(rule) + "\n") + + return - pdf_rules = parse_rules_file(args.rules_file) html = generate_html(pdf_rules, args.rules_file) @@ -469,5 +555,6 @@ def main(): print(f"Interactive HTML visualization created: {args.output}") + if __name__ == "__main__": main()