More work on automining

2025-06-27 04:00:02 +00:00 · 2025-02-28 10:14:47 -08:00 · 2025-02-28 10:14:47 -08:00 · 1b78ec9572
commit 1b78ec9572
parent 3670219a8f
3 changed files with 29 additions and 13 deletions
--- a/olmocr/bench/benchmark.py
+++ b/olmocr/bench/benchmark.py
@ -190,6 +190,7 @@ def main():
            else:
                avg = 0.0
            print(f"    {ttype:8s}: {avg:0.1f}% average pass rate over {len(scores)} tests")
+        print("")
    print("=" * 50)


--- a/olmocr/bench/miners/automine.py
+++ b/olmocr/bench/miners/automine.py
@ -1,4 +1,5 @@
 import os
+import re
 import argparse
 from difflib import SequenceMatcher
 from collections import Counter
@ -13,29 +14,30 @@ from google.genai import types

 from olmocr.data.renderpdf import render_pdf_to_base64png

-# Uses a gemini prompt to 
+# Uses a gemini prompt to get the most likely clean sentence from a pdf page
 def clean_base_sentence(pdf_path: str, page_num: int, base_sentence: str) -> str:
    client = genai.Client(
        api_key=os.environ.get("GEMINI_API_KEY"),
    )

    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
-    image_part = glm.Part(
-        inline_data=glm.Blob(
+    image_part = types.Part(
+        inline_data=types.Blob(
            mime_type="image/png",
            data=base64.b64decode(image_base64)
        )
    )
-    model = "gemini-2.0-flash-thinking-exp-01-21"
+    #model = "gemini-2.0-flash-thinking-exp-01-21" # Consider using a more stable model for production
+    model="gemini-2.0-flash-001"
    contents = [
        types.Content(
            role="user",
            parts=[
                image_part,
                types.Part.from_text(
-                    text="""Base {base_sentence}
+                    text=f"""Base: {base_sentence}

-Consider the sentence labeled \"Base\" above in the document image attached. What is the correct reading of this document within the image of the page? I need it to be exact down to the individual character and that's very important to get right. It needs to match the picture, not the provided text. Please just output the correct full sentence exactly how it appears in the document image and nothing else."""
+Consider the sentence labeled "Base" above in the document image attached. What is the correct reading of this document within the image of the page? I need it to be exact down to the individual character and that's very important to get right. It needs to match the picture, not the provided text. Please just output the correct full sentence exactly how it appears in the document image and nothing else. You can merge hyphenated words back together, and don't output any new lines."""
                ),
            ],
        ),
@ -48,10 +50,12 @@ Consider the sentence labeled \"Base\" above in the document image attached. Wha
        response_mime_type="text/plain",
    )

-
-    response = client.generate_content(request)
+    response = client.models.generate_content(
+        model=model,
+        contents=contents,
+        config=generate_content_config,
+    )
    result = response.candidates[0].content.parts[0].text
-
    return result


@ -121,6 +125,13 @@ def compare_votes_for_file(base_pdf_file: str, base_pdf_page: int, base_text: st
            cleaned = clean_base_sentence(base_pdf_file, base_pdf_page, b_sentence)
            print("Clean", cleaned)

+
+def get_pdf_from_md(md_path: str) -> str:
+    base = os.path.basename(md_path)
+    base = re.sub(r'_\d+\.md$', '.pdf', base)
+
+    return os.path.join(os.path.dirname(md_path), "..", "pdfs", base)
+
 def main():
    parser = argparse.ArgumentParser(
        description="Compares sentences from base and candidate texts, printing differences."
@ -156,7 +167,7 @@ def main():
        with open(base_file_path, "r", encoding="utf-8") as f:
            base_text = f.read()

-        base_pdf_file = os.path.join(os.path.dirname(base_file_path), "..", "pdfs", os.path.basename(base_file_path).replace(".md", ".pdf"))
+        base_pdf_file = get_pdf_from_md(base_file_path)
        base_pdf_page = 1
        print(f"Results for base file: {bf}")
        compare_votes_for_file(base_pdf_file, base_pdf_page, base_text, candidate_texts)
--- a/pyproject.toml
+++ b/pyproject.toml
@ -37,9 +37,6 @@ dependencies = [
  "httpx",
  "torch>=2.5.1",
  "transformers>=4.46.2",
-  "fuzzysearch",
-  "rapidfuzz",
-  "sequence_align",
  "beaker-py",
 ]
 license = {file = "LICENSE"}
@ -77,6 +74,13 @@ dev = [
    "spacy",
 ]

+bench = [
+    "fuzzysearch",
+    "rapidfuzz",
+    "sequence_align",
+    "syntok",
+    "google-genai",
+]

 train = [
    "torch",