More work on automining

This commit is contained in:
Jake Poznanski 2025-02-28 10:14:47 -08:00
parent 3670219a8f
commit 1b78ec9572
3 changed files with 29 additions and 13 deletions

View File

@ -190,6 +190,7 @@ def main():
else:
avg = 0.0
print(f" {ttype:8s}: {avg:0.1f}% average pass rate over {len(scores)} tests")
print("")
print("=" * 50)

View File

@ -1,4 +1,5 @@
import os
import re
import argparse
from difflib import SequenceMatcher
from collections import Counter
@ -13,29 +14,30 @@ from google.genai import types
from olmocr.data.renderpdf import render_pdf_to_base64png
# Uses a gemini prompt to
# Uses a gemini prompt to get the most likely clean sentence from a pdf page
def clean_base_sentence(pdf_path: str, page_num: int, base_sentence: str) -> str:
client = genai.Client(
api_key=os.environ.get("GEMINI_API_KEY"),
)
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
image_part = glm.Part(
inline_data=glm.Blob(
image_part = types.Part(
inline_data=types.Blob(
mime_type="image/png",
data=base64.b64decode(image_base64)
)
)
model = "gemini-2.0-flash-thinking-exp-01-21"
#model = "gemini-2.0-flash-thinking-exp-01-21" # Consider using a more stable model for production
model="gemini-2.0-flash-001"
contents = [
types.Content(
role="user",
parts=[
image_part,
types.Part.from_text(
text="""Base {base_sentence}
text=f"""Base: {base_sentence}
Consider the sentence labeled \"Base\" above in the document image attached. What is the correct reading of this document within the image of the page? I need it to be exact down to the individual character and that's very important to get right. It needs to match the picture, not the provided text. Please just output the correct full sentence exactly how it appears in the document image and nothing else."""
Consider the sentence labeled "Base" above in the document image attached. What is the correct reading of this document within the image of the page? I need it to be exact down to the individual character and that's very important to get right. It needs to match the picture, not the provided text. Please just output the correct full sentence exactly how it appears in the document image and nothing else. You can merge hyphenated words back together, and don't output any new lines."""
),
],
),
@ -48,10 +50,12 @@ Consider the sentence labeled \"Base\" above in the document image attached. Wha
response_mime_type="text/plain",
)
response = client.generate_content(request)
response = client.models.generate_content(
model=model,
contents=contents,
config=generate_content_config,
)
result = response.candidates[0].content.parts[0].text
return result
@ -121,6 +125,13 @@ def compare_votes_for_file(base_pdf_file: str, base_pdf_page: int, base_text: st
cleaned = clean_base_sentence(base_pdf_file, base_pdf_page, b_sentence)
print("Clean", cleaned)
def get_pdf_from_md(md_path: str) -> str:
    """Map a markdown file path to the path of its source PDF.

    The PDF is looked up in a ``pdfs`` directory that sits next to the
    directory containing ``md_path`` (i.e. ``<md dir>/../pdfs/<name>.pdf``).

    Args:
        md_path: Path to a markdown file named either ``<name>_<digits>.md``
            or plain ``<name>.md``.

    Returns:
        The un-normalized path ``<dirname(md_path)>/../pdfs/<name>.pdf``.
        If the file name does not end in ``.md`` it is passed through
        unchanged.
    """
    md_name = os.path.basename(md_path)
    # The trailing "_<digits>" is optional: without this, a plain "name.md"
    # would fall through the substitution untouched and this function would
    # hand back a ".md" path as if it were the PDF.
    pdf_name = re.sub(r"(?:_\d+)?\.md$", ".pdf", md_name)
    return os.path.join(os.path.dirname(md_path), "..", "pdfs", pdf_name)
def main():
parser = argparse.ArgumentParser(
description="Compares sentences from base and candidate texts, printing differences."
@ -156,7 +167,7 @@ def main():
with open(base_file_path, "r", encoding="utf-8") as f:
base_text = f.read()
base_pdf_file = os.path.join(os.path.dirname(base_file_path), "..", "pdfs", os.path.basename(base_file_path).replace(".md", ".pdf"))
base_pdf_file = get_pdf_from_md(base_file_path)
base_pdf_page = 1
print(f"Results for base file: {bf}")
compare_votes_for_file(base_pdf_file, base_pdf_page, base_text, candidate_texts)

View File

@ -37,9 +37,6 @@ dependencies = [
"httpx",
"torch>=2.5.1",
"transformers>=4.46.2",
"fuzzysearch",
"rapidfuzz",
"sequence_align",
"beaker-py",
]
license = {file = "LICENSE"}
@ -77,6 +74,13 @@ dev = [
"spacy",
]
bench = [
"fuzzysearch",
"rapidfuzz",
"sequence_align",
"syntok",
"google-genai",
]
train = [
"torch",