Working on some progress for the autominer, fixing more options in convert script

This commit is contained in:
Jake Poznanski 2025-02-27 16:02:48 -08:00
parent 11e89dcd22
commit a03673e126
3 changed files with 88 additions and 20 deletions

View File

@ -13,14 +13,20 @@ def parse_method_arg(method_arg):
Parse a method configuration string of the form:
method_name[:key=value[:key2=value2...]]
Returns:
(method_name, kwargs_dict)
(method_name, kwargs_dict, folder_name)
"""
parts = method_arg.split(":")
name = parts[0]
kwargs = {}
folder_name = name # Default folder name is the method name
for extra in parts[1:]:
if "=" in extra:
key, value = extra.split("=", 1)
if key == "name":
folder_name = value
continue
try:
converted = int(value)
except ValueError:
@ -31,14 +37,16 @@ def parse_method_arg(method_arg):
kwargs[key] = converted
else:
raise ValueError(f"Extra argument '{extra}' is not in key=value format")
return name, kwargs
return name, kwargs, folder_name
async def process_pdfs(config, pdf_directory, data_directory, repeats):
"""Process PDFs with both sync and async functions"""
for candidate in config.keys():
print(f"Starting conversion using {candidate} with kwargs: {config[candidate]['kwargs']}")
candidate_output_dir = os.path.join(data_directory, candidate)
folder_name = config[candidate]["folder_name"]
candidate_output_dir = os.path.join(data_directory, folder_name)
os.makedirs(candidate_output_dir, exist_ok=True)
method = config[candidate]["method"]
@ -64,7 +72,9 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run PDF conversion using specified OCR methods and extra parameters.")
parser.add_argument("methods", nargs="+", help="Methods to run in the format method[:key=value ...]. " "Example: gotocr mineru:temperature=2 marker:runs=3")
parser.add_argument("methods", nargs="+", help="Methods to run in the format method[:key=value ...]. "
"Example: gotocr mineru:temperature=2 marker:runs=3. "
"Use 'name=folder_name' to specify a custom output folder name.")
parser.add_argument("--repeats", type=int, default=1, help="Number of times to repeat the conversion for each PDF.")
args = parser.parse_args()
@ -80,14 +90,14 @@ if __name__ == "__main__":
# Build config by importing only requested methods.
config = {}
for method_arg in args.methods:
method_name, extra_kwargs = parse_method_arg(method_arg)
method_name, extra_kwargs, folder_name = parse_method_arg(method_arg)
if method_name not in available_methods:
parser.error(f"Unknown method: {method_name}. " f"Available methods: {', '.join(available_methods.keys())}")
module_path, function_name = available_methods[method_name]
# Dynamically import the module and get the function.
module = importlib.import_module(module_path)
function = getattr(module, function_name)
config[method_name] = {"method": function, "kwargs": extra_kwargs}
config[method_name] = {"method": function, "kwargs": extra_kwargs, "folder_name": folder_name}
data_directory = os.path.join(os.path.dirname(__file__), "sample_data")
pdf_directory = os.path.join(data_directory, "pdfs")

View File

@ -6,6 +6,56 @@ from collections import Counter
import syntok.segmenter as segmenter
import syntok.tokenizer as tokenizer
import base64
import os
from google import genai
from google.genai import types
from olmocr.data.renderpdf import render_pdf_to_base64png
def clean_base_sentence(pdf_path: str, page_num: int, base_sentence: str) -> str:
    """Ask Gemini to re-read a sentence directly from the rendered PDF page.

    The page is rendered to a PNG, sent to the model together with the
    candidate ("Base") sentence, and the model is asked to return the sentence
    exactly as it appears in the image.

    Args:
        pdf_path: Path to the PDF file to render.
        page_num: Page to render; passed through unchanged to
            ``render_pdf_to_base64png`` (presumably 1-based -- confirm against
            that helper).
        base_sentence: The extracted sentence whose exact reading should be
            verified against the page image.

    Returns:
        The text of the first part of the first candidate in the model's
        response.
    """
    # The API key is read from the environment on every call.
    client = genai.Client(
        api_key=os.environ.get("GEMINI_API_KEY"),
    )

    # The renderer's output is base64-decoded below; the API wants the raw
    # PNG bytes.
    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
    image_part = types.Part.from_bytes(
        data=base64.b64decode(image_base64),
        mime_type="image/png",
    )

    model = "gemini-2.0-flash-thinking-exp-01-21"

    # Built as an f-string so the sentence is actually substituted into the
    # prompt instead of sending the literal "{base_sentence}" placeholder.
    prompt = (
        f"Base {base_sentence}\n"
        'Consider the sentence labeled "Base" above in the document image attached. '
        "What is the correct reading of this document within the image of the page? "
        "I need it to be exact down to the individual character and that's very important to get right. "
        "It needs to match the picture, not the provided text. "
        "Please just output the correct full sentence exactly how it appears in the document image and nothing else."
    )

    contents = [
        types.Content(
            role="user",
            parts=[
                image_part,
                types.Part.from_text(text=prompt),
            ],
        ),
    ]

    generate_content_config = types.GenerateContentConfig(
        temperature=0.7,
        top_p=0.95,
        top_k=64,
        max_output_tokens=500,
        response_mime_type="text/plain",
    )

    # The google-genai client exposes generation under `client.models`; pass
    # the model name, the request contents and the sampling config explicitly.
    response = client.models.generate_content(
        model=model,
        contents=contents,
        config=generate_content_config,
    )

    result = response.candidates[0].content.parts[0].text
    return result
def parse_sentences(text: str) -> list[str]:
"""
Splits a text into a list of sentence strings using syntok.
@ -23,7 +73,8 @@ def parse_sentences(text: str) -> list[str]:
sentences.append(sentence_str)
return sentences
def compare_votes_for_file(base_text: str, candidate_texts: list[str]) -> None:
def compare_votes_for_file(base_pdf_file: str, base_pdf_page: int, base_text: str, candidate_texts: list[str]) -> None:
"""
For each sentence in the base text, finds the best matching sentence from
each candidate text (using a similarity threshold). If any candidate sentences
@ -37,6 +88,8 @@ def compare_votes_for_file(base_text: str, candidate_texts: list[str]) -> None:
candidate_sentences_list = [parse_sentences(ct) for ct in candidate_texts]
for b_sentence in base_sentences:
b_sentence = b_sentence.replace("\n", " ")
votes = []
for c_sentences in candidate_sentences_list:
best_ratio = 0.0
@ -65,6 +118,9 @@ def compare_votes_for_file(base_text: str, candidate_texts: list[str]) -> None:
print(f"{count}x: {variant}")
print("-" * 40)
cleaned = clean_base_sentence(base_pdf_file, base_pdf_page, b_sentence)
print("Clean", cleaned)
def main():
parser = argparse.ArgumentParser(
description="Compares sentences from base and candidate texts, printing differences."
@ -100,9 +156,11 @@ def main():
with open(base_file_path, "r", encoding="utf-8") as f:
base_text = f.read()
base_pdf_file = os.path.join(os.path.dirname(base_file_path), "..", "pdfs", os.path.basename(base_file_path).replace(".md", ".pdf"))
base_pdf_page = 1
print(f"Results for base file: {bf}")
compare_votes_for_file(base_text, candidate_texts)
print("=" * 80)
compare_votes_for_file(base_pdf_file, base_pdf_page, base_text, candidate_texts)
print("")
if __name__ == "__main__":
main()

View File

@ -1,13 +1,13 @@
{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_00", "type": "present", "text": "Corporate social responsibility and the tobacco industry: hope or hype?", "threshold": 0.99}
{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_01", "type": "present", "text": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths of some 754 600 smokers, and Philip Morris some 803 600 smokers.", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_02", "type": "present", "text": "The term \"corporate social responsibility\" is in vogue at the moment but as a concept it is vague and means different things to different people.", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_03", "type": "present", "text": "Over the past three decades increasing pressure from non-governmental", "threshold": 1.0}
{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_04", "type": "absent", "text": "Downloaded from http://tobaccocontrol.bmj.com/", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_00", "type": "present", "text": "Corporate social responsibility and the tobacco industry: hope or hype?", "threshold": 0.99}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_01", "type": "present", "text": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths of some 754 600 smokers, and Philip Morris some 803 600 smokers.", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_02", "type": "present", "text": "The term \"corporate social responsibility\" is in vogue at the moment but as a concept it is vague and means different things to different people.", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_03", "type": "present", "text": "Over the past three decades increasing pressure from non-governmental", "threshold": 1.0}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_04", "type": "absent", "text": "Downloaded from http://tobaccocontrol.bmj.com/", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_10", "type": "order", "before": "Corporate social responsibility and the tobacco industry: hope or hype?", "after": "The unprecedented expansion of power and influence of TNCs over the past three decades has accelerated global trade and development, but also environmental damage and abuses of", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_11", "type": "order", "before": "It now looks like that with vigilance", "after": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "id": "multi_column_miss_12", "type": "order", "before": "Corporate social responsibility (CSR) emerged from a realisation among transnational corporations", "after": " perspective on its own behaviour; and reflects on whether marketing tobacco is antithetical to social responsibility.", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_10", "type": "order", "before": "Corporate social responsibility and the tobacco industry: hope or hype?", "after": "The unprecedented expansion of power and influence of TNCs over the past three decades has accelerated global trade and development, but also environmental damage and abuses of", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_11", "type": "order", "before": "It now looks like that with vigilance", "after": "this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths", "threshold": 0.95}
{"pdf": "multi_column_miss.pdf", "page": 1, "id": "multi_column_miss_12", "type": "order", "before": "Corporate social responsibility (CSR) emerged from a realisation among transnational corporations", "after": " perspective on its own behaviour; and reflects on whether marketing tobacco is antithetical to social responsibility.", "threshold": 0.95}
{"pdf": "discoverworld_crazy_table4.pdf", "id": "discoverworld_crazy_table4_00", "type": "present", "text": "Table 4: Baseline model performance on each of the three scoring metrics", "threshold": 1.0}
{"pdf": "discoverworld_crazy_table4.pdf", "id": "discoverworld_crazy_table4_01", "type": "present", "text": "Table 5: Baseline model performance on each of the three scoring metrics", "threshold": 1.0}
{"pdf": "discoverworld_crazy_table4.pdf", "id": "discoverworld_crazy_table4_02", "type": "present", "text": "We use the GPT-4O model for all our agents due to its higher performance and lower cost compared to other models. For space we provide", "threshold": 1.0}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_00", "type": "present", "text": "Table 4: Baseline model performance on each of the three scoring metrics", "threshold": 1.0}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_01", "type": "present", "text": "Table 5: Baseline model performance on each of the three scoring metrics", "threshold": 1.0}
{"pdf": "discoverworld_crazy_table4.pdf", "page": 1, "id": "discoverworld_crazy_table4_02", "type": "present", "text": "We use the GPT-4O model for all our agents due to its higher performance and lower cost compared to other models. For space we provide", "threshold": 0.99}