mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-26 15:44:17 +00:00
Rewriting prompts to eval with new model
This commit is contained in:
parent
7c19a9a856
commit
b6b74b7832
@ -7,8 +7,9 @@ import sys
|
||||
import logging
|
||||
|
||||
import smart_open
|
||||
|
||||
from cached_path import cached_path
|
||||
from pdelfin.prompts import build_finetuning_prompt
|
||||
from pdelfin.prompts.anchor import get_anchor_text
|
||||
|
||||
# Import Plotly for plotting
|
||||
import plotly.express as px
|
||||
@ -91,11 +92,26 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
|
||||
|
||||
if match:
|
||||
raw_page_text = match.group(1).strip()
|
||||
|
||||
# Recalculate the anchor text from the source PDF to check whether it produces a better prompt
|
||||
goldkey = obj["custom_id"]
|
||||
s3_path = goldkey[:goldkey.rindex("-")]
|
||||
page = int(goldkey[goldkey.rindex("-") + 1:])
|
||||
|
||||
# Save the pdf to a temporary cache folder
|
||||
local_pdf_path = cached_path(s3_path, quiet=True)
|
||||
|
||||
raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
|
||||
transformed["chat_messages"][0]["content"][0]["text"] = build_finetuning_prompt(raw_page_text)
|
||||
|
||||
|
||||
if transformed is not None:
|
||||
prompt_text = transformed["chat_messages"][0]["content"][0]["text"]
|
||||
prompt_length = len(prompt_text)
|
||||
|
||||
if prompt_length > 6000:
|
||||
print(transformed["custom_id"], "length ", prompt_length)
|
||||
|
||||
prompt_lengths.append(prompt_length)
|
||||
|
||||
outfile.write(json.dumps(transformed) + '\n')
|
||||
|
||||
@ -8,7 +8,7 @@ import os
|
||||
import logging
|
||||
|
||||
import smart_open
|
||||
|
||||
from cached_path import cached_path
|
||||
from pdelfin.prompts import build_finetuning_prompt
|
||||
|
||||
|
||||
@ -71,13 +71,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
|
||||
page = int(goldkey[goldkey.rindex("-") + 1:])
|
||||
|
||||
# Save the pdf to a temporary cache folder
|
||||
import os
|
||||
local_pdf_path = "/home/ubuntu/.cache/samplepdfs/" + os.path.basename(s3_path)
|
||||
|
||||
if not os.path.exists(local_pdf_path):
|
||||
print("Loading pdf", s3_path)
|
||||
with smart_open.smart_open(s3_path, "rb") as fin, open(local_pdf_path, "wb") as fout:
|
||||
fout.write(fin.read())
|
||||
local_pdf_path = cached_path(s3_path, quiet=True)
|
||||
|
||||
from pdelfin.prompts.anchor import get_anchor_text
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user