mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-27 16:12:13 +00:00
Rewriting prompts to eval with new model
This commit is contained in:
parent
7c19a9a856
commit
b6b74b7832
@ -7,8 +7,9 @@ import sys
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
import smart_open
|
import smart_open
|
||||||
|
from cached_path import cached_path
|
||||||
from pdelfin.prompts import build_finetuning_prompt
|
from pdelfin.prompts import build_finetuning_prompt
|
||||||
|
from pdelfin.prompts.anchor import get_anchor_text
|
||||||
|
|
||||||
# Import Plotly for plotting
|
# Import Plotly for plotting
|
||||||
import plotly.express as px
|
import plotly.express as px
|
||||||
@ -91,11 +92,26 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
|
|||||||
|
|
||||||
if match:
|
if match:
|
||||||
raw_page_text = match.group(1).strip()
|
raw_page_text = match.group(1).strip()
|
||||||
|
|
||||||
|
# Ok, now we want to try to see if it's better if we recalculate the anchor text
|
||||||
|
goldkey = obj["custom_id"]
|
||||||
|
s3_path = goldkey[:goldkey.rindex("-")]
|
||||||
|
page = int(goldkey[goldkey.rindex("-") + 1:])
|
||||||
|
|
||||||
|
# Save the pdf to a temporary cache folder
|
||||||
|
local_pdf_path = cached_path(s3_path, quiet=True)
|
||||||
|
|
||||||
|
raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
|
||||||
transformed["chat_messages"][0]["content"][0]["text"] = build_finetuning_prompt(raw_page_text)
|
transformed["chat_messages"][0]["content"][0]["text"] = build_finetuning_prompt(raw_page_text)
|
||||||
|
|
||||||
|
|
||||||
if transformed is not None:
|
if transformed is not None:
|
||||||
prompt_text = transformed["chat_messages"][0]["content"][0]["text"]
|
prompt_text = transformed["chat_messages"][0]["content"][0]["text"]
|
||||||
prompt_length = len(prompt_text)
|
prompt_length = len(prompt_text)
|
||||||
|
|
||||||
|
if prompt_length > 6000:
|
||||||
|
print(transformed["custom_id"], "length ", prompt_length)
|
||||||
|
|
||||||
prompt_lengths.append(prompt_length)
|
prompt_lengths.append(prompt_length)
|
||||||
|
|
||||||
outfile.write(json.dumps(transformed) + '\n')
|
outfile.write(json.dumps(transformed) + '\n')
|
||||||
|
|||||||
@ -8,7 +8,7 @@ import os
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
import smart_open
|
import smart_open
|
||||||
|
from cached_path import cached_path
|
||||||
from pdelfin.prompts import build_finetuning_prompt
|
from pdelfin.prompts import build_finetuning_prompt
|
||||||
|
|
||||||
|
|
||||||
@ -71,13 +71,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
|
|||||||
page = int(goldkey[goldkey.rindex("-") + 1:])
|
page = int(goldkey[goldkey.rindex("-") + 1:])
|
||||||
|
|
||||||
# Save the pdf to a temporary cache folder
|
# Save the pdf to a temporary cache folder
|
||||||
import os
|
local_pdf_path = cached_path(s3_path, quiet=True)
|
||||||
local_pdf_path = "/home/ubuntu/.cache/samplepdfs/" + os.path.basename(s3_path)
|
|
||||||
|
|
||||||
if not os.path.exists(local_pdf_path):
|
|
||||||
print("Loading pdf", s3_path)
|
|
||||||
with smart_open.smart_open(s3_path, "rb") as fin, open(local_pdf_path, "wb") as fout:
|
|
||||||
fout.write(fin.read())
|
|
||||||
|
|
||||||
from pdelfin.prompts.anchor import get_anchor_text
|
from pdelfin.prompts.anchor import get_anchor_text
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user