Rewriting prompts to eval with new model

This commit is contained in:
Jake Poznanski 2024-10-09 16:04:39 +00:00
parent 7c19a9a856
commit b6b74b7832
2 changed files with 19 additions and 9 deletions

View File

@@ -7,8 +7,9 @@ import sys
import logging
import smart_open
from cached_path import cached_path
from pdelfin.prompts import build_finetuning_prompt
from pdelfin.prompts.anchor import get_anchor_text
# Import Plotly for plotting
import plotly.express as px
@@ -91,11 +92,26 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
if match:
raw_page_text = match.group(1).strip()
# Ok, now we want to try to see if it's better if we recalculate the anchor text
goldkey = obj["custom_id"]
s3_path = goldkey[:goldkey.rindex("-")]
page = int(goldkey[goldkey.rindex("-") + 1:])
# Save the pdf to a temporary cache folder
local_pdf_path = cached_path(s3_path, quiet=True)
raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
transformed["chat_messages"][0]["content"][0]["text"] = build_finetuning_prompt(raw_page_text)
if transformed is not None:
prompt_text = transformed["chat_messages"][0]["content"][0]["text"]
prompt_length = len(prompt_text)
if prompt_length > 6000:
print(transformed["custom_id"], "length ", prompt_length)
prompt_lengths.append(prompt_length)
outfile.write(json.dumps(transformed) + '\n')

View File

@@ -8,7 +8,7 @@ import os
import logging
import smart_open
from cached_path import cached_path
from pdelfin.prompts import build_finetuning_prompt
@@ -71,13 +71,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
page = int(goldkey[goldkey.rindex("-") + 1:])
# Save the pdf to a temporary cache folder
import os
local_pdf_path = "/home/ubuntu/.cache/samplepdfs/" + os.path.basename(s3_path)
if not os.path.exists(local_pdf_path):
print("Loading pdf", s3_path)
with smart_open.smart_open(s3_path, "rb") as fin, open(local_pdf_path, "wb") as fout:
fout.write(fin.read())
local_pdf_path = cached_path(s3_path, quiet=True)
from pdelfin.prompts.anchor import get_anchor_text