From b6b74b7832952079a1195c1c8a5eb54979d76b95 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Wed, 9 Oct 2024 16:04:39 +0000 Subject: [PATCH] Rewriting prompts to eval with new model --- pdelfin/silver_data/convertsilver_birr.py | 18 +++++++++++++++++- pdelfin/silver_data/convertsilver_openai.py | 10 ++-------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/pdelfin/silver_data/convertsilver_birr.py b/pdelfin/silver_data/convertsilver_birr.py index 70cc50c..ec9352b 100644 --- a/pdelfin/silver_data/convertsilver_birr.py +++ b/pdelfin/silver_data/convertsilver_birr.py @@ -7,8 +7,9 @@ import sys import logging import smart_open - +from cached_path import cached_path from pdelfin.prompts import build_finetuning_prompt +from pdelfin.prompts.anchor import get_anchor_text # Import Plotly for plotting import plotly.express as px @@ -91,11 +92,26 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool): if match: raw_page_text = match.group(1).strip() + + # Ok, now we want to try to see if it's better if we recalculate the anchor text + goldkey = obj["custom_id"] + s3_path = goldkey[:goldkey.rindex("-")] + page = int(goldkey[goldkey.rindex("-") + 1:]) + + # Save the pdf to a temporary cache folder + local_pdf_path = cached_path(s3_path, quiet=True) + + raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport") transformed["chat_messages"][0]["content"][0]["text"] = build_finetuning_prompt(raw_page_text) + if transformed is not None: prompt_text = transformed["chat_messages"][0]["content"][0]["text"] prompt_length = len(prompt_text) + + if prompt_length > 6000: + print(transformed["custom_id"], "length ", prompt_length) + prompt_lengths.append(prompt_length) outfile.write(json.dumps(transformed) + '\n') diff --git a/pdelfin/silver_data/convertsilver_openai.py b/pdelfin/silver_data/convertsilver_openai.py index 7e84470..a79d499 100644 --- a/pdelfin/silver_data/convertsilver_openai.py +++ b/pdelfin/silver_data/convertsilver_openai.py @@ -8,7 +8,7 @@ import os import logging import smart_open - +from cached_path import cached_path from pdelfin.prompts import build_finetuning_prompt @@ -71,13 +71,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool): page = int(goldkey[goldkey.rindex("-") + 1:]) # Save the pdf to a temporary cache folder - import os - local_pdf_path = "/home/ubuntu/.cache/samplepdfs/" + os.path.basename(s3_path) - - if not os.path.exists(local_pdf_path): - print("Loading pdf", s3_path) - with smart_open.smart_open(s3_path, "rb") as fin, open(local_pdf_path, "wb") as fout: - fout.write(fin.read()) + local_pdf_path = cached_path(s3_path, quiet=True) from pdelfin.prompts.anchor import get_anchor_text