Review page size option, fixing mkdirs in convertsilver script

Jake Poznanski 2024-10-02 15:53:21 +00:00
parent 276465aab1
commit 73fb81ef6c
3 changed files with 47 additions and 13 deletions

View File

@@ -180,7 +180,7 @@ def process_jsonl_file(jsonl_file, gold_data, comparer):
return total_alignment_score, char_weighted_alignment_score, total_chars, total_pages, page_data
-def do_eval(gold_data_path: str, eval_data_path: str, review_page_name: str) -> tuple[float, list[dict]]:
+def do_eval(gold_data_path: str, eval_data_path: str, review_page_name: str, review_page_size: int) -> tuple[float, list[dict]]:
gold_data = load_gold_data(gold_data_path)
total_alignment_score = 0
@@ -237,10 +237,10 @@ def do_eval(gold_data_path: str, eval_data_path: str, review_page_name: str) ->
# Select the top 20 lowest alignments
page_eval_data.sort(key=lambda x: x["alignment"])
-create_review_html(page_eval_data[:20], filename=review_page_name + "_worst.html")
+create_review_html(page_eval_data[:review_page_size], filename=review_page_name + "_worst.html")
# Select random entries to return in the page_eval_data
-page_eval_data = random.sample(page_eval_data, 20)
+page_eval_data = random.sample(page_eval_data, review_page_size)
create_review_html(page_eval_data, filename=review_page_name + "_sample.html")
@@ -256,6 +256,12 @@ if __name__ == "__main__":
default="review_page",
help="What name to give to this evaluation/comparison"
)
+parser.add_argument(
+    '--review_size',
+    default=20,
+    type=int,
+    help="Number of entries to show on the generated review page",
+)
parser.add_argument(
'gold_data_path',
type=str,
@@ -269,4 +275,4 @@ if __name__ == "__main__":
args = parser.parse_args()
-result = do_eval(gold_data_path=args.gold_data_path, eval_data_path=args.eval_data_path, review_page_name=args.name)
+result = do_eval(gold_data_path=args.gold_data_path, eval_data_path=args.eval_data_path, review_page_name=args.name, review_page_size=args.review_size)
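Note on the new --review_size option: random.sample raises ValueError when the requested count exceeds the number of available entries, so a large value could fail on small evaluation sets. A minimal sketch of the selection step with an illustrative clamp (the min() guard and the helper name are suggestions, not part of this commit):

import random

def select_review_entries(page_eval_data, review_page_size):
    # Worst-aligned entries: sort ascending by alignment score, keep the first N.
    ranked = sorted(page_eval_data, key=lambda x: x["alignment"])
    worst = ranked[:review_page_size]
    # Random sample, clamped so random.sample never raises ValueError
    # when review_page_size exceeds the number of available entries.
    sample = random.sample(ranked, min(review_page_size, len(ranked)))
    return worst, sample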

View File

@@ -36,17 +36,21 @@ def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftote
elif pdf_engine == "pymupdf":
return _get_pymupdf(local_pdf_path, page)
elif pdf_engine == "topcoherency":
-options = [
-_get_pdftotext(local_pdf_path, page),
-_get_pymupdf(local_pdf_path, page),
-_get_pdfium(local_pdf_path, page),
-_get_pypdf_raw(local_pdf_path, page)
-]
+options = {
+"pdftotext": _get_pdftotext(local_pdf_path, page),
+"pymupdf": _get_pymupdf(local_pdf_path, page),
+"pdfium": _get_pdfium(local_pdf_path, page),
+"pypdf_raw": _get_pypdf_raw(local_pdf_path, page)
+}
-scores = [get_document_coherency(text) for text in options]
+scores = {label: get_document_coherency(text) for label, text in options.items()}
# return option with the best (highest) score (higher is more likely, as these are logprobs)
-return options[scores.index(max(scores))]
+best_option_label = max(scores, key=scores.get)
+best_option = options[best_option_label]
+print(f"topcoherency chosen: {best_option_label}")
+return best_option
elif pdf_engine == "pdfreport":
return _linearize_pdf_report(_pdf_report(local_pdf_path, page))
else:
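The change swaps a positional list for a labeled dict, so the winning engine can be both selected and reported by name. The selection idiom in isolation, with made-up coherency log-prob values for illustration:

scores = {"pdftotext": -120.5, "pymupdf": -98.2, "pdfium": -143.0, "pypdf_raw": -161.7}
# max() iterates over the dict's keys and ranks them by their score values;
# the highest (least negative) log-prob wins, here "pymupdf".
best_option_label = max(scores, key=scores.get)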

View File

@@ -4,6 +4,7 @@ import re
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
import sys
+import os
import logging
import smart_open
@@ -63,6 +64,25 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
if match:
raw_page_text = match.group(1).strip()
+# Ok, now we want to try to see if it's better if we recalculate the anchor text
+goldkey = obj["custom_id"]
+s3_path = goldkey[:goldkey.rindex("-")]
+page = int(goldkey[goldkey.rindex("-") + 1:])
+# Save the pdf to a temporary cache folder
+import os
+local_pdf_path = "/home/ubuntu/.cache/samplepdfs/" + os.path.basename(s3_path)
+if not os.path.exists(local_pdf_path):
+print("Loading pdf", s3_path)
+with smart_open.smart_open(s3_path, "rb") as fin, open(local_pdf_path, "wb") as fout:
+fout.write(fin.read())
+from pdelfin.prompts.anchor import get_anchor_text
+raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="topcoherency")
+from pdelfin.prompts import build_openai_silver_data_prompt
+obj["body"]["messages"][0]["content"][0]["text"] = build_openai_silver_data_prompt(raw_page_text)
@@ -74,6 +94,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
logging.info(f"Processed '{input_file}': {processed_count} records transformed, {error_count} errors.")
except Exception as e:
-logging.exception(e)
+logging.error(f"Failed to process file {input_file}: {e}")
@@ -191,6 +212,9 @@ def main():
output_dir = args.output_dir.rstrip('/')
max_jobs = args.jobs
+if not output_dir.startswith("s3:"):
+    os.makedirs(output_dir, exist_ok=True)
# List input files
input_files = list_input_files(input_dir)
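This last hunk is the "fixing mkdirs" part of the commit message: local output directories are created up front (exist_ok=True makes the call idempotent across repeated runs), while s3: destinations are skipped because there is no directory to create and smart_open handles them at write time. The guard in isolation, with an illustrative path that is not from the commit:

import os

output_dir = "local_output/silver".rstrip("/")  # illustrative value
# Create the directory tree only for local filesystem outputs.
if not output_dir.startswith("s3:"):
    os.makedirs(output_dir, exist_ok=True)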