Mirror of https://github.com/allenai/olmocr.git (synced 2025-11-01 18:43:45 +00:00)
Review page size option, fixing mkdirs in convertsilver script
parent 276465aab1
commit 73fb81ef6c
@@ -180,7 +180,7 @@ def process_jsonl_file(jsonl_file, gold_data, comparer):
     return total_alignment_score, char_weighted_alignment_score, total_chars, total_pages, page_data


-def do_eval(gold_data_path: str, eval_data_path: str, review_page_name: str) -> tuple[float, list[dict]]:
+def do_eval(gold_data_path: str, eval_data_path: str, review_page_name: str, review_page_size: int) -> tuple[float, list[dict]]:
     gold_data = load_gold_data(gold_data_path)

     total_alignment_score = 0
@@ -237,10 +237,10 @@ def do_eval(gold_data_path: str, eval_data_path: str, review_page_name: str) ->

     # Select the top 20 lowest alignments
     page_eval_data.sort(key=lambda x: x["alignment"])
-    create_review_html(page_eval_data[:20], filename=review_page_name + "_worst.html")
+    create_review_html(page_eval_data[:review_page_size], filename=review_page_name + "_worst.html")

     # Select random entries to return in the page_eval_data
-    page_eval_data = random.sample(page_eval_data, 20)
+    page_eval_data = random.sample(page_eval_data, review_page_size)
     create_review_html(page_eval_data, filename=review_page_name + "_sample.html")
@@ -256,6 +256,12 @@ if __name__ == "__main__":
         default="review_page",
         help="What name to give to this evaluation/comparison"
     )
+    parser.add_argument(
+        '--review_size',
+        default=20,
+        type=int,
+        help="Number of entries to show on the generated review page",
+    )
     parser.add_argument(
         'gold_data_path',
         type=str,
@@ -269,4 +275,4 @@ if __name__ == "__main__":

     args = parser.parse_args()

-    result = do_eval(gold_data_path=args.gold_data_path, eval_data_path=args.eval_data_path, review_page_name=args.name)
+    result = do_eval(gold_data_path=args.gold_data_path, eval_data_path=args.eval_data_path, review_page_name=args.name, review_page_size=args.review_size)
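Taken together, the eval-script hunks above just thread one integer from a new --review_size flag into do_eval. A minimal, self-contained sketch of that flow, assuming the argument wiring shown in the diff (create_review_html and do_eval are stubbed here with synthetic data; eval_data_path is assumed to be a positional argument like gold_data_path):

import argparse
import random

def create_review_html(entries, filename):
    # stand-in for the real HTML report writer
    print(f"{filename}: {len(entries)} entries")

def do_eval(gold_data_path, eval_data_path, review_page_name, review_page_size):
    # synthetic page data in place of the real alignment computation
    page_eval_data = [{"alignment": random.random()} for _ in range(100)]
    # worst-aligned pages first, capped at review_page_size
    page_eval_data.sort(key=lambda x: x["alignment"])
    create_review_html(page_eval_data[:review_page_size], filename=review_page_name + "_worst.html")
    # plus a random sample of the same size
    sample = random.sample(page_eval_data, review_page_size)
    create_review_html(sample, filename=review_page_name + "_sample.html")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--name", default="review_page")
    parser.add_argument("--review_size", default=20, type=int)
    parser.add_argument("gold_data_path", type=str)
    parser.add_argument("eval_data_path", type=str)
    args = parser.parse_args()
    do_eval(gold_data_path=args.gold_data_path, eval_data_path=args.eval_data_path,
            review_page_name=args.name, review_page_size=args.review_size)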
@@ -36,17 +36,21 @@ def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftote
     elif pdf_engine == "pymupdf":
         return _get_pymupdf(local_pdf_path, page)
     elif pdf_engine == "topcoherency":
-        options = [
-            _get_pdftotext(local_pdf_path, page),
-            _get_pymupdf(local_pdf_path, page),
-            _get_pdfium(local_pdf_path, page),
-            _get_pypdf_raw(local_pdf_path, page)
-        ]
+        options = {
+            "pdftotext": _get_pdftotext(local_pdf_path, page),
+            "pymupdf": _get_pymupdf(local_pdf_path, page),
+            "pdfium": _get_pdfium(local_pdf_path, page),
+            "pypdf_raw": _get_pypdf_raw(local_pdf_path, page)
+        }

-        scores = [get_document_coherency(text) for text in options]
+        scores = {label: get_document_coherency(text) for label, text in options.items()}

         # return option with the best (highest) score (higher is more likley, as these are logprobs)
-        return options[scores.index(max(scores))]
+        best_option_label = max(scores, key=scores.get)
+        best_option = options[best_option_label]
+
+        print(f"topcoherency chosen: {best_option_label}")
+
+        return best_option
     elif pdf_engine == "pdfreport":
         return _linearize_pdf_report(_pdf_report(local_pdf_path, page))
     else:
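The topcoherency hunk replaces a positional list of extractor outputs with a labelled dict so the winning extractor can be reported. The selection idiom it relies on, in isolation (get_document_coherency is stubbed here; the real one scores logprob-based coherency, and the option texts are illustrative):

def get_document_coherency(text: str) -> float:
    # stand-in scorer; the real function returns a logprob-based coherency score
    return float(len(set(text.split())))

options = {
    "pdftotext": "text extracted by pdftotext",
    "pymupdf": "text extracted by pymupdf",
    "pdfium": "text extracted by pdfium",
    "pypdf_raw": "text extracted by pypdf",
}
scores = {label: get_document_coherency(text) for label, text in options.items()}

# max over the dict's keys ordered by their scores: the highest-scoring label wins
best_option_label = max(scores, key=scores.get)
best_option = options[best_option_label]
print(f"topcoherency chosen: {best_option_label}")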
@@ -4,6 +4,7 @@ import re
 from pathlib import Path
 from concurrent.futures import ProcessPoolExecutor, as_completed
 import sys
+import os
 import logging

 import smart_open
@@ -63,6 +64,25 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):

             if match:
                 raw_page_text = match.group(1).strip()
+
+                # Ok, now we want to try to see if it's better if we recalculate the anchor text
+                goldkey = obj["custom_id"]
+                s3_path = goldkey[:goldkey.rindex("-")]
+                page = int(goldkey[goldkey.rindex("-") + 1:])
+
+                # Save the pdf to a temporary cache folder
+                import os
+                local_pdf_path = "/home/ubuntu/.cache/samplepdfs/" + os.path.basename(s3_path)
+
+                if not os.path.exists(local_pdf_path):
+                    print("Loading pdf", s3_path)
+                    with smart_open.smart_open(s3_path, "rb") as fin, open(local_pdf_path, "wb") as fout:
+                        fout.write(fin.read())
+
+                from pdelfin.prompts.anchor import get_anchor_text
+                raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="topcoherency")
+
+                from pdelfin.prompts import build_openai_silver_data_prompt
+                obj["body"]["messages"][0]["content"][0]["text"] = build_openai_silver_data_prompt(raw_page_text)
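The new rewrite path hinges on splitting each record's custom_id back into an S3 path and a page number (the split is at the last "-", as in the diff) and caching the PDF locally before calling get_anchor_text. The string handling in isolation, with an illustrative custom_id value:

import os

goldkey = "s3://example-bucket/pdfs/document.pdf-3"   # illustrative custom_id: "<s3_path>-<page>"
s3_path = goldkey[:goldkey.rindex("-")]               # "s3://example-bucket/pdfs/document.pdf"
page = int(goldkey[goldkey.rindex("-") + 1:])         # 3

local_pdf_path = "/home/ubuntu/.cache/samplepdfs/" + os.path.basename(s3_path)
print(s3_path, page, local_pdf_path)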
@@ -74,6 +94,7 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):

        logging.info(f"Processed '{input_file}': {processed_count} records transformed, {error_count} errors.")
    except Exception as e:
+        logging.exception(e)
        logging.error(f"Failed to process file {input_file}: {e}")
@@ -191,6 +212,9 @@ def main():
    output_dir = args.output_dir.rstrip('/')
    max_jobs = args.jobs

+    if not output_dir.startswith("s3:"):
+        os.makedirs(output_dir, exist_ok=True)
+
    # List input files
    input_files = list_input_files(input_dir)
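The mkdirs fix named in the commit message is the guarded os.makedirs call above: the output directory is only created when it is a local path, since an s3:// prefix is not a filesystem directory. The check in isolation, with illustrative output_dir values:

import os

for output_dir in ["s3://example-bucket/silver_out", "local_silver_out"]:
    output_dir = output_dir.rstrip("/")
    if not output_dir.startswith("s3:"):
        os.makedirs(output_dir, exist_ok=True)   # only local paths get created on disk
        print("created", output_dir)
    else:
        print("skipped", output_dir)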