olmocr/pdelfin/eval/buildelo.py

121 lines
3.7 KiB
Python
Raw Normal View History

2025-01-14 21:08:23 +00:00
import argparse
import boto3
import dataclasses
import random
2025-01-14 22:40:56 +00:00
import re
2025-01-14 21:08:23 +00:00
2025-01-14 22:40:56 +00:00
from tqdm import tqdm
2025-01-14 21:08:23 +00:00
from itertools import combinations
from pdelfin.s3_utils import parse_s3_path, expand_s3_glob, get_s3_bytes
from dolma_refine.evaluate.metrics import DocumentEditSimilarity
from dolma_refine.evaluate.segmenters import SpacySegmenter
from dolma_refine.evaluate.aligners import HirschbergAligner
2025-01-14 22:40:56 +00:00
from pdelfin.eval.evalhtml import create_review_html
2025-01-14 21:08:23 +00:00
# Module-level S3 client, created once when this module is loaded and passed
# to the expand_s3_glob / get_s3_bytes calls in this file.
s3_client = boto3.client('s3')
@dataclasses.dataclass
class Comparison:
    """A scored pairing of two markdown renderings of the same PDF.

    Attributes:
        pdf_path: Path of the PDF both renderings belong to.
        comparison_a_path: Path of the first markdown file.
        comparison_b_path: Path of the second markdown file.
        comparison_a_str: Text of the first markdown file.
        comparison_b_str: Text of the second markdown file.
        alignment: Similarity score for the two texts; only stored here.
    """

    pdf_path: str
    comparison_a_path: str
    comparison_b_path: str

    comparison_a_str: str
    comparison_b_str: str

    alignment: float

    @staticmethod
    def _method_from_path(md_path: str) -> str:
        """Extract ``<method>`` from a path ending in ``page<N>_<method>.md``.

        Raises:
            ValueError: If ``md_path`` does not match that naming scheme.
        """
        match = re.search(r'page[0-9]+_(\w+)\.md$', md_path)
        if match is None:
            # Fail with the offending path instead of an opaque
            # "'NoneType' object has no attribute 'group'".
            raise ValueError(
                f"Cannot determine method name from path {md_path!r}; "
                "expected it to end with 'page<N>_<method>.md'"
            )
        return match.group(1)

    @property
    def comparison_a_method(self) -> str:
        """Method name parsed from ``comparison_a_path``."""
        return self._method_from_path(self.comparison_a_path)

    @property
    def comparison_b_method(self) -> str:
        """Method name parsed from ``comparison_b_path``."""
        return self._method_from_path(self.comparison_b_path)
def build_review_page(args, comparisons):
    """Turn comparisons into review entries and hand them to create_review_html.

    Args:
        args: Parsed command-line namespace; only ``args.name`` is read, and
            ``args.name + ".html"`` is passed as the second argument.
        comparisons: Iterable of ``Comparison`` objects to include.

    Side A of each comparison fills the "gold" slots and side B the "eval"
    slots; the "page" entry is always 1.
    """
    entries = [
        {
            "s3_path": item.pdf_path,
            "page": 1,
            "gold_text": item.comparison_a_str,
            "gold_metadata": item.comparison_a_method,
            "eval_text": item.comparison_b_str,
            "eval_metadata": item.comparison_b_method,
            "alignment": item.alignment,
        }
        for item in comparisons
    ]

    output_name = args.name + ".html"
    create_review_html(entries, output_name)
2025-01-14 21:08:23 +00:00
def build_arg_parser():
    """Create the command-line parser for the comparison-page builder."""
    parser = argparse.ArgumentParser(
        description="Generates comparison voting pages between different pairs of parses for a PDF."
    )
    parser.add_argument(
        '--name',
        default="review_page",
        help="What name to give to this evaluation/comparison"
    )
    parser.add_argument(
        '--review_size',
        default=20,
        type=int,
        help="Number of entries to show on the generated review page",
    )
    parser.add_argument(
        '--comparisons',
        # nargs='+' makes a user-supplied value a list, like the default.
        # Without it a single value arrives as a plain string, which the
        # variant loop would then iterate character by character.
        # Because this option is greedy, give s3_path before it (or follow
        # it with another option) on the command line.
        nargs='+',
        default=["pdelf", "gotocr", "gotocr_format"],
        help="Different variants to compare against"
    )
    parser.add_argument(
        's3_path',
        type=str,
        help='Path to the folder where you keep your data files, expecting to see *.md files in there along with *.png and *.pdf'
    )
    return parser


def variant_md_path(pdf_path, variant):
    """Return the markdown path of ``variant`` for ``pdf_path``.

    Only a trailing ``.pdf`` is swapped for ``_<variant>.md``; any other
    ``.pdf`` substring in the path (e.g. in a folder name) is left alone.
    """
    suffix = ".pdf"
    stem = pdf_path[:-len(suffix)] if pdf_path.endswith(suffix) else pdf_path
    return f"{stem}_{variant}.md"


def main(argv=None):
    """Score every pair of available parses per PDF and build a review page.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    For each PDF under ``s3_path``, the markdown files of the requested
    variants that actually exist are paired up, each pair is scored with
    ``DocumentEditSimilarity``, and the ``review_size`` pairs with the lowest
    score are passed to ``build_review_page``.
    """
    args = build_arg_parser().parse_args(argv)

    segmenter = SpacySegmenter("spacy")
    aligner = HirschbergAligner(match_score=1,
                                mismatch_score=-1,
                                indel_score=-1)
    comparer = DocumentEditSimilarity(segmenter=segmenter, aligner=aligner)

    # Tolerate a trailing slash so the glob does not become ".../folder//*.pdf".
    base_path = args.s3_path.rstrip("/")
    all_pdfs = set(expand_s3_glob(s3_client, base_path + "/*.pdf"))
    all_mds = set(expand_s3_glob(s3_client, base_path + "/*.md"))

    all_comps = []
    for pdf_path in tqdm(all_pdfs):
        candidates = (variant_md_path(pdf_path, comp) for comp in args.comparisons)
        pdf_comps = [path for path in candidates if path in all_mds]

        if len(pdf_comps) < 2:
            # Nothing to pair up; skip the downloads entirely.
            continue

        # Fetch each markdown file once per PDF rather than once per pair.
        texts = {
            path: get_s3_bytes(s3_client, path).decode("utf-8")
            for path in pdf_comps
        }

        for (compa, compb) in combinations(pdf_comps, 2):
            # Randomize which parse is shown as side A and which as side B.
            if random.choice([True, False]):
                compa, compb = compb, compa

            text_a = texts[compa]
            text_b = texts[compb]

            all_comps.append(
                Comparison(pdf_path=pdf_path,
                           comparison_a_path=compa,
                           comparison_b_path=compb,
                           comparison_a_str=text_a,
                           comparison_b_str=text_b,
                           alignment=comparer.compute(text_a, text_b))
            )

    # Ascending sort: the lowest-scoring pairs come first and are the ones
    # kept for the review page.
    all_comps.sort(key=lambda c: c.alignment)

    build_review_page(args, all_comps[:args.review_size])


if __name__ == "__main__":
    main()