| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  | import argparse | 
					
						
							|  |  |  | import dataclasses | 
					
						
							| 
									
										
										
										
											2025-01-29 15:25:10 -08:00
										 |  |  | import functools | 
					
						
							| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  | import random | 
					
						
							| 
									
										
										
										
											2025-01-14 22:40:56 +00:00
										 |  |  | import re | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  | from concurrent.futures import ProcessPoolExecutor, as_completed | 
					
						
							| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  | from itertools import combinations | 
					
						
							| 
									
										
										
										
											2025-01-29 15:25:10 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | import boto3 | 
					
						
							|  |  |  | from dolma_refine.evaluate.aligners import HirschbergAligner | 
					
						
							| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  | from dolma_refine.evaluate.metrics import DocumentEditSimilarity | 
					
						
							|  |  |  | from dolma_refine.evaluate.segmenters import SpacySegmenter | 
					
						
							| 
									
										
										
										
											2025-01-29 15:25:10 -08:00
										 |  |  | from tqdm import tqdm | 
					
						
							| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-27 18:30:41 +00:00
										 |  |  | from olmocr.eval.evalhtml import create_review_html | 
					
						
							| 
									
										
										
										
											2025-01-29 15:47:57 -08:00
										 |  |  | from olmocr.s3_utils import expand_s3_glob, get_s3_bytes | 
					
						
							| 
									
										
										
										
											2025-01-29 15:25:10 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-14 22:40:56 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  | @dataclasses.dataclass | 
					
						
							|  |  |  | class Comparison: | 
					
						
							|  |  |  |     pdf_path: str | 
					
						
							|  |  |  |     comparison_a_path: str | 
					
						
							|  |  |  |     comparison_b_path: str | 
					
						
							| 
									
										
										
										
											2025-01-14 22:40:56 +00:00
										 |  |  |     comparison_a_str: str | 
					
						
							|  |  |  |     comparison_b_str: str | 
					
						
							| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  |     alignment: float | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-14 22:40:56 +00:00
										 |  |  |     @property | 
					
						
							|  |  |  |     def comparison_a_method(self): | 
					
						
							| 
									
										
										
										
											2025-02-07 16:05:00 -08:00
										 |  |  |         match = re.search(r"page[0-9]+_(\w+)\.md$", self.comparison_a_path) | 
					
						
							|  |  |  |         if match: | 
					
						
							|  |  |  |             return match.group(1) | 
					
						
							|  |  |  |         raise ValueError(f"No match found in path: {self.comparison_a_path}") | 
					
						
							| 
									
										
										
										
											2025-01-14 22:40:56 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def comparison_b_method(self): | 
					
						
							| 
									
										
										
										
											2025-02-07 16:05:00 -08:00
										 |  |  |         match = re.search(r"page[0-9]+_(\w+)\.md$", self.comparison_b_path) | 
					
						
							|  |  |  |         if match: | 
					
						
							|  |  |  |             return match.group(1) | 
					
						
							|  |  |  |         raise ValueError(f"No match found in path: {self.comparison_b_path}") | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-14 22:40:56 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  | def process_single_pdf(pdf_path, all_mds, comparisons, segmenter_name="spacy"): | 
					
						
							|  |  |  |     """Process a single PDF and return its comparisons.""" | 
					
						
							|  |  |  |     # Create resources inside the worker process | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |     s3_client = boto3.client("s3") | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |     segmenter = SpacySegmenter(segmenter_name) | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |     aligner = HirschbergAligner(match_score=1, mismatch_score=-1, indel_score=-1) | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |     comparer = DocumentEditSimilarity(segmenter=segmenter, aligner=aligner) | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |     pdf_comps = [] | 
					
						
							|  |  |  |     result_comps = [] | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |     # Get all comparison files for this PDF | 
					
						
							|  |  |  |     for comp in comparisons: | 
					
						
							|  |  |  |         comp_path = pdf_path.replace(".pdf", f"_{comp}.md") | 
					
						
							|  |  |  |         if comp_path in all_mds: | 
					
						
							|  |  |  |             pdf_comps.append(comp_path) | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |     # Generate all possible combinations | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |     for compa, compb in combinations(pdf_comps, 2): | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |         if random.choice([True, False]): | 
					
						
							|  |  |  |             compa, compb = compb, compa | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Get the text content | 
					
						
							|  |  |  |         text_a = get_s3_bytes(s3_client, compa).decode("utf-8") | 
					
						
							|  |  |  |         text_b = get_s3_bytes(s3_client, compb).decode("utf-8") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         result_comps.append( | 
					
						
							|  |  |  |             Comparison( | 
					
						
							|  |  |  |                 pdf_path=pdf_path, | 
					
						
							|  |  |  |                 comparison_a_path=compa, | 
					
						
							|  |  |  |                 comparison_b_path=compb, | 
					
						
							|  |  |  |                 comparison_a_str=text_a, | 
					
						
							|  |  |  |                 comparison_b_str=text_b, | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |                 alignment=comparer.compute(text_a, text_b), | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |             ) | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |     return result_comps | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  | def build_review_page(args, comparisons, index=0): | 
					
						
							| 
									
										
										
										
											2025-01-14 22:40:56 +00:00
										 |  |  |     page_data = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for comp in comparisons: | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         page_data.append( | 
					
						
							|  |  |  |             { | 
					
						
							| 
									
										
										
										
											2025-01-14 22:40:56 +00:00
										 |  |  |                 "s3_path": comp.pdf_path, | 
					
						
							|  |  |  |                 "page": 1, | 
					
						
							| 
									
										
										
										
											2025-01-14 22:57:17 +00:00
										 |  |  |                 "entry_key": comp.pdf_path + "-" + comp.comparison_a_method + "-" + comp.comparison_b_method, | 
					
						
							| 
									
										
										
										
											2025-01-14 22:40:56 +00:00
										 |  |  |                 "gold_text": comp.comparison_a_str, | 
					
						
							|  |  |  |                 "gold_metadata": comp.comparison_a_method, | 
					
						
							|  |  |  |                 "eval_text": comp.comparison_b_str, | 
					
						
							|  |  |  |                 "eval_metadata": comp.comparison_b_method, | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |                 "alignment": comp.alignment, | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |     report_name = f"{args.name}{f'_{index}' if args.num_copies > 1 else ''}.html" | 
					
						
							|  |  |  |     create_review_html(page_data, report_name) | 
					
						
							| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  | if __name__ == "__main__": | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |     parser = argparse.ArgumentParser(description="Generates comparison voting pages between different pairs of parses for a PDF.") | 
					
						
							|  |  |  |     parser.add_argument("--name", default="review_page", help="What name to give to this evaluation/comparison") | 
					
						
							| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  |     parser.add_argument( | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         "--review_size", | 
					
						
							| 
									
										
										
										
											2025-01-15 23:35:18 +00:00
										 |  |  |         default=50, | 
					
						
							| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  |         type=int, | 
					
						
							|  |  |  |         help="Number of entries to show on the generated review page", | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |     parser.add_argument( | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         "--max_workers", | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |         type=int, | 
					
						
							|  |  |  |         default=None, | 
					
						
							|  |  |  |         help="Maximum number of worker processes to use for parallel processing", | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |     parser.add_argument("--comparisons", default=["pdelf", "marker", "gotocr_format", "mineru"], help="Different variants to compare against") | 
					
						
							| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  |     parser.add_argument( | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         "--num_copies", | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |         default=1, | 
					
						
							|  |  |  |         type=int, | 
					
						
							|  |  |  |         help="Number of reports to generate, labeled _0, _1, etc. if greater than 1", | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  |     parser.add_argument( | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         "s3_path", type=str, help="Path to the folder where you keep your data files, expecting to see *.md files in there along with *.png and *.pdf" | 
					
						
							| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     args = parser.parse_args() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |     # Create S3 client only for initial file listing | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |     s3_client = boto3.client("s3") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |     # Get all PDFs and MD files | 
					
						
							| 
									
										
										
										
											2025-01-14 21:08:23 +00:00
										 |  |  |     all_pdfs = set(expand_s3_glob(s3_client, args.s3_path + "/*.pdf")) | 
					
						
							|  |  |  |     all_mds = set(expand_s3_glob(s3_client, args.s3_path + "/*.md")) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |     all_comps = [] | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |     # Create a partial function with all the common arguments | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |     process_pdf = functools.partial(process_single_pdf, all_mds=all_mds, comparisons=args.comparisons) | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Use ProcessPoolExecutor for parallel processing | 
					
						
							|  |  |  |     with ProcessPoolExecutor(max_workers=args.max_workers) as executor: | 
					
						
							|  |  |  |         # Submit all PDF processing tasks | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         future_to_pdf = {executor.submit(process_pdf, pdf_path): pdf_path for pdf_path in all_pdfs} | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # Process results as they complete using tqdm for progress | 
					
						
							|  |  |  |         for future in tqdm(as_completed(future_to_pdf), total=len(all_pdfs)): | 
					
						
							|  |  |  |             pdf_path = future_to_pdf[future] | 
					
						
							|  |  |  |             try: | 
					
						
							|  |  |  |                 pdf_results = future.result() | 
					
						
							|  |  |  |                 all_comps.extend(pdf_results) | 
					
						
							|  |  |  |             except Exception as e: | 
					
						
							|  |  |  |                 print(f"Error processing {pdf_path}: {str(e)}") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-16 18:00:12 +00:00
										 |  |  |     # Remove all results where the alignment is > 0.96 as these are just too similar to be useful | 
					
						
							|  |  |  |     all_comps = [c for c in all_comps if c.alignment < 0.96] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |     # Shuffle the results | 
					
						
							| 
									
										
										
										
											2025-01-15 23:35:18 +00:00
										 |  |  |     random.shuffle(all_comps) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-16 00:22:29 +00:00
										 |  |  |     # Generate the specified number of copies of the report | 
					
						
							|  |  |  |     for i in range(args.num_copies): | 
					
						
							|  |  |  |         start_index = i * args.review_size | 
					
						
							|  |  |  |         end_index = start_index + args.review_size | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Check if there is enough data for the next report | 
					
						
							|  |  |  |         if start_index >= len(all_comps): | 
					
						
							|  |  |  |             print(f"Not enough data to generate report {i}. Stopping early.") | 
					
						
							|  |  |  |             break | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         build_review_page(args, all_comps[start_index:end_index], index=i) |