| 
									
										
										
										
											2024-09-24 21:57:51 +00:00
										 |  |  | import os | 
					
						
							| 
									
										
										
										
											2024-10-09 17:53:26 +00:00
										 |  |  | import random | 
					
						
							| 
									
										
										
										
											2024-09-24 21:57:51 +00:00
										 |  |  | import tempfile | 
					
						
							| 
									
										
										
										
											2024-10-09 17:53:26 +00:00
										 |  |  | from concurrent.futures import ThreadPoolExecutor | 
					
						
							|  |  |  | from difflib import SequenceMatcher | 
					
						
							| 
									
										
										
										
											2025-01-29 15:25:10 -08:00
										 |  |  | from urllib.parse import urlparse | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import boto3 | 
					
						
							|  |  |  | from jinja2 import Template | 
					
						
							| 
									
										
										
										
											2024-09-24 21:57:51 +00:00
										 |  |  | from tqdm import tqdm | 
					
						
							| 
									
										
										
										
											2025-01-29 15:25:10 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-27 18:30:41 +00:00
										 |  |  | from olmocr.data.renderpdf import render_pdf_to_base64png | 
					
						
							| 
									
										
										
										
											2024-10-09 16:57:13 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
# Module-level boto3 session and S3 client, created once at import time.
# NOTE(review): the profile name "s2" is hard-coded — presumably it must exist
# in the local AWS configuration of whoever runs this; confirm.
session = boto3.Session(profile_name="s2")
s3_client = session.client("s3")
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-24 21:57:51 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-09 17:53:26 +00:00
										 |  |  | def generate_diff_html(a, b): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Generates HTML with differences between strings a and b. | 
					
						
							|  |  |  |     Additions in 'b' are highlighted in green, deletions from 'a' are highlighted in red. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     seq_matcher = SequenceMatcher(None, a, b) | 
					
						
							|  |  |  |     output_html = "" | 
					
						
							|  |  |  |     for opcode, a0, a1, b0, b1 in seq_matcher.get_opcodes(): | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         if opcode == "equal": | 
					
						
							| 
									
										
										
										
											2024-10-09 17:53:26 +00:00
										 |  |  |             output_html += a[a0:a1] | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         elif opcode == "insert": | 
					
						
							| 
									
										
										
										
											2024-10-09 17:53:26 +00:00
										 |  |  |             output_html += f"<span class='added'>{b[b0:b1]}</span>" | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         elif opcode == "delete": | 
					
						
							| 
									
										
										
										
											2024-10-09 17:53:26 +00:00
										 |  |  |             output_html += f"<span class='removed'>{a[a0:a1]}</span>" | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         elif opcode == "replace": | 
					
						
							| 
									
										
										
										
											2024-10-09 17:53:26 +00:00
										 |  |  |             output_html += f"<span class='removed'>{a[a0:a1]}</span><span class='added'>{b[b0:b1]}</span>" | 
					
						
							|  |  |  |     return output_html | 
					
						
							| 
									
										
										
										
											2024-09-24 21:57:51 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 16:46:35 +00:00
										 |  |  | def process_entry(i, entry): | 
					
						
							|  |  |  |     # Randomly decide whether to display gold on the left or right | 
					
						
							|  |  |  |     if random.choice([True, False]): | 
					
						
							|  |  |  |         left_text, right_text = entry["gold_text"], entry["eval_text"] | 
					
						
							|  |  |  |         left_class, right_class = "gold", "eval" | 
					
						
							| 
									
										
										
										
											2025-01-14 22:40:56 +00:00
										 |  |  |         left_metadata, right_metadata = entry.get("gold_metadata", ""), entry.get("eval_metadata", "") | 
					
						
							| 
									
										
										
										
											2024-10-01 16:46:35 +00:00
										 |  |  |     else: | 
					
						
							|  |  |  |         left_text, right_text = entry["eval_text"], entry["gold_text"] | 
					
						
							|  |  |  |         left_class, right_class = "eval", "gold" | 
					
						
							| 
									
										
										
										
											2025-01-14 22:40:56 +00:00
										 |  |  |         left_metadata, right_metadata = entry.get("eval_metadata", ""), entry.get("gold_metadata", "") | 
					
						
							| 
									
										
										
										
											2024-10-01 16:46:35 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-09 17:53:26 +00:00
										 |  |  |     # Generate diff for right_text compared to left_text | 
					
						
							|  |  |  |     diff_html = generate_diff_html(left_text, right_text) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 16:46:35 +00:00
										 |  |  |     left_text = "<p>" + left_text.replace("\n", "</p><p>") + "</p>" | 
					
						
							|  |  |  |     right_text = "<p>" + right_text.replace("\n", "</p><p>") + "</p>" | 
					
						
							| 
									
										
										
										
											2024-10-09 17:53:26 +00:00
										 |  |  |     diff_html = "<p>" + diff_html.replace("\n", "</p><p>") + "</p>" | 
					
						
							| 
									
										
										
										
											2024-10-01 16:46:35 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     parsed_url = urlparse(entry["s3_path"]) | 
					
						
							|  |  |  |     bucket = parsed_url.netloc | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |     s3_key = parsed_url.path.lstrip("/") | 
					
						
							|  |  |  |     signed_pdf_link = s3_client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": s3_key}, ExpiresIn=604800) | 
					
						
							| 
									
										
										
										
											2024-10-01 16:46:35 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-09 16:57:13 +00:00
										 |  |  |     with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf: | 
					
						
							|  |  |  |         pdf_path = tmp_pdf.name | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         bucket, key = entry["s3_path"].replace("s3://", "").split("/", 1) | 
					
						
							| 
									
										
										
										
											2024-10-09 16:57:13 +00:00
										 |  |  |         s3_client.download_file(bucket, key, pdf_path) | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         page_image_base64 = render_pdf_to_base64png(tmp_pdf.name, entry["page"], target_longest_image_dim=1024) | 
					
						
							| 
									
										
										
										
											2024-10-09 16:57:13 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 16:46:35 +00:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "entry_id": i, | 
					
						
							| 
									
										
										
										
											2024-10-09 16:57:13 +00:00
										 |  |  |         "page_image": page_image_base64, | 
					
						
							| 
									
										
										
										
											2024-10-01 16:46:35 +00:00
										 |  |  |         "s3_path": entry["s3_path"], | 
					
						
							|  |  |  |         "page": entry["page"], | 
					
						
							| 
									
										
										
										
											2025-01-14 22:57:17 +00:00
										 |  |  |         "key": entry.get("entry_key", entry["s3_path"] + "_" + str(entry["page"])), | 
					
						
							| 
									
										
										
										
											2024-10-09 17:53:26 +00:00
										 |  |  |         "alignment": entry["alignment"], | 
					
						
							| 
									
										
										
										
											2024-10-01 16:46:35 +00:00
										 |  |  |         "signed_pdf_link": signed_pdf_link, | 
					
						
							| 
									
										
										
										
											2025-01-14 22:40:56 +00:00
										 |  |  |         "left_metadata": left_metadata, | 
					
						
							|  |  |  |         "right_metadata": right_metadata, | 
					
						
							| 
									
										
										
										
											2024-10-01 16:46:35 +00:00
										 |  |  |         "left_text": left_text, | 
					
						
							|  |  |  |         "right_text": right_text, | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         "diff_text": diff_html, | 
					
						
							| 
									
										
										
										
											2024-10-01 16:46:35 +00:00
										 |  |  |         "left_class": left_class, | 
					
						
							|  |  |  |         "right_class": right_class, | 
					
						
							|  |  |  |         "gold_class": "gold" if left_class == "gold" else "eval", | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         "eval_class": "eval" if right_class == "eval" else "gold", | 
					
						
							| 
									
										
										
										
											2024-10-01 16:46:35 +00:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-24 21:57:51 +00:00
										 |  |  | def create_review_html(data, filename="review_page.html"): | 
					
						
							|  |  |  |     # Load the Jinja2 template from the file | 
					
						
							| 
									
										
										
										
											2024-10-09 17:53:26 +00:00
										 |  |  |     template_path = os.path.join(os.path.dirname(__file__), "evalhtml_template.html") | 
					
						
							|  |  |  |     with open(template_path, "r") as f: | 
					
						
							| 
									
										
										
										
											2024-09-24 21:57:51 +00:00
										 |  |  |         template = Template(f.read()) | 
					
						
							| 
									
										
										
										
											2024-10-01 16:46:35 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-24 21:57:51 +00:00
										 |  |  |     entries = [] | 
					
						
							| 
									
										
										
										
											2024-10-01 16:46:35 +00:00
										 |  |  |     with ThreadPoolExecutor() as executor: | 
					
						
							|  |  |  |         # Submit tasks to the executor | 
					
						
							|  |  |  |         futures = [executor.submit(process_entry, i, entry) for i, entry in enumerate(data)] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Process the results as they are completed | 
					
						
							|  |  |  |         for future in tqdm(futures): | 
					
						
							|  |  |  |             entries.append(future.result()) | 
					
						
							| 
									
										
										
										
											2024-09-24 21:57:51 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Render the template with the entries | 
					
						
							|  |  |  |     final_html = template.render(entries=entries) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Write the HTML content to the specified file | 
					
						
							|  |  |  |     with open(filename, "w") as f: | 
					
						
							|  |  |  |         f.write(final_html) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     print(f"HTML file '{filename}' created successfully!") |