| 
									
										
										
										
											2025-05-29 23:23:02 +00:00
										 |  |  | import base64 | 
					
						
							| 
									
										
										
										
											2024-10-14 21:37:14 +00:00
										 |  |  | import glob | 
					
						
							| 
									
										
										
										
											2025-01-29 15:25:10 -08:00
										 |  |  | import io | 
					
						
							|  |  |  | import json | 
					
						
							|  |  |  | import os | 
					
						
							| 
									
										
										
										
											2025-05-28 18:27:40 +00:00
										 |  |  | import re | 
					
						
							|  |  |  | import tempfile | 
					
						
							| 
									
										
										
										
											2025-05-29 23:23:02 +00:00
										 |  |  | import unittest | 
					
						
							| 
									
										
										
										
											2024-10-01 22:10:58 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | from pypdf import PdfReader | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-05-29 23:23:02 +00:00
										 |  |  | from olmocr.data.renderpdf import ( | 
					
						
							|  |  |  |     get_pdf_media_box_width_height, | 
					
						
							|  |  |  |     render_pdf_to_base64png, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2025-05-28 18:27:40 +00:00
										 |  |  | from olmocr.image_utils import convert_image_to_pdf_bytes | 
					
						
							| 
									
										
										
										
											2025-05-29 23:23:02 +00:00
										 |  |  | from olmocr.prompts.anchor import _linearize_pdf_report, _pdf_report, get_anchor_text | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-07 17:01:59 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 22:10:58 +00:00
										 |  |  | class AnchorTest(unittest.TestCase): | 
					
						
							|  |  |  |     def testExtractText(self): | 
					
						
							| 
									
										
										
										
											2024-10-01 23:15:53 +00:00
										 |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf") | 
					
						
							| 
									
										
										
										
											2024-10-01 22:10:58 +00:00
										 |  |  |         reader = PdfReader(local_pdf_path) | 
					
						
							| 
									
										
										
										
											2024-10-01 23:15:53 +00:00
										 |  |  |         page = reader.pages[0] | 
					
						
							| 
									
										
										
										
											2024-10-01 22:10:58 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         def visitor_body(text, cm, tm, font_dict, font_size): | 
					
						
							| 
									
										
										
										
											2024-10-01 23:15:53 +00:00
										 |  |  |             print(repr(text), cm, tm, font_size) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         def visitor_op(op, args, cm, tm): | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |             # print(op, args, cm, tm) | 
					
						
							| 
									
										
										
										
											2024-10-01 23:15:53 +00:00
										 |  |  |             pass | 
					
						
							| 
									
										
										
										
											2024-10-01 22:10:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 23:15:53 +00:00
										 |  |  |         page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op) | 
					
						
							| 
									
										
										
										
											2024-10-01 22:10:58 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def testAnchorBase(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 23:15:53 +00:00
										 |  |  |         report = _pdf_report(local_pdf_path, 2) | 
					
						
							| 
									
										
										
										
											2024-10-01 22:10:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 23:15:53 +00:00
										 |  |  |         print(report) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport")) | 
					
						
							| 
									
										
										
										
											2024-10-01 22:10:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 23:15:53 +00:00
										 |  |  |     def testAnchorImage(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf") | 
					
						
							| 
									
										
										
										
											2024-10-01 22:10:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-02 16:44:39 +00:00
										 |  |  |         report = _pdf_report(local_pdf_path, 1) | 
					
						
							| 
									
										
										
										
											2024-10-01 22:10:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 23:15:53 +00:00
										 |  |  |         print(report) | 
					
						
							| 
									
										
										
										
											2024-10-01 22:10:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-02 22:17:15 +00:00
										 |  |  |         print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport")) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def testSmallPage(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         report = _pdf_report(local_pdf_path, 1) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(report) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport")) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-07 17:01:59 +00:00
										 |  |  |     def testBadUTFSurrogatePairsGeneration(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "badlines.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         jsondata = json.dumps({"text": anchor_text}) | 
					
						
							| 
									
										
										
										
											2024-10-07 17:01:59 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         import pyarrow as pa | 
					
						
							|  |  |  |         import pyarrow.compute as pc | 
					
						
							| 
									
										
										
										
											2025-01-29 15:25:10 -08:00
										 |  |  |         import pyarrow.json as paj | 
					
						
							| 
									
										
										
										
											2024-10-07 17:01:59 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         buffer = io.BytesIO(jsondata.encode("utf-8")) | 
					
						
							| 
									
										
										
										
											2024-10-07 17:01:59 +00:00
										 |  |  |         paj.read_json(buffer, read_options=paj.ReadOptions(use_threads=False, block_size=len(jsondata))) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-08 21:23:21 +00:00
										 |  |  |     def testLargePromptHint1(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "large_prompt_hint1.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(anchor_text) | 
					
						
							|  |  |  |         print(len(anchor_text)) | 
					
						
							| 
									
										
										
										
											2024-10-30 16:26:02 +00:00
										 |  |  |         self.assertLessEqual(len(anchor_text), 1000) | 
					
						
							| 
									
										
										
										
											2024-10-08 21:23:21 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def testLargePromptHint2(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "large_prompt_hint2.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(anchor_text) | 
					
						
							|  |  |  |         print(len(anchor_text)) | 
					
						
							| 
									
										
										
										
											2024-10-30 16:26:02 +00:00
										 |  |  |         self.assertLessEqual(len(anchor_text), 4000) | 
					
						
							| 
									
										
										
										
											2024-10-08 21:23:21 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-09 16:29:20 +00:00
										 |  |  |     def testLargePromptHint3(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "large_prompt_hint3.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(anchor_text) | 
					
						
							|  |  |  |         print(len(anchor_text)) | 
					
						
							| 
									
										
										
										
											2024-10-30 16:26:02 +00:00
										 |  |  |         self.assertLessEqual(len(anchor_text), 4000) | 
					
						
							| 
									
										
										
										
											2024-10-09 16:29:20 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-08 21:23:21 +00:00
										 |  |  |     def testNewsPaperPromptHint(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "newspaper.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(anchor_text) | 
					
						
							|  |  |  |         print(len(anchor_text)) | 
					
						
							| 
									
										
										
										
											2024-10-30 16:26:02 +00:00
										 |  |  |         self.assertLessEqual(len(anchor_text), 4000) | 
					
						
							| 
									
										
										
										
											2024-10-08 21:23:21 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-16 16:05:44 +00:00
										 |  |  |     def testTobaccoPaperMissingParagraphs(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(anchor_text) | 
					
						
							|  |  |  |         print(len(anchor_text)) | 
					
						
							| 
									
										
										
										
											2024-10-30 16:26:02 +00:00
										 |  |  |         self.assertLessEqual(len(anchor_text), 4000) | 
					
						
							| 
									
										
										
										
											2024-10-16 16:05:44 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-23 22:17:20 +00:00
										 |  |  |     def testAnchorOtherLengths(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=2000) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(anchor_text) | 
					
						
							|  |  |  |         print(len(anchor_text)) | 
					
						
							| 
									
										
										
										
											2024-10-30 16:26:02 +00:00
										 |  |  |         self.assertLessEqual(len(anchor_text), 2000) | 
					
						
							| 
									
										
										
										
											2024-10-23 22:17:20 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=6000) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(anchor_text) | 
					
						
							|  |  |  |         print(len(anchor_text)) | 
					
						
							| 
									
										
										
										
											2024-10-30 16:26:02 +00:00
										 |  |  |         self.assertLessEqual(len(anchor_text), 6000) | 
					
						
							| 
									
										
										
										
											2024-10-23 22:17:20 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-17 17:00:02 +00:00
										 |  |  |     def testFailingAnchor(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "failing_anchor_pg4.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(anchor_text) | 
					
						
							|  |  |  |         print(len(anchor_text)) | 
					
						
							| 
									
										
										
										
											2025-02-14 20:51:04 +00:00
										 |  |  |         self.assertLessEqual(len(anchor_text), 4000) | 
					
						
							| 
									
										
										
										
											2024-10-08 21:23:21 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-23 22:17:20 +00:00
										 |  |  |     def testEmptyAnchor(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=0) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         self.assertEqual(anchor_text.strip(), "Page dimensions: 612.0x792.0") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-05-28 18:27:40 +00:00
										 |  |  |     def testEmptyAnchorMatchesImageAnchor(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "edgar.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         orig_anchor = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport") | 
					
						
							|  |  |  |         print(orig_anchor) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         lenneg1_anchor = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=-1) | 
					
						
							|  |  |  |         print(lenneg1_anchor) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         base64_png = render_pdf_to_base64png(local_pdf_path, 1, target_longest_image_dim=1024) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Decode base64 and save to temporary file | 
					
						
							|  |  |  |         temp_img = tempfile.NamedTemporaryFile("wb", suffix=".png", delete=False) | 
					
						
							|  |  |  |         temp_img.write(base64.b64decode(base64_png)) | 
					
						
							|  |  |  |         temp_img.close() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Convert all images to a single PDF using our enhanced function | 
					
						
							|  |  |  |         pdf_bytes = convert_image_to_pdf_bytes([temp_img.name]) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Write the PDF bytes to a temporary file | 
					
						
							|  |  |  |         temp_pdf = tempfile.NamedTemporaryFile("wb", suffix=".pdf", delete=False) | 
					
						
							|  |  |  |         temp_pdf.write(pdf_bytes) | 
					
						
							|  |  |  |         temp_pdf.close() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Update pdf_path to the new file | 
					
						
							|  |  |  |         img_pdf_path = temp_pdf.name | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         image_only_anchor = get_anchor_text(img_pdf_path, 1, pdf_engine="pdfreport") | 
					
						
							|  |  |  |         print(image_only_anchor) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Parse page dimensions from both anchors and check with tolerance | 
					
						
							|  |  |  |         # Extract page dimensions and image bounds | 
					
						
							| 
									
										
										
										
											2025-05-29 23:23:02 +00:00
										 |  |  |         img_lines = image_only_anchor.strip().split("\n") | 
					
						
							|  |  |  |         len_lines = lenneg1_anchor.strip().split("\n") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         img_page_match = re.search(r"Page dimensions: ([\d.]+)x([\d.]+)", img_lines[0]) | 
					
						
							|  |  |  |         img_image_match = re.search(r"\[Image \d+x\d+ to (\d+)x(\d+)\]", img_lines[1]) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         len_page_match = re.search(r"Page dimensions: ([\d.]+)x([\d.]+)", len_lines[0]) | 
					
						
							|  |  |  |         len_image_match = re.search(r"\[Image \d+x\d+ to (\d+)x(\d+)\]", len_lines[1]) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-05-28 18:27:40 +00:00
										 |  |  |         self.assertIsNotNone(img_page_match, f"Could not parse image anchor page dims: {image_only_anchor}") | 
					
						
							|  |  |  |         self.assertIsNotNone(img_image_match, f"Could not parse image anchor image dims: {image_only_anchor}") | 
					
						
							|  |  |  |         self.assertIsNotNone(len_page_match, f"Could not parse lenneg1 anchor page dims: {lenneg1_anchor}") | 
					
						
							|  |  |  |         self.assertIsNotNone(len_image_match, f"Could not parse lenneg1 anchor image dims: {lenneg1_anchor}") | 
					
						
							| 
									
										
										
										
											2025-05-29 23:23:02 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-05-28 18:27:40 +00:00
										 |  |  |         img_page_w, img_page_h = float(img_page_match.group(1)), float(img_page_match.group(2)) | 
					
						
							|  |  |  |         img_img_w, img_img_h = int(img_image_match.group(1)), int(img_image_match.group(2)) | 
					
						
							| 
									
										
										
										
											2025-05-29 23:23:02 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-05-28 18:27:40 +00:00
										 |  |  |         len_page_w, len_page_h = float(len_page_match.group(1)), float(len_page_match.group(2)) | 
					
						
							|  |  |  |         len_img_w, len_img_h = int(len_image_match.group(1)), int(len_image_match.group(2)) | 
					
						
							| 
									
										
										
										
											2025-05-29 23:23:02 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-05-28 18:27:40 +00:00
										 |  |  |         # Check page dimensions are within 1.4 tolerance | 
					
						
							| 
									
										
										
										
											2025-05-29 23:23:02 +00:00
										 |  |  |         self.assertAlmostEqual(img_page_w, len_page_w, delta=1.4, msg=f"Page width mismatch: {img_page_w} vs {len_page_w}") | 
					
						
							|  |  |  |         self.assertAlmostEqual(img_page_h, len_page_h, delta=1.4, msg=f"Page height mismatch: {img_page_h} vs {len_page_h}") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-05-28 18:27:40 +00:00
										 |  |  |         # Check image dimensions are within 1 point tolerance | 
					
						
							| 
									
										
										
										
											2025-05-29 23:23:02 +00:00
										 |  |  |         self.assertAlmostEqual(img_img_w, len_img_w, delta=1, msg=f"Image width mismatch: {img_img_w} vs {len_img_w}") | 
					
						
							|  |  |  |         self.assertAlmostEqual(img_img_h, len_img_h, delta=1, msg=f"Image height mismatch: {img_img_h} vs {len_img_h}") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-05-28 18:27:40 +00:00
										 |  |  |         self.assertEqual(image_only_anchor[:5], lenneg1_anchor[:5]) | 
					
						
							|  |  |  |         self.assertEqual(image_only_anchor[-1:], lenneg1_anchor[-1:]) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-20 19:37:00 +00:00
										 |  |  |     def testCannotLoad(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "load_v_error.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         reader = PdfReader(local_pdf_path) | 
					
						
							|  |  |  |         page = 5 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport", target_length=6000) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(anchor_text) | 
					
						
							|  |  |  |         print(len(anchor_text)) | 
					
						
							| 
									
										
										
										
											2025-02-14 20:51:04 +00:00
										 |  |  |         self.assertLessEqual(len(anchor_text), 6000) | 
					
						
							| 
									
										
										
										
											2024-11-20 19:37:00 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 15:15:10 -08:00
										 |  |  |     @unittest.skip("TODO, this unit test still fails, the map text is too large.") | 
					
						
							| 
									
										
										
										
											2024-11-18 09:03:24 -08:00
										 |  |  |     def testExcessiveMapAnchor(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "map1.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=6000) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(anchor_text) | 
					
						
							|  |  |  |         print(len(anchor_text)) | 
					
						
							| 
									
										
										
										
											2025-02-14 20:51:04 +00:00
										 |  |  |         self.assertLessEqual(len(anchor_text), 4000) | 
					
						
							| 
									
										
										
										
											2024-11-18 09:03:24 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-11 18:45:42 +00:00
										 |  |  |     def testKyleOnePageAnchors1(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "dolma-page-1.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=6000) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(anchor_text) | 
					
						
							|  |  |  |         print(len(anchor_text)) | 
					
						
							| 
									
										
										
										
											2025-02-14 20:51:04 +00:00
										 |  |  |         self.assertLessEqual(len(anchor_text), 6000) | 
					
						
							| 
									
										
										
										
											2025-02-11 18:45:42 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def testKyleOnePageAnchors2(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "olmo-page-1.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=6000) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(anchor_text) | 
					
						
							|  |  |  |         print(len(anchor_text)) | 
					
						
							| 
									
										
										
										
											2025-02-14 20:51:04 +00:00
										 |  |  |         self.assertLessEqual(len(anchor_text), 6000) | 
					
						
							| 
									
										
										
										
											2025-02-11 18:45:42 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-02 22:17:15 +00:00
										 |  |  | class BuildSilverTest(unittest.TestCase): | 
					
						
							|  |  |  |     def testSmallPage(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-27 18:30:41 +00:00
										 |  |  |         from olmocr.data.buildsilver import build_page_query | 
					
						
							| 
									
										
										
										
											2024-10-02 22:17:15 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         result = build_page_query(local_pdf_path, "s3://test.pdf", 1) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-27 18:30:41 +00:00
										 |  |  |         from olmocr.data.renderpdf import get_png_dimensions_from_base64 | 
					
						
							| 
									
										
										
										
											2024-10-02 22:17:15 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         base64data = result["body"]["messages"][0]["content"][1]["image_url"]["url"] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if base64data.startswith("data:image/png;base64,"): | 
					
						
							|  |  |  |             base64data = base64data[22:] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         width, height = get_png_dimensions_from_base64(base64data) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(width, height) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-14 21:37:14 +00:00
										 |  |  |         assert max(width, height) == 2048 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-14 21:37:14 +00:00
										 |  |  | class TestRenderPdf(unittest.TestCase): | 
					
						
							|  |  |  |     def testFastMediaBoxMatchesPyPdf(self): | 
					
						
							|  |  |  |         for file in glob.glob(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "*.pdf")): | 
					
						
							|  |  |  |             reader = PdfReader(file) | 
					
						
							|  |  |  |             print("checking", file) | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-14 21:37:14 +00:00
										 |  |  |             for page_num in range(1, len(reader.pages) + 1): | 
					
						
							|  |  |  |                 w1, h1 = get_pdf_media_box_width_height(file, page_num) | 
					
						
							|  |  |  |                 pypdfpage = reader.pages[page_num - 1] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-25 09:13:13 -08:00
										 |  |  |                 self.assertAlmostEqual(w1, pypdfpage.mediabox.width, places=3) | 
					
						
							| 
									
										
										
										
											2025-01-10 19:38:42 +00:00
										 |  |  |                 self.assertAlmostEqual(h1, pypdfpage.mediabox.height, places=3) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-10 19:38:42 +00:00
										 |  |  | class TestOutputSamplePage(unittest.TestCase): | 
					
						
							|  |  |  |     def testTobaccoPaper(self): | 
					
						
							|  |  |  |         local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf") | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         anchor_text = get_anchor_text(local_pdf_path, 1, "pdfreport", target_length=6000) | 
					
						
							| 
									
										
										
										
											2025-01-10 19:38:42 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         print("") | 
					
						
							|  |  |  |         print(anchor_text) | 
					
						
							| 
									
										
										
										
											2025-01-29 15:30:39 -08:00
										 |  |  |         print("") |