import base64
import urllib.request
from io import BytesIO

import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
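
# Note: besides torch, transformers, and pillow, this script assumes the
# olmocr package is installed (e.g., pip install olmocr).
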
if __name__ == "__main__":
    # Initialize the model. The olmOCR checkpoint is Qwen2.5-VL-based, which is
    # why the stock Qwen2.5-VL processor is used for tokenization and image preprocessing.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-1025", torch_dtype=torch.bfloat16).eval()
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
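
    # Note: bfloat16 assumes fairly recent GPU support; torch.float16 or
    # torch.float32 should work as drop-in dtype fallbacks on older hardware
    # (an assumption, not part of the original script).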

    # Grab a sample PDF
    urllib.request.urlretrieve("https://olmocr.allenai.org/papers/olmocr.pdf", "./paper.pdf")
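
    # Any local PDF path would work here; the olmOCR paper is just a
    # convenient publicly hosted sample.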

    # Render page 1 to a base64-encoded PNG, longest side scaled to 1288 px
    image_base64 = render_pdf_to_base64png("./paper.pdf", 1, target_longest_image_dim=1288)

    # Build the full prompt
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]

    # Apply the chat template and processor
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
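
    # With tokenize=False, the chat template expands the image entry into the
    # model's image placeholder tokens; the processor below pairs those
    # placeholders with the decoded PIL image to build the pixel inputs.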

    inputs = processor(
        text=[text],
        images=[main_image],
        padding=True,
        return_tensors="pt",
    )
    # Move every input tensor onto the same device as the model weights
    inputs = {key: value.to(device) for (key, value) in inputs.items()}

    # Generate the output
    output = model.generate(
        **inputs,
        temperature=0.1,
        max_new_tokens=50,
        num_return_sequences=1,
        do_sample=True,
    )
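
    # Note: max_new_tokens=50 keeps this demo fast but only yields a short
    # snippet; a full page of OCR output typically needs a much larger token
    # budget (an assumption about typical page length, not part of the original).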

    # Decode only the newly generated tokens, skipping the echoed prompt
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_length:]
    text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
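
    # batch_decode returns a list of strings, one per generated sequence
    # (a single element here, since num_return_sequences=1).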

    print(text_output)
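
    # Optional follow-up (not in the original script): persist the decoded text.
    # A minimal sketch; the output filename is an arbitrary choice.
    with open("olmocr_page1.txt", "w", encoding="utf-8") as f:
        f.write(text_output[0])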