| 
									
										
										
										
											2025-05-25 22:09:05 -07:00
										 |  |  | from pdfminer.high_level import extract_pages | 
					
						
							| 
									
										
										
										
											2025-05-28 17:08:25 +00:00
										 |  |  | from pdfminer.layout import LTChar | 
					
						
							| 
									
										
										
										
											2025-05-25 22:09:05 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def extract_chars_with_transforms(pdf_path, page_num=0): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Extract characters with transformation data for a specific page in a PDF. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Args: | 
					
						
							|  |  |  |         pdf_path (str): Path to the PDF file | 
					
						
							|  |  |  |         page_num (int): Page number to extract (0-indexed) | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     print(f"Analyzing PDF: {pdf_path}, Page: {page_num + 1}") | 
					
						
							|  |  |  |     char_count = 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Extract only the specified page | 
					
						
							|  |  |  |     for i, page_layout in enumerate(extract_pages(pdf_path)): | 
					
						
							|  |  |  |         if i == page_num: | 
					
						
							|  |  |  |             print(f"Processing page {page_num + 1}") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # Recursively process all elements | 
					
						
							|  |  |  |             def process_element(element, level=0): | 
					
						
							|  |  |  |                 nonlocal char_count | 
					
						
							|  |  |  |                 indent = "  " * level | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 if isinstance(element, LTChar): | 
					
						
							|  |  |  |                     char = element.get_text() | 
					
						
							|  |  |  |                     matrix = element.matrix | 
					
						
							|  |  |  |                     font = element.fontname if hasattr(element, "fontname") else "Unknown" | 
					
						
							|  |  |  |                     size = element.size if hasattr(element, "size") else "Unknown" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                     print(f"{indent}Character: '{char}'") | 
					
						
							|  |  |  |                     print(f"{indent}Transform Matrix: {matrix}") | 
					
						
							|  |  |  |                     print(f"{indent}Font: {font}, Size: {size}") | 
					
						
							|  |  |  |                     print(f"{indent}{'-' * 30}") | 
					
						
							|  |  |  |                     char_count += 1 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 # For container elements, process their children | 
					
						
							|  |  |  |                 if hasattr(element, "_objs"): | 
					
						
							|  |  |  |                     for obj in element._objs: | 
					
						
							|  |  |  |                         process_element(obj, level + 1) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # Process all elements in the page | 
					
						
							|  |  |  |             for element in page_layout: | 
					
						
							|  |  |  |                 process_element(element) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             break  # Stop after processing the requested page | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     print(f"\nTotal characters extracted: {char_count}") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if char_count == 0: | 
					
						
							|  |  |  |         print("No characters were extracted. This could mean:") | 
					
						
							|  |  |  |         print(f"1. Page {page_num + 1} doesn't exist or is empty") | 
					
						
							|  |  |  |         print("2. The PDF contains scanned images rather than text") | 
					
						
							|  |  |  |         print("3. The text is embedded in a way PDFMiner can't extract") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Usage | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | pdf_path = "/Users/kylel/Downloads/olmOCR_Technical_Report_COLM_2025.pdf" | 
					
						
							|  |  |  | extract_chars_with_transforms(pdf_path) |