mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-10-24 22:49:00 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			62 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			62 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from pdfminer.high_level import extract_pages
 | |
| from pdfminer.layout import LTChar
 | |
| 
 | |
| 
 | |
def _print_chars(element, level=0):
    """
    Print every LTChar at or below ``element`` and return how many were printed.

    Args:
        element: A layout item from a pdfminer page layout
        level (int): Nesting depth of ``element``; controls output indentation

    Returns:
        int: Number of LTChar objects printed for ``element`` and its descendants
    """
    indent = "  " * level
    found = 0

    if isinstance(element, LTChar):
        char = element.get_text()
        matrix = element.matrix
        font = getattr(element, "fontname", "Unknown")
        size = getattr(element, "size", "Unknown")

        print(f"{indent}Character: '{char}'")
        print(f"{indent}Transform Matrix: {matrix}")
        print(f"{indent}Font: {font}, Size: {size}")
        print(f"{indent}{'-' * 30}")
        found += 1

    # Items exposing a `_objs` child list are treated as containers and are
    # walked recursively; items without it contribute no children.
    for child in getattr(element, "_objs", ()):
        found += _print_chars(child, level + 1)

    return found


def extract_chars_with_transforms(pdf_path, page_num=0):
    """
    Extract characters with transformation data for a specific page in a PDF.

    Each character found on the page is printed together with its transform
    matrix, font name and size, followed by a total count. If no characters
    are found, a short list of likely causes is printed.

    Args:
        pdf_path (str): Path to the PDF file
        page_num (int): Page number to extract (0-indexed)

    Returns:
        int: Number of characters printed for the requested page (0 if the
            page does not exist or yields no characters)
    """
    print(f"Analyzing PDF: {pdf_path}, Page: {page_num + 1}")
    char_count = 0

    # Ask pdfminer for the requested page only, so earlier pages are not laid
    # out just to be thrown away. An out-of-range page yields nothing.
    requested_pages = extract_pages(pdf_path, page_numbers=[page_num])
    page_layout = next(iter(requested_pages), None)

    if page_layout is not None:
        print(f"Processing page {page_num + 1}")

        # Process all elements in the page
        for element in page_layout:
            char_count += _print_chars(element)

    print(f"\nTotal characters extracted: {char_count}")

    if char_count == 0:
        print("No characters were extracted. This could mean:")
        print(f"1. Page {page_num + 1} doesn't exist or is empty")
        print("2. The PDF contains scanned images rather than text")
        print("3. The text is embedded in a way PDFMiner can't extract")

    return char_count
 | |
| 
 | |
| 
 | |
# Usage: run this file as a script, optionally passing a PDF path as the
# first command-line argument. Without an argument the sample path below is used.

pdf_path = "/Users/kylel/Downloads/olmOCR_Technical_Report_COLM_2025.pdf"

# Guarded so that importing this module does not start an extraction.
if __name__ == "__main__":
    import sys

    extract_chars_with_transforms(sys.argv[1] if len(sys.argv) > 1 else pdf_path)
 | 
