olmocr/scripts/parse_with_pdfminer.py
Jake Poznanski 61d427ebf3 Repo cleanup
2025-05-28 17:08:25 +00:00

62 lines
2.2 KiB
Python

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTChar
def _print_chars(element, level=0):
    """Print every LTChar at or below *element* and return how many were printed.

    Args:
        element: A pdfminer layout element (leaf or container).
        level (int): Nesting depth, used only to indent the printed output.

    Returns:
        int: Number of LTChar elements printed in this subtree.
    """
    count = 0
    if isinstance(element, LTChar):
        indent = " " * level
        # Fall back to "Unknown" when the element lacks these attributes.
        font = getattr(element, "fontname", "Unknown")
        size = getattr(element, "size", "Unknown")
        print(f"{indent}Character: '{element.get_text()}'")
        print(f"{indent}Transform Matrix: {element.matrix}")
        print(f"{indent}Font: {font}, Size: {size}")
        print(f"{indent}{'-' * 30}")
        count += 1
    # Container elements expose their children via `_objs`; elements without
    # that attribute contribute no children.
    for child in getattr(element, "_objs", ()):
        count += _print_chars(child, level + 1)
    return count


def extract_chars_with_transforms(pdf_path, page_num=0):
    """
    Print each character, with its transformation matrix, font and size, for one page of a PDF.

    Args:
        pdf_path (str): Path to the PDF file
        page_num (int): Page number to extract (0-indexed)

    Returns:
        int: Number of characters printed (0 if the page is missing or has
        no extractable text).
    """
    print(f"Analyzing PDF: {pdf_path}, Page: {page_num + 1}")
    char_count = 0
    # Ask pdfminer for the requested page only, so earlier pages are not
    # interpreted and laid out just to be thrown away.
    for page_layout in extract_pages(pdf_path, page_numbers=[page_num]):
        print(f"Processing page {page_num + 1}")
        # Process all elements in the page
        for element in page_layout:
            char_count += _print_chars(element)
        break  # Only one page was requested
    print(f"\nTotal characters extracted: {char_count}")
    if char_count == 0:
        print("No characters were extracted. This could mean:")
        print(f"1. Page {page_num + 1} doesn't exist or is empty")
        print("2. The PDF contains scanned images rather than text")
        print("3. The text is embedded in a way PDFMiner can't extract")
    return char_count
# Usage: python parse_with_pdfminer.py [PDF_PATH] [PAGE_NUM]
if __name__ == "__main__":
    import sys

    # Default to the original sample document so a bare invocation behaves as
    # before; PAGE_NUM is 0-indexed.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "/Users/kylel/Downloads/olmOCR_Technical_Report_COLM_2025.pdf"
    page_num = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    extract_chars_with_transforms(pdf_path, page_num)