mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-17 11:12:33 +00:00
merge
This commit is contained in:
parent
241e5bfe70
commit
7a50ee1645
BIN
ocr_pareto.pdf
Normal file
BIN
ocr_pareto.pdf
Normal file
Binary file not shown.
BIN
ocr_pareto.png
Normal file
BIN
ocr_pareto.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 273 KiB |
61
scripts/parse_with_pdfminer.py
Normal file
61
scripts/parse_with_pdfminer.py
Normal file
@ -0,0 +1,61 @@
|
||||
from pdfminer.high_level import extract_pages
|
||||
from pdfminer.layout import LTChar, LTFigure, LTImage, LTTextBox, LTTextLine
|
||||
|
||||
|
||||
def _print_char_transforms(element, level=0):
    """
    Print details for every LTChar found in a layout element and its children.

    For each character the text, transformation matrix, font name and font
    size are printed, indented by one space per nesting level.

    Args:
        element: A pdfminer layout object (page child, text box, line, ...)
        level (int): Nesting depth of ``element``, used only for indentation

    Returns:
        int: Number of LTChar objects printed for this subtree
    """
    printed = 0
    indent = " " * level

    if isinstance(element, LTChar):
        char = element.get_text()
        matrix = element.matrix
        font = getattr(element, "fontname", "Unknown")
        size = getattr(element, "size", "Unknown")

        print(f"{indent}Character: '{char}'")
        print(f"{indent}Transform Matrix: {matrix}")
        print(f"{indent}Font: {font}, Size: {size}")
        print(f"{indent}{'-' * 30}")
        printed += 1

    # Container elements keep their children in ``_objs``; elements without
    # that attribute simply contribute no children.
    for child in getattr(element, "_objs", ()):
        printed += _print_char_transforms(child, level + 1)

    return printed


def extract_chars_with_transforms(pdf_path, page_num=0):
    """
    Extract characters with transformation data for a specific page in a PDF.

    Every character found on the page is printed together with its transform
    matrix, font and size, followed by a total. When nothing is found, a short
    list of likely causes is printed instead of raising.

    Args:
        pdf_path (str): Path to the PDF file
        page_num (int): Page number to extract (0-indexed)
    """
    print(f"Analyzing PDF: {pdf_path}, Page: {page_num + 1}")
    char_count = 0

    # Ask pdfminer for just the requested page so that the pages before it
    # are not run through layout analysis only to be thrown away.
    for page_layout in extract_pages(pdf_path, page_numbers=[page_num]):
        print(f"Processing page {page_num + 1}")

        # Process all elements in the page
        for element in page_layout:
            char_count += _print_char_transforms(element)

        break  # Only one page was requested

    print(f"\nTotal characters extracted: {char_count}")

    if char_count == 0:
        print("No characters were extracted. This could mean:")
        print(f"1. Page {page_num + 1} doesn't exist or is empty")
        print("2. The PDF contains scanned images rather than text")
        print("3. The text is embedded in a way PDFMiner can't extract")
|
||||
|
||||
|
||||
# Usage: run this file directly to dump the characters on the first page of
# the sample document below. The path is machine-specific; edit it to point
# at a PDF that exists locally.
pdf_path = "/Users/kylel/Downloads/olmOCR_Technical_Report_COLM_2025.pdf"

# Guarded so that importing this module does not trigger a PDF parse (or a
# FileNotFoundError on machines where the sample path does not exist).
if __name__ == "__main__":
    extract_chars_with_transforms(pdf_path)
|
0
test_croissant.py
Normal file
0
test_croissant.py
Normal file
Loading…
x
Reference in New Issue
Block a user