mirror of
https://github.com/allenai/olmocr.git
synced 2025-09-16 12:08:13 +00:00
129 lines
4.6 KiB
Python
129 lines
4.6 KiB
Python
import subprocess
|
|
import base64
|
|
import io
|
|
from pypdf import PdfReader
|
|
from PIL import Image
|
|
|
|
|
|
def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[float, float]:
    """
    Get the MediaBox dimensions for a specific page in a PDF file using the pdfinfo command.

    :param local_pdf_path: Path to the PDF file
    :param page_num: The page number for which to extract MediaBox dimensions
                     (passed to pdfinfo as both the first and the last page)
    :return: A (width, height) tuple computed from the MediaBox corner coordinates
             reported by pdfinfo
    :raises ValueError: If pdfinfo exits with a non-zero status, if the MediaBox line
                        is malformed, or if no MediaBox line is present in its output
    """
    # Construct the pdfinfo command to extract info for the specific page
    command = ["pdfinfo", "-f", str(page_num), "-l", str(page_num), "-box", "-enc", "UTF-8", local_pdf_path]

    # Run the command using subprocess
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    # Check if there is any error in executing the command
    if result.returncode != 0:
        raise ValueError(f"Error running pdfinfo: {result.stderr}")

    # Parse the output to find MediaBox
    for line in result.stdout.splitlines():
        label, separator, value = line.partition(":")

        # Only accept lines whose *label* (the text before the first colon) ends with
        # the token "MediaBox", e.g. "Page    1 MediaBox".  A plain substring test would
        # also match metadata lines such as "Title: MediaBox cheat sheet" and then fail
        # while converting the title text to floats.
        if not separator or label.split()[-1:] != ["MediaBox"]:
            continue

        try:
            coords = [float(token) for token in value.split()]
        except ValueError as err:
            raise ValueError(f"Malformed MediaBox line in pdfinfo output: {line!r}") from err

        if len(coords) < 4:
            raise ValueError(f"Malformed MediaBox line in pdfinfo output: {line!r}")

        # The line lists two opposite corners as x0 y0 x1 y1; abs() keeps the result
        # non-negative regardless of corner order.
        return abs(coords[0] - coords[2]), abs(coords[3] - coords[1])

    raise ValueError("MediaBox not found in the PDF info.")
|
|
|
|
|
|
def render_pdf_to_base64png(local_pdf_path: str, page_num: int, target_longest_image_dim: int=2048) -> str:
    """
    Render a single PDF page to PNG with pdftoppm and return it base64-encoded.

    The rendering resolution is chosen so that the longest side of the page's
    MediaBox maps to ``target_longest_image_dim`` pixels.

    :param local_pdf_path: Path to the PDF file
    :param page_num: The page number to render (passed to pdftoppm as both -f and -l)
    :param target_longest_image_dim: Desired pixel length of the longest image side
    :return: The PNG bytes written by pdftoppm to stdout, base64-encoded as a str
    :raises ValueError: If the MediaBox cannot be determined or has zero size
    :raises AssertionError: If pdftoppm exits with a non-zero status (stderr is the message)
    :raises subprocess.TimeoutExpired: If pdftoppm runs longer than 120 seconds
    """
    longest_dim = max(get_pdf_media_box_width_height(local_pdf_path, page_num))
    if longest_dim <= 0:
        # Guard against a degenerate MediaBox instead of failing with ZeroDivisionError below
        raise ValueError(f"MediaBox of page {page_num} in {local_pdf_path} has zero size")

    # pdftoppm's -r option is a resolution in DPI.  The page is longest_dim points long,
    # i.e. longest_dim / 72 inches (72 points per inch), so this DPI yields
    # target_longest_image_dim pixels along the longest side.
    resolution = target_longest_image_dim * 72 / longest_dim

    # Convert PDF page to PNG using pdftoppm
    pdftoppm_result = subprocess.run(
        [
            "pdftoppm",
            "-png",
            "-f",
            str(page_num),
            "-l",
            str(page_num),
            "-r",
            str(resolution),
            local_pdf_path,
        ],
        timeout=120,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    # Raised explicitly (rather than via an `assert` statement) so the check still runs
    # under `python -O`; the exception type is kept for callers that catch AssertionError.
    if pdftoppm_result.returncode != 0:
        raise AssertionError(pdftoppm_result.stderr)

    return base64.b64encode(pdftoppm_result.stdout).decode("utf-8")
|
|
|
|
|
|
def render_pdf_to_base64webp(local_pdf_path: str, page: int, target_longest_image_dim: int=1024):
    """
    Render a single PDF page and return it as a base64-encoded WebP image.

    The page is first rendered to PNG through render_pdf_to_base64png, then
    re-encoded to WebP in memory with Pillow.

    :param local_pdf_path: Path to the PDF file
    :param page: The page number to render
    :param target_longest_image_dim: Desired pixel length of the longest image side
    :return: The WebP image bytes, base64-encoded as a str
    """
    # Get the raw PNG bytes back out of the base64 rendering
    png_bytes = base64.b64decode(render_pdf_to_base64png(local_pdf_path, page, target_longest_image_dim))

    # Re-encode into an in-memory buffer as WebP
    webp_buffer = io.BytesIO()
    Image.open(io.BytesIO(png_bytes)).save(webp_buffer, format="WEBP")

    encoded = base64.b64encode(webp_buffer.getvalue())
    return encoded.decode("utf-8")
|
|
|
|
|
|
def get_png_dimensions_from_base64(base64_data) -> tuple[int, int]:
    """
    Returns the (width, height) of a PNG image given its base64-encoded data,
    without base64-decoding the entire data or loading the PNG itself

    Should be really fast to support filtering: only the first 32 base64
    characters (24 bytes) are ever decoded.

    Parameters:
    - base64_data (str | bytes): Base64-encoded PNG image data.

    Returns:
    - tuple: (width, height) of the image.

    Raises:
    - ValueError: If the data is not a valid PNG image or the required bytes are not found.
    """
    # Accept base64 supplied as bytes as well as str
    if isinstance(base64_data, (bytes, bytearray)):
        base64_data = bytes(base64_data).decode("ascii")

    png_signature = b"\x89PNG\r\n\x1a\n"

    # Cheap early rejection on the encoded text.  The first 10 base64 characters are
    # fully determined by the 8-byte signature (the 11th also depends on the next byte).
    png_signature_base64 = base64.b64encode(png_signature).decode("ascii")
    if not base64_data.startswith(png_signature_base64[:10]):
        raise ValueError("Not a valid PNG file")

    # Layout of the start of a PNG file:
    #   bytes  0-7   signature
    #   bytes  8-11  IHDR chunk length
    #   bytes 12-15  chunk type, must be b"IHDR"
    #   bytes 16-19  width  (big-endian)
    #   bytes 20-23  height (big-endian)
    # 24 bytes is a multiple of 3, so it maps to exactly 32 base64 characters with no
    # padding and no partial groups to worry about.
    header_bytes_needed = 24
    header_chars_needed = 32

    if len(base64_data) < header_chars_needed:
        raise ValueError("Insufficient data to extract dimensions")

    # Decode only the necessary bytes
    try:
        header = base64.b64decode(base64_data[:header_chars_needed], validate=True)
    except ValueError as err:  # binascii.Error is a ValueError subclass
        raise ValueError("Not a valid PNG file") from err

    if len(header) < header_bytes_needed:
        # Padding characters inside the prefix mean the data ends before the height field
        raise ValueError("Insufficient data to extract dimensions")

    # Verify the complete signature and that the first chunk really is IHDR; otherwise
    # the bytes at offsets 16-23 are not the image dimensions.
    if header[:8] != png_signature or header[12:16] != b"IHDR":
        raise ValueError("Not a valid PNG file")

    # Convert bytes to integers
    width = int.from_bytes(header[16:20], "big")
    height = int.from_bytes(header[20:24], "big")

    return width, height