mirror of
https://github.com/allenai/olmocr.git
synced 2025-09-16 20:18:57 +00:00
42 lines
1.3 KiB
Python
42 lines
1.3 KiB
Python
![]() |
import subprocess
|
||
|
import base64
|
||
|
import io
|
||
|
from pypdf import PdfReader
|
||
|
|
||
|
from PIL import Image
|
||
|
|
||
|
|
||
|
def render_pdf_to_base64png(local_pdf_path: str, page: int, target_longest_image_dim: int = 2048):
    """Render a single PDF page to PNG with pdftoppm and return it as base64 text.

    Args:
        local_pdf_path: Path to the PDF file on the local filesystem.
        page: 1-based page number (it is passed unchanged to pdftoppm's
            ``-f``/``-l`` options and used as ``page - 1`` to index the
            pypdf page list).
        target_longest_image_dim: Desired size, in pixels, of the longer
            side of the rendered image.

    Returns:
        The captured pdftoppm stdout (the PNG data), base64-encoded and
        decoded to a ``str``.

    Raises:
        AssertionError: If pdftoppm exits with a non-zero status; the
            exception carries the captured stderr.
        subprocess.TimeoutExpired: If pdftoppm runs longer than 120 seconds.
    """
    pdf = PdfReader(local_pdf_path)
    pdf_page = pdf.pages[page - 1]
    longest_dim = max(pdf_page.mediabox.width, pdf_page.mediabox.height)

    # Pick the DPI that makes the longest side come out at the target pixel
    # size. Assumes the mediabox is expressed in PDF points (72 per inch),
    # so pixels = points * dpi / 72.
    resolution = target_longest_image_dim * 72 / longest_dim

    # Convert PDF page to PNG using pdftoppm; the image is read from stdout.
    pdftoppm_result = subprocess.run(
        [
            "pdftoppm",
            "-png",
            "-f",
            str(page),
            "-l",
            str(page),
            "-r",
            str(resolution),
            local_pdf_path,
        ],
        timeout=120,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    # Raise explicitly instead of using `assert`: assert statements are
    # removed under `python -O`, which would let a failed render slip through.
    # AssertionError is kept so existing callers see the same exception type.
    if pdftoppm_result.returncode != 0:
        raise AssertionError(pdftoppm_result.stderr)

    return base64.b64encode(pdftoppm_result.stdout).decode("utf-8")
|
||
|
|
||
|
|
||
|
def render_pdf_to_base64webp(local_pdf_path: str, page: int, target_longest_image_dim: int = 1024):
    """Render a single PDF page and return it as a base64-encoded WEBP image.

    The page is first rendered to PNG via ``render_pdf_to_base64png`` and then
    re-encoded to WEBP with Pillow.

    Args:
        local_pdf_path: Path to the PDF file on the local filesystem.
        page: Page number, forwarded unchanged to ``render_pdf_to_base64png``.
        target_longest_image_dim: Target size, in pixels, of the longer side
            of the rendered image; forwarded unchanged.

    Returns:
        The WEBP image bytes, base64-encoded and decoded to a ``str``.
    """
    base64_png = render_pdf_to_base64png(local_pdf_path, page, target_longest_image_dim)

    # The helper returns base64 *text*; Pillow needs the raw PNG bytes, so
    # decode first (feeding it the encoded text cannot be parsed as an image).
    png_bytes = base64.b64decode(base64_png)

    webp_output = io.BytesIO()
    with Image.open(io.BytesIO(png_bytes)) as png_image:
        png_image.save(webp_output, format="WEBP")

    return base64.b64encode(webp_output.getvalue()).decode("utf-8")
|