Unifying some of the pdf rendering stuff

This commit is contained in:
Jake Poznanski 2024-10-09 16:57:13 +00:00
parent dc6440d068
commit 400e92180b
4 changed files with 58 additions and 51 deletions

View File

@ -12,36 +12,12 @@ from urllib.parse import urlparse
from PIL import Image
from tqdm import tqdm
from pdelfin.silver_data.renderpdf import render_pdf_to_base64png
# boto3 session bound to the 's2' AWS profile; the S3 client created from it
# is used below for PDF downloads and for generating presigned URLs.
session = boto3.Session(profile_name='s2')
s3_client = session.client('s3')
def render_pdf_to_base64png(s3_path, page):
    """Download a PDF from S3 and render one page as a base64-encoded image.

    NOTE: despite the function name, the returned payload is WEBP data. The
    page is rasterized to PNG by ``pdftoppm`` and then re-encoded as WEBP
    before being base64-encoded.

    Args:
        s3_path: Location of the PDF in the form ``s3://bucket/key``.
        page: Page number, forwarded to pdftoppm as both the first (``-f``)
            and last (``-l``) page so exactly one page is rendered.

    Returns:
        The WEBP-encoded page image as a base64 ``str``.

    Raises:
        AssertionError: If pdftoppm exits with a non-zero status; the
            message carries pdftoppm's stderr.
        subprocess.TimeoutExpired: If pdftoppm runs longer than 120 seconds.
    """
    # Local imports: only part of this file's import block is visible, so the
    # names needed for cleanup are brought into scope here.
    import contextlib
    import os

    bucket, key = s3_path.replace("s3://", "").split('/', 1)

    # Reserve a unique path, then close the handle right away so the download
    # can write to it. delete=False means cleanup is our responsibility.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
        pdf_path = tmp_pdf.name

    try:
        s3_client.download_file(bucket, key, pdf_path)

        # No output root is given, so the PNG is captured from stdout.
        pdftoppm_result = subprocess.run(
            ["pdftoppm",
             "-png",
             "-f", str(page),
             "-l", str(page),
             pdf_path],
            timeout=120,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr
    finally:
        # Previously the downloaded PDF was never removed, leaking one temp
        # file per call.
        with contextlib.suppress(FileNotFoundError):
            os.remove(pdf_path)

    png_image = Image.open(io.BytesIO(pdftoppm_result.stdout))
    webp_output = io.BytesIO()
    png_image.save(webp_output, format="WEBP")

    image_base64 = base64.b64encode(webp_output.getvalue()).decode("utf-8")
    return image_base64
def process_entry(i, entry):
# Randomly decide whether to display gold on the left or right
if random.choice([True, False]):
@ -62,9 +38,16 @@ def process_entry(i, entry):
s3_key = parsed_url.path.lstrip('/')
signed_pdf_link = s3_client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": s3_key}, ExpiresIn=604800)
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
pdf_path = tmp_pdf.name
bucket, key = entry["s3_path"].replace("s3://", "").split('/', 1)
s3_client.download_file(bucket, key, pdf_path)
page_image_base64 = render_pdf_to_base64png(tmp_pdf.name, entry["page"], target_longest_image_dim=1024)
return {
"entry_id": i,
"page_image": render_pdf_to_base64png(entry["s3_path"], entry["page"]),
"page_image": page_image_base64,
"s3_path": entry["s3_path"],
"page": entry["page"],
"signed_pdf_link": signed_pdf_link,

View File

@ -24,6 +24,10 @@ from dolma_refine.evaluate.aligners import HirschbergAligner
from .evalhtml import create_review_html
import logging
# Raise the "pypdf" logger's threshold so only ERROR and above are emitted.
logging.getLogger("pypdf").setLevel(logging.ERROR)
# Cache location under the current user's home: ~/.cache/pdf_gold_data_cache
CACHE_DIR = os.path.join(Path.home(), ".cache", "pdf_gold_data_cache")

View File

@ -12,6 +12,7 @@ from typing import Generator
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from urllib.parse import urlparse
from pdelfin.silver_data.renderpdf import render_pdf_to_base64png
from pdelfin.prompts import build_openai_silver_data_prompt, openai_response_format_schema
from pdelfin.prompts.anchor import get_anchor_text
from pdelfin.filter import PdfFilter
@ -22,30 +23,7 @@ TARGET_IMAGE_DIM = 2048
# Single module-level PdfFilter instance, constructed once at import time.
pdf_filter = PdfFilter()
def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> dict:
pdf = PdfReader(local_pdf_path)
pdf_page = pdf.pages[page - 1]
longest_dim = max(pdf_page.mediabox.width, pdf_page.mediabox.height)
# Convert PDF page to PNG using pdftoppm
pdftoppm_result = subprocess.run(
[
"pdftoppm",
"-png",
"-f",
str(page),
"-l",
str(page),
"-r",
str(TARGET_IMAGE_DIM * 72 / longest_dim),
local_pdf_path,
],
timeout=120,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr
image_base64 = base64.b64encode(pdftoppm_result.stdout).decode("utf-8")
image_base64 = render_pdf_to_base64png(local_pdf_path, page, TARGET_IMAGE_DIM)
anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
# DEBUG crappy temporary code here that does the actual api call live so I can debug it a bit

View File

@ -0,0 +1,42 @@
import subprocess
import base64
import io
from pypdf import PdfReader
from PIL import Image
def render_pdf_to_base64png(local_pdf_path: str, page: int, target_longest_image_dim: int = 2048) -> str:
    """Render one page of a local PDF to PNG and return it base64-encoded.

    The render resolution is chosen so that the page's longest side (taken
    from its mediabox) comes out at roughly ``target_longest_image_dim``
    pixels.

    Args:
        local_pdf_path: Filesystem path of the PDF to render.
        page: 1-indexed page number (``pdf.pages[page - 1]`` is inspected and
            the same number is passed to pdftoppm's ``-f``/``-l``).
        target_longest_image_dim: Desired pixel length of the longest side.

    Returns:
        The PNG bytes emitted by pdftoppm, base64-encoded as a ``str``.

    Raises:
        ValueError: If ``page`` is less than 1.
        AssertionError: If pdftoppm exits with a non-zero status; the
            message carries pdftoppm's stderr.
        subprocess.TimeoutExpired: If pdftoppm runs longer than 120 seconds.
    """
    # A page below 1 would turn into a negative index below and silently pick
    # up the dimensions of a page counted from the end of the document.
    if page < 1:
        raise ValueError(f"page must be >= 1, got {page}")

    pdf = PdfReader(local_pdf_path)
    pdf_page = pdf.pages[page - 1]
    longest_dim = max(pdf_page.mediabox.width, pdf_page.mediabox.height)

    # The mediabox is measured in PDF points and there are 72 points per
    # inch, so this DPI maps the longest side to the target pixel count.
    resolution = target_longest_image_dim * 72 / longest_dim

    # Convert the PDF page to PNG using pdftoppm; with no output root given,
    # the image is captured from stdout.
    pdftoppm_result = subprocess.run(
        [
            "pdftoppm",
            "-png",
            "-f",
            str(page),
            "-l",
            str(page),
            "-r",
            str(resolution),
            local_pdf_path,
        ],
        timeout=120,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    # Explicit check instead of a bare `assert`, which is stripped under
    # `python -O` and would let a failed render return garbage. The exception
    # type is kept as AssertionError so existing callers are unaffected.
    if pdftoppm_result.returncode != 0:
        raise AssertionError(pdftoppm_result.stderr)

    return base64.b64encode(pdftoppm_result.stdout).decode("utf-8")
def render_pdf_to_base64webp(local_pdf_path: str, page: int, target_longest_image_dim: int = 1024) -> str:
    """Render one page of a local PDF and return it as base64-encoded WEBP.

    The page is first rendered to PNG via ``render_pdf_to_base64png`` and then
    re-encoded as WEBP with Pillow.

    Args:
        local_pdf_path: Filesystem path of the PDF to render.
        page: Page number, forwarded unchanged to ``render_pdf_to_base64png``.
        target_longest_image_dim: Desired pixel length of the longest side,
            forwarded unchanged to ``render_pdf_to_base64png``.

    Returns:
        The WEBP-encoded page image as a base64 ``str``.
    """
    base64_png = render_pdf_to_base64png(local_pdf_path, page, target_longest_image_dim)

    # The PNG renderer returns base64 *text*. It must be decoded back to raw
    # PNG bytes; merely UTF-8-encoding the text (as before) hands Pillow the
    # base64 characters, which are not a valid image.
    png_bytes = base64.b64decode(base64_png)

    webp_output = io.BytesIO()
    with Image.open(io.BytesIO(png_bytes)) as png_image:
        png_image.save(webp_output, format="WEBP")

    return base64.b64encode(webp_output.getvalue()).decode("utf-8")