mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-28 07:34:13 +00:00
Unifying some of the pdf rendering stuff
This commit is contained in:
parent
dc6440d068
commit
400e92180b
@ -12,36 +12,12 @@ from urllib.parse import urlparse
|
||||
from PIL import Image
|
||||
from tqdm import tqdm
|
||||
|
||||
from pdelfin.silver_data.renderpdf import render_pdf_to_base64png
|
||||
|
||||
session = boto3.Session(profile_name='s2')
|
||||
s3_client = session.client('s3')
|
||||
|
||||
|
||||
def render_pdf_to_base64png(s3_path, page):
|
||||
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
|
||||
pdf_path = tmp_pdf.name
|
||||
bucket, key = s3_path.replace("s3://", "").split('/', 1)
|
||||
s3_client.download_file(bucket, key, pdf_path)
|
||||
|
||||
# Render the PDF to an image, and display it in the first position
|
||||
pdftoppm_result = subprocess.run(
|
||||
["pdftoppm",
|
||||
"-png",
|
||||
"-f", str(page),
|
||||
"-l", str(page),
|
||||
pdf_path],
|
||||
timeout=120,
|
||||
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr
|
||||
|
||||
png_image = Image.open(io.BytesIO(pdftoppm_result.stdout))
|
||||
webp_output = io.BytesIO()
|
||||
png_image.save(webp_output, format="WEBP")
|
||||
|
||||
image_base64 = base64.b64encode(webp_output.getvalue()).decode("utf-8")
|
||||
|
||||
return image_base64
|
||||
|
||||
|
||||
def process_entry(i, entry):
|
||||
# Randomly decide whether to display gold on the left or right
|
||||
if random.choice([True, False]):
|
||||
@ -62,9 +38,16 @@ def process_entry(i, entry):
|
||||
s3_key = parsed_url.path.lstrip('/')
|
||||
signed_pdf_link = s3_client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": s3_key}, ExpiresIn=604800)
|
||||
|
||||
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
|
||||
pdf_path = tmp_pdf.name
|
||||
bucket, key = entry["s3_path"].replace("s3://", "").split('/', 1)
|
||||
s3_client.download_file(bucket, key, pdf_path)
|
||||
|
||||
page_image_base64 = render_pdf_to_base64png(tmp_pdf.name, entry["page"], target_longest_image_dim=1024)
|
||||
|
||||
return {
|
||||
"entry_id": i,
|
||||
"page_image": render_pdf_to_base64png(entry["s3_path"], entry["page"]),
|
||||
"page_image": page_image_base64,
|
||||
"s3_path": entry["s3_path"],
|
||||
"page": entry["page"],
|
||||
"signed_pdf_link": signed_pdf_link,
|
||||
|
||||
@ -24,6 +24,10 @@ from dolma_refine.evaluate.aligners import HirschbergAligner
|
||||
|
||||
from .evalhtml import create_review_html
|
||||
|
||||
import logging
|
||||
|
||||
logging.getLogger("pypdf").setLevel(logging.ERROR)
|
||||
|
||||
|
||||
CACHE_DIR = os.path.join(Path.home(), ".cache", "pdf_gold_data_cache")
|
||||
|
||||
|
||||
@ -12,6 +12,7 @@ from typing import Generator
|
||||
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from pdelfin.silver_data.renderpdf import render_pdf_to_base64png
|
||||
from pdelfin.prompts import build_openai_silver_data_prompt, openai_response_format_schema
|
||||
from pdelfin.prompts.anchor import get_anchor_text
|
||||
from pdelfin.filter import PdfFilter
|
||||
@ -22,30 +23,7 @@ TARGET_IMAGE_DIM = 2048
|
||||
pdf_filter = PdfFilter()
|
||||
|
||||
def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> dict:
|
||||
pdf = PdfReader(local_pdf_path)
|
||||
pdf_page = pdf.pages[page - 1]
|
||||
longest_dim = max(pdf_page.mediabox.width, pdf_page.mediabox.height)
|
||||
|
||||
# Convert PDF page to PNG using pdftoppm
|
||||
pdftoppm_result = subprocess.run(
|
||||
[
|
||||
"pdftoppm",
|
||||
"-png",
|
||||
"-f",
|
||||
str(page),
|
||||
"-l",
|
||||
str(page),
|
||||
"-r",
|
||||
str(TARGET_IMAGE_DIM * 72 / longest_dim),
|
||||
local_pdf_path,
|
||||
],
|
||||
timeout=120,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr
|
||||
image_base64 = base64.b64encode(pdftoppm_result.stdout).decode("utf-8")
|
||||
|
||||
image_base64 = render_pdf_to_base64png(local_pdf_path, page, TARGET_IMAGE_DIM)
|
||||
anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
|
||||
|
||||
# DEBUG crappy temporary code here that does the actual api call live so I can debug it a bit
|
||||
|
||||
42
pdelfin/silver_data/renderpdf.py
Normal file
42
pdelfin/silver_data/renderpdf.py
Normal file
@ -0,0 +1,42 @@
|
||||
import subprocess
|
||||
import base64
|
||||
import io
|
||||
from pypdf import PdfReader
|
||||
|
||||
from PIL import Image
|
||||
|
||||
|
||||
def render_pdf_to_base64png(local_pdf_path: str, page: int, target_longest_image_dim: int=2048):
    """Rasterize a single PDF page with ``pdftoppm`` and return it as base64 PNG text.

    Args:
        local_pdf_path: Path of a PDF on the local filesystem.
        page: 1-based page number (index ``page - 1`` is read from the PDF and
            the same number is passed to ``pdftoppm`` as both first and last page).
        target_longest_image_dim: Desired pixel length of the page's longer side.

    Returns:
        The PNG bytes emitted by ``pdftoppm`` on stdout, base64-encoded and
        decoded to a ``str``.

    Raises:
        AssertionError: If ``pdftoppm`` exits with a non-zero status; the
            captured stderr is attached as the assertion message.
        subprocess.TimeoutExpired: If rendering takes longer than 120 seconds.
    """
    # Read the page's media box so the rendering resolution can be derived from
    # the page's physical size.
    reader = PdfReader(local_pdf_path)
    media_box = reader.pages[page - 1].mediabox
    longest_dim = max(media_box.width, media_box.height)

    # The media box is measured in PDF points (72 per inch) and ``-r`` takes a
    # resolution in DPI, so this DPI maps the longer side onto the requested
    # number of pixels.
    dpi = target_longest_image_dim * 72 / longest_dim

    page_number = str(page)
    command = [
        "pdftoppm",
        "-png",
        "-f", page_number,
        "-l", page_number,
        "-r", str(dpi),
        local_pdf_path,
    ]

    # No output prefix is given, so the PNG is read back from stdout.
    pdftoppm_result = subprocess.run(command, timeout=120, capture_output=True)
    assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr

    png_bytes = pdftoppm_result.stdout
    return base64.b64encode(png_bytes).decode("utf-8")
|
||||
|
||||
|
||||
def render_pdf_to_base64webp(local_pdf_path: str, page: int, target_longest_image_dim: int=1024) -> str:
    """Render a single PDF page and return it as base64-encoded WebP text.

    The page is first rendered to PNG via ``render_pdf_to_base64png`` and then
    re-encoded as WebP with Pillow.

    Args:
        local_pdf_path: Path of a PDF on the local filesystem.
        page: Page number, forwarded unchanged to ``render_pdf_to_base64png``.
        target_longest_image_dim: Desired pixel length of the page's longer
            side, forwarded unchanged to ``render_pdf_to_base64png``.

    Returns:
        The WebP image bytes, base64-encoded and decoded to a ``str``.

    Raises:
        Whatever ``render_pdf_to_base64png`` raises, plus Pillow's errors if
        the rendered bytes cannot be read or saved as WebP.
    """
    base64_png = render_pdf_to_base64png(local_pdf_path, page, target_longest_image_dim)

    # The PNG renderer hands back base64 *text*. It must be decoded to the raw
    # PNG bytes before Pillow can parse it; feeding the encoded text directly
    # (as the previous code did) is never a valid image stream.
    png_bytes = base64.b64decode(base64_png)

    webp_output = io.BytesIO()
    # Context manager releases the decoder's resources once the image is saved.
    with Image.open(io.BytesIO(png_bytes)) as png_image:
        png_image.save(webp_output, format="WEBP")

    return base64.b64encode(webp_output.getvalue()).decode("utf-8")
|
||||
Loading…
x
Reference in New Issue
Block a user