More stats, hopefully running faster

Jake Poznanski 2024-10-14 21:37:14 +00:00
parent 350061906e
commit 6d53683001
4 changed files with 71 additions and 20 deletions

View File

@ -496,8 +496,17 @@ def build_pdf_queries(s3_workspace: str, pdf: DatabaseManager.PDFRecord) -> list
if any(page.is_usable() and page.page_num == target_page_num for page in existing_pages):
continue
# TODO: Later, you may want to retry with different sampling parameters or do something else
new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
has_errored_previously = sum(page.page_num == target_page_num for page in existing_pages)
if has_errored_previously:
# TODO For now this just retries the page 3 times, which is nothing special
new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
# But you could try fancier things, such as rotating the page, removing the pdf hints altogether, etc
else:
new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
except Exception as ex:
print(f"Warning, could not get batch inferences lines for {pdf.s3_path} due to {ex}")
@ -670,8 +679,6 @@ if __name__ == '__main__':
# For each round, outputs a report of how many pages were processed, how many had errors, and a breakdown by (error, finish_reason)
total_rounds = db.get_last_indexed_round() + 1
for round_num in range(total_rounds):
print(f"\nStatistics for round {round_num}:")
db.cursor.execute("""
SELECT COUNT(*), error, finish_reason
FROM page_results
@ -682,13 +689,12 @@ if __name__ == '__main__':
results = db.cursor.fetchall()
total_pages = sum(count for count, _, _ in results)
print(f"Total pages processed: {total_pages:,}")
print(f"\nInference Round {round_num} - {total_pages:,} pages processed:")
for count, error, finish_reason in results:
error_str = error if error is not None else "None"
print(f" (error: {error_str}, finish_reason: {finish_reason}) -> {count:,} pages")
print("\nWork finished, waiting for all workers to finish cleaning up")
executor.shutdown(wait=True)
db.close()
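For reference, the per-round breakdown printed above comes from a query that this diff truncates mid-statement. A plausible reconstruction is sketched below; the round column name and parameter style are assumptions, since the WHERE/GROUP BY clauses are not shown in the hunk.

# Hypothetical reconstruction of the truncated per-round statistics query.
db.cursor.execute(
    """
    SELECT COUNT(*), error, finish_reason
    FROM page_results
    WHERE round = ?          -- assumed column name, not shown in the diff
    GROUP BY error, finish_reason
    """,
    (round_num,),
)
results = db.cursor.fetchall()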

View File

@ -2,14 +2,42 @@ import subprocess
import base64
import io
from pypdf import PdfReader
from PIL import Image
def render_pdf_to_base64png(local_pdf_path: str, page: int, target_longest_image_dim: int=2048):
pdf = PdfReader(local_pdf_path)
pdf_page = pdf.pages[page - 1]
longest_dim = max(pdf_page.mediabox.width, pdf_page.mediabox.height)
def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[float, float]:
"""
Get the MediaBox dimensions for a specific page in a PDF file using the pdfinfo command.
:param local_pdf_path: Path to the PDF file
:param page_num: The page number for which to extract MediaBox dimensions
:return: A (width, height) tuple of the MediaBox dimensions; raises ValueError if they cannot be determined
"""
# Construct the pdfinfo command to extract info for the specific page
command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', local_pdf_path]
# Run the command using subprocess
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
# Check if there is any error in executing the command
if result.returncode != 0:
raise ValueError(f"Error running pdfinfo: {result.stderr}")
# Parse the output to find MediaBox
output = result.stdout
media_box = None
for line in output.splitlines():
if 'MediaBox' in line:
media_box = line.split(':')[1].strip().split()
media_box = [float(x) for x in media_box]
return abs(media_box[0] - media_box[2]), abs(media_box[3] - media_box[1])
raise ValueError("MediaBox not found in the PDF info.")
def render_pdf_to_base64png(local_pdf_path: str, page_num: int, target_longest_image_dim: int=2048):
longest_dim = max(get_pdf_media_box_width_height(local_pdf_path, page_num))
# Convert PDF page to PNG using pdftoppm
pdftoppm_result = subprocess.run(
@ -17,9 +45,9 @@ def render_pdf_to_base64png(local_pdf_path: str, page: int, target_longest_image
"pdftoppm",
"-png",
"-f",
str(page),
str(page_num),
"-l",
str(page),
str(page_num),
"-r",
str(target_longest_image_dim * 72 / longest_dim),  # MediaBox is in points (72 per inch), so this DPI renders the longest side at target_longest_image_dim pixels
local_pdf_path,
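As a usage sketch, the new pdfinfo-based helper and the DPI calculation fit together roughly as follows on a US Letter page. The pdfinfo output line and file name are illustrative, not taken from this repository.

# Illustrative only: a typical `pdfinfo -box` output line looks like
#   Page    1 MediaBox:     0.00     0.00   612.00   792.00
# which parses to width = |0.00 - 612.00| = 612 pt and height = |792.00 - 0.00| = 792 pt.
width, height = get_pdf_media_box_width_height("example.pdf", 1)  # hypothetical file
longest_dim = max(width, height)          # 792 pt for US Letter
dpi = 2048 * 72 / longest_dim             # ~186 dpi, since PDF points are 1/72 inch
# pdftoppm is then invoked at that resolution, so the longest side renders to ~2048 px.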

View File

@ -13,6 +13,7 @@ import re
import ftfy
from dataclasses import dataclass
from typing import Literal, List
from functools import lru_cache
import pypdfium2 as pdfium
import pymupdf
@ -119,10 +120,14 @@ class PageReport:
text_elements: List[TextElement]
image_elements: List[ImageElement]
@lru_cache(maxsize=5)
def _get_cached_pdf_reader(local_pdf_path: str) -> PdfReader:
# Cached because callers typically iterate through a whole PDF page by page, so reusing the parsed reader makes subsequent calls much faster
return PdfReader(local_pdf_path)
def _pdf_report(local_pdf_path: str, page: int) -> PageReport:
reader = PdfReader(local_pdf_path)
page = reader.pages[page - 1]
def _pdf_report(local_pdf_path: str, page_num: int) -> PageReport:
reader = _get_cached_pdf_reader(local_pdf_path)
page = reader.pages[page_num - 1]
resources = page.get("/Resources", {})
xobjects = resources.get("/XObject", {})
text_elements, image_elements = [], []
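The lru_cache on _get_cached_pdf_reader means repeated calls with the same path reuse one parsed PdfReader, which is what makes per-page iteration over the same file cheap. A minimal illustration of that behavior, using a hypothetical path:

# Hypothetical illustration: the cache keys on the path string, so the same
# path returns the same PdfReader object and the file is parsed only once.
r1 = _get_cached_pdf_reader("/tmp/example.pdf")
r2 = _get_cached_pdf_reader("/tmp/example.pdf")
assert r1 is r2  # cache hit, no second parse

One caveat of this design: if the file at a cached path changes on disk, the stale reader is still served until the entry is evicted (maxsize=5).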

View File

@ -2,11 +2,12 @@ import unittest
import os
import json
import io
import glob
from pypdf import PdfReader
from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text
from pdelfin.data.renderpdf import get_pdf_media_box_width_height
class AnchorTest(unittest.TestCase):
def testExtractText(self):
@ -103,8 +104,6 @@ class AnchorTest(unittest.TestCase):
self.assertLess(len(anchor_text), 4000)
class BuildSilverTest(unittest.TestCase):
def testSmallPage(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")
@ -124,4 +123,17 @@ class BuildSilverTest(unittest.TestCase):
print(width, height)
assert max(width, height) == 2048
class TestRenderPdf(unittest.TestCase):
def testFastMediaBoxMatchesPyPdf(self):
for file in glob.glob(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "*.pdf")):
reader = PdfReader(file)
print("checking", file)
for page_num in range(1, len(reader.pages) + 1):
w1, h1 = get_pdf_media_box_width_height(file, page_num)
pypdfpage = reader.pages[page_num - 1]
self.assertEqual(w1, pypdfpage.mediabox.width)
self.assertEqual(h1, pypdfpage.mediabox.height)