More stats hopefully running faster

2026-01-04 03:04:45 +00:00 · 2024-10-14 21:37:14 +00:00 · 2024-10-14 21:37:14 +00:00 · 6d53683001
commit 6d53683001
parent 350061906e
4 changed files with 71 additions and 20 deletions
--- a/pdelfin/birrpipeline.py
+++ b/pdelfin/birrpipeline.py
@ -496,8 +496,17 @@ def build_pdf_queries(s3_workspace: str, pdf: DatabaseManager.PDFRecord) -> list
                if any(page.is_usable() and page.page_num == target_page_num for page in existing_pages):
                    continue

-                # TODO: Later, you may want to retry with different sampling parameters or do something else
-                new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
+                has_errored_previously = sum(page.page_num == target_page_num for page in existing_pages)
+
+                if has_errored_previously:
+                    # TODO For now this just retries the page 3 times, which is nothing special
+                    new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
+                    new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
+                    new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
+
+                    # But you can try to do some fancier things, such as rotating the page, removing the pdf hints altogether, etc
+                else:
+                    new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
    except Exception as ex:
        print(f"Warning, could not get batch inferences lines for {pdf.s3_path} due to {ex}")

@ -670,8 +679,6 @@ if __name__ == '__main__':
    # For each round, outputs a report of how many pages were processed, how many had errors, and a breakdown by (error, finish_reason)
    total_rounds = db.get_last_indexed_round() + 1
    for round_num in range(total_rounds):
-        print(f"\nStatistics for round {round_num}:")
-        
        db.cursor.execute("""
            SELECT COUNT(*), error, finish_reason
            FROM page_results
@ -682,13 +689,12 @@ if __name__ == '__main__':
        results = db.cursor.fetchall()
        
        total_pages = sum(count for count, _, _ in results)
-        print(f"Total pages processed: {total_pages:,}")
+        print(f"\nInference Round {round_num} - {total_pages:,} pages processed:")

        for count, error, finish_reason in results:
            error_str = error if error is not None else "None"
            print(f"  (error: {error_str}, finish_reason: {finish_reason}) -> {count:,} pages")

-
    print("\nWork finished, waiting for all workers to finish cleaning up")
    executor.shutdown(wait=True)
    db.close()
--- a/pdelfin/data/renderpdf.py
+++ b/pdelfin/data/renderpdf.py
@ -2,14 +2,42 @@ import subprocess
 import base64
 import io
 from pypdf import PdfReader
-
 from PIL import Image


-def render_pdf_to_base64png(local_pdf_path: str, page: int, target_longest_image_dim: int=2048):
-    pdf = PdfReader(local_pdf_path)
-    pdf_page = pdf.pages[page - 1]
-    longest_dim = max(pdf_page.mediabox.width, pdf_page.mediabox.height)
+def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[float, float]:
+    """
+    Get the MediaBox dimensions for a specific page in a PDF file using the pdfinfo command.
+
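+    :param local_pdf_path: Path to the PDF file
+    :param page_num: The 1-based page number for which to extract MediaBox dimensions
+    :return: A (width, height) tuple computed from the page's MediaBox coordinates
+    :raises ValueError: If pdfinfo exits with a non-zero status or no MediaBox line is found in its output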
+    """
+    # Construct the pdfinfo command to extract info for the specific page
+    command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', local_pdf_path]
+    
+    # Run the command using subprocess
+    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    
+    # Check if there is any error in executing the command
+    if result.returncode != 0:
+        raise ValueError(f"Error running pdfinfo: {result.stderr}")
+    
+    # Parse the output to find MediaBox
+    output = result.stdout
+    media_box = None
+    
+    for line in output.splitlines():
+        if 'MediaBox' in line:
+            media_box = line.split(':')[1].strip().split()
+            media_box = [float(x) for x in media_box]
+            return abs(media_box[0] - media_box[2]), abs(media_box[3] - media_box[1])
+    
+    raise ValueError("MediaBox not found in the PDF info.")
+    
+
+def render_pdf_to_base64png(local_pdf_path: str, page_num: int, target_longest_image_dim: int=2048):
+    longest_dim = max(get_pdf_media_box_width_height(local_pdf_path, page_num))

    # Convert PDF page to PNG using pdftoppm
    pdftoppm_result = subprocess.run(
@ -17,9 +45,9 @@ def render_pdf_to_base64png(local_pdf_path: str, page: int, target_longest_image
            "pdftoppm",
            "-png",
            "-f",
-            str(page),
+            str(page_num),
            "-l",
-            str(page),
+            str(page_num),
            "-r",
            str(target_longest_image_dim * 72 / longest_dim), # -r is a DPI value; the MediaBox is in points (72 points per inch), so this scales the longest side to the target pixel size
            local_pdf_path,
--- a/pdelfin/prompts/anchor.py
+++ b/pdelfin/prompts/anchor.py
@ -13,6 +13,7 @@ import re
 import ftfy
 from dataclasses import dataclass
 from typing import Literal, List
+from functools import lru_cache

 import pypdfium2 as pdfium
 import pymupdf
@ -119,10 +120,14 @@ class PageReport:
    text_elements: List[TextElement]
    image_elements: List[ImageElement]

+@lru_cache(maxsize=5)
+def _get_cached_pdf_reader(local_pdf_path: str) -> PdfReader:
+    # Cached because callers often iterate through every page of the same pdf; reusing the reader makes subsequent calls for that file a lot faster
+    return PdfReader(local_pdf_path)

-def _pdf_report(local_pdf_path: str, page: int) -> PageReport:
-    reader = PdfReader(local_pdf_path)
-    page = reader.pages[page - 1]
+def _pdf_report(local_pdf_path: str, page_num: int) -> PageReport:
+    reader = _get_cached_pdf_reader(local_pdf_path)
+    page = reader.pages[page_num - 1]
    resources = page.get("/Resources", {})
    xobjects = resources.get("/XObject", {})
    text_elements, image_elements = [], []
--- a/tests/test_anchor.py
+++ b/tests/test_anchor.py
@ -2,11 +2,12 @@ import unittest
 import os
 import json
 import io
+import glob

 from pypdf import PdfReader

 from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text
-
+from pdelfin.data.renderpdf import get_pdf_media_box_width_height

 class AnchorTest(unittest.TestCase):
    def testExtractText(self):
@ -103,8 +104,6 @@ class AnchorTest(unittest.TestCase):
        self.assertLess(len(anchor_text), 4000)


-
-
 class BuildSilverTest(unittest.TestCase):
    def testSmallPage(self):
        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")
@ -124,4 +123,17 @@ class BuildSilverTest(unittest.TestCase):

        print(width, height)

-        assert max(width, height) == 2048
+        assert max(width, height) == 2048
+
+class TestRenderPdf(unittest.TestCase):
+    def testFastMediaBoxMatchesPyPdf(self):
+        for file in glob.glob(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "*.pdf")):
+            reader = PdfReader(file)
+            print("checking", file)
+            
+            for page_num in range(1, len(reader.pages) + 1):
+                w1, h1 = get_pdf_media_box_width_height(file, page_num)
+                pypdfpage = reader.pages[page_num - 1]
+
+                self.assertEqual(w1, pypdfpage.mediabox.width)
+                self.assertEqual(h1, pypdfpage.mediabox.height)