mirror of
https://github.com/allenai/olmocr.git
synced 2026-01-04 03:04:45 +00:00
More stats hopefully running faster
This commit is contained in:
parent
350061906e
commit
6d53683001
@ -496,8 +496,17 @@ def build_pdf_queries(s3_workspace: str, pdf: DatabaseManager.PDFRecord) -> list
|
||||
if any(page.is_usable() and page.page_num == target_page_num for page in existing_pages):
|
||||
continue
|
||||
|
||||
# TODO: Later, you may want to retry with different sampling parameters or do something else
|
||||
new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
|
||||
has_errored_previously = sum(page.page_num == target_page_num for page in existing_pages)
|
||||
|
||||
if has_errored_previously:
|
||||
# TODO For now this just retries the page 3 times, which is nothing special
|
||||
new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
|
||||
new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
|
||||
new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
|
||||
|
||||
# But you can try to do some fancier things, such as rotating the page, removing the pdf hints altogether, etc.
|
||||
else:
|
||||
new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
|
||||
except Exception as ex:
|
||||
print(f"Warning, could not get batch inferences lines for {pdf.s3_path} due to {ex}")
|
||||
|
||||
@ -670,8 +679,6 @@ if __name__ == '__main__':
|
||||
# For each round, outputs a report of how many pages were processed, how many had errors, and a breakdown by (error, finish_reason)
|
||||
total_rounds = db.get_last_indexed_round() + 1
|
||||
for round_num in range(total_rounds):
|
||||
print(f"\nStatistics for round {round_num}:")
|
||||
|
||||
db.cursor.execute("""
|
||||
SELECT COUNT(*), error, finish_reason
|
||||
FROM page_results
|
||||
@ -682,13 +689,12 @@ if __name__ == '__main__':
|
||||
results = db.cursor.fetchall()
|
||||
|
||||
total_pages = sum(count for count, _, _ in results)
|
||||
print(f"Total pages processed: {total_pages:,}")
|
||||
print(f"\nInference Round {round_num} - {total_pages:,} pages processed:")
|
||||
|
||||
for count, error, finish_reason in results:
|
||||
error_str = error if error is not None else "None"
|
||||
print(f" (error: {error_str}, finish_reason: {finish_reason}) -> {count:,} pages")
|
||||
|
||||
|
||||
print("\nWork finished, waiting for all workers to finish cleaning up")
|
||||
executor.shutdown(wait=True)
|
||||
db.close()
|
||||
|
||||
@ -2,14 +2,42 @@ import subprocess
|
||||
import base64
|
||||
import io
|
||||
from pypdf import PdfReader
|
||||
|
||||
from PIL import Image
|
||||
|
||||
|
||||
def render_pdf_to_base64png(local_pdf_path: str, page: int, target_longest_image_dim: int=2048):
|
||||
pdf = PdfReader(local_pdf_path)
|
||||
pdf_page = pdf.pages[page - 1]
|
||||
longest_dim = max(pdf_page.mediabox.width, pdf_page.mediabox.height)
|
||||
def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[float, float]:
    """
    Get the MediaBox dimensions for a specific page in a PDF file using the pdfinfo command.

    :param local_pdf_path: Path to the PDF file on the local filesystem
    :param page_num: The 1-based page number for which to extract MediaBox dimensions
                     (passed to pdfinfo as both the first and the last page)
    :return: A (width, height) tuple computed from the MediaBox corner coordinates
             reported by pdfinfo
    :raises ValueError: If pdfinfo exits with a non-zero status, if the MediaBox line
                        cannot be parsed, or if no MediaBox line is present in the output
    """
    # -f/-l restrict pdfinfo to the single requested page; -box makes it print the page boxes
    command = ["pdfinfo", "-f", str(page_num), "-l", str(page_num), "-box", local_pdf_path]

    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if result.returncode != 0:
        raise ValueError(f"Error running pdfinfo: {result.stderr}")

    for line in result.stdout.splitlines():
        # Only accept lines whose *key* (the text before the first colon) ends in
        # "MediaBox"; a plain substring test would also match e.g. a document title
        # that happens to mention the word.
        key, sep, value = line.partition(":")
        if not sep or not key.rstrip().endswith("MediaBox"):
            continue

        try:
            x0, y0, x1, y1 = (float(part) for part in value.split()[:4])
        except ValueError as ex:
            # Covers both non-numeric values and fewer than four coordinates
            raise ValueError(f"Could not parse MediaBox from pdfinfo output line: {line!r}") from ex

        # abs() keeps the result positive even if the corners are listed in reverse order
        return abs(x0 - x1), abs(y1 - y0)

    raise ValueError("MediaBox not found in the PDF info.")
|
||||
|
||||
|
||||
def render_pdf_to_base64png(local_pdf_path: str, page_num: int, target_longest_image_dim: int=2048):
|
||||
longest_dim = max(get_pdf_media_box_width_height(local_pdf_path, page_num))
|
||||
|
||||
# Convert PDF page to PNG using pdftoppm
|
||||
pdftoppm_result = subprocess.run(
|
||||
@ -17,9 +45,9 @@ def render_pdf_to_base64png(local_pdf_path: str, page: int, target_longest_image
|
||||
"pdftoppm",
|
||||
"-png",
|
||||
"-f",
|
||||
str(page),
|
||||
str(page_num),
|
||||
"-l",
|
||||
str(page),
|
||||
str(page_num),
|
||||
"-r",
|
||||
str(target_longest_image_dim * 72 / longest_dim), # 72 pixels per point is the conversion factor
|
||||
local_pdf_path,
|
||||
|
||||
@ -13,6 +13,7 @@ import re
|
||||
import ftfy
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, List
|
||||
from functools import lru_cache
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
import pymupdf
|
||||
@ -119,10 +120,14 @@ class PageReport:
|
||||
text_elements: List[TextElement]
|
||||
image_elements: List[ImageElement]
|
||||
|
||||
@lru_cache(maxsize=5)
def _get_cached_pdf_reader(local_pdf_path: str) -> PdfReader:
    """
    Return a PdfReader for ``local_pdf_path``, memoized by path via ``lru_cache``.

    Callers frequently iterate through every page of the same PDF, so reusing
    the reader makes the subsequent iterations a lot faster. At most 5 readers
    are kept (``maxsize=5``).
    """
    reader = PdfReader(local_pdf_path)
    return reader
|
||||
|
||||
def _pdf_report(local_pdf_path: str, page: int) -> PageReport:
|
||||
reader = PdfReader(local_pdf_path)
|
||||
page = reader.pages[page - 1]
|
||||
def _pdf_report(local_pdf_path: str, page_num: int) -> PageReport:
|
||||
reader = _get_cached_pdf_reader(local_pdf_path)
|
||||
page = reader.pages[page_num - 1]
|
||||
resources = page.get("/Resources", {})
|
||||
xobjects = resources.get("/XObject", {})
|
||||
text_elements, image_elements = [], []
|
||||
|
||||
@ -2,11 +2,12 @@ import unittest
|
||||
import os
|
||||
import json
|
||||
import io
|
||||
import glob
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text
|
||||
|
||||
from pdelfin.data.renderpdf import get_pdf_media_box_width_height
|
||||
|
||||
class AnchorTest(unittest.TestCase):
|
||||
def testExtractText(self):
|
||||
@ -103,8 +104,6 @@ class AnchorTest(unittest.TestCase):
|
||||
self.assertLess(len(anchor_text), 4000)
|
||||
|
||||
|
||||
|
||||
|
||||
class BuildSilverTest(unittest.TestCase):
|
||||
def testSmallPage(self):
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")
|
||||
@ -124,4 +123,17 @@ class BuildSilverTest(unittest.TestCase):
|
||||
|
||||
print(width, height)
|
||||
|
||||
assert max(width, height) == 2048
|
||||
assert max(width, height) == 2048
|
||||
|
||||
class TestRenderPdf(unittest.TestCase):
    """Cross-checks the pdfinfo-based MediaBox lookup against pypdf."""

    def testFastMediaBoxMatchesPyPdf(self):
        """Each page's size from get_pdf_media_box_width_height should agree with pypdf's mediabox."""
        pdf_paths = glob.glob(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "*.pdf"))

        # Without this the test would pass vacuously if the sample directory were missing or empty
        self.assertTrue(pdf_paths, "no sample PDFs found in gnarly_pdfs")

        for pdf_path in pdf_paths:
            reader = PdfReader(pdf_path)
            print("checking", pdf_path)

            for page_num, pypdf_page in enumerate(reader.pages, start=1):
                # subTest reports which file/page disagreed instead of stopping at the first failure
                with self.subTest(pdf=pdf_path, page=page_num):
                    w1, h1 = get_pdf_media_box_width_height(pdf_path, page_num)

                    # pdfinfo prints box coordinates with two decimals, so an exact float
                    # comparison against pypdf's unrounded values can fail spuriously;
                    # compare within the rounding error of the two corner coordinates.
                    self.assertAlmostEqual(w1, float(pypdf_page.mediabox.width), delta=0.01)
                    self.assertAlmostEqual(h1, float(pypdf_page.mediabox.height), delta=0.01)
|
||||
Loading…
x
Reference in New Issue
Block a user