mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-11 08:12:22 +00:00
Better context manager and cleanup of old browser instances
This commit is contained in:
parent
7e786c79c5
commit
780bc7d934
@ -19,6 +19,7 @@ import re
|
||||
import sqlite3
|
||||
import threading
|
||||
import unittest
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional
|
||||
|
||||
@ -125,10 +126,6 @@ equation_cache = EquationCache()
|
||||
# --- End SQLite Cache Implementation ---
|
||||
|
||||
|
||||
# Thread-local storage for Playwright and browser instances
|
||||
_thread_local = threading.local()
|
||||
|
||||
|
||||
@dataclass
|
||||
class BoundingBox:
|
||||
x: float
|
||||
@ -158,21 +155,63 @@ def get_equation_hash(equation, bg_color="white", text_color="black", font_size=
|
||||
return hashlib.sha1(params_str.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def init_browser():
|
||||
# Per-thread state used by browser_context(): the started Playwright driver
# (`playwright`), the launched Chromium browser (`browser`) and the number of
# times the browser has been handed out since it was launched (`usage_count`).
# Browser contexts themselves are created per call and are not stored here.
_thread_local = threading.local()
|
||||
|
||||
|
||||
# Number of contexts served by one browser before it is torn down and relaunched.
_MAX_BROWSER_USES = 100


def _start_browser():
    """Start Playwright and launch Chromium for the current thread.

    The thread-local attributes are only assigned once both steps have
    succeeded, so a failed launch never leaves a half-initialized state behind.
    """
    playwright = sync_playwright().start()
    try:
        browser = playwright.chromium.launch()
    except Exception:
        # The driver is already running at this point; stop it so it does not
        # leak, then let the caller see the original launch error.
        try:
            playwright.stop()
        except Exception:
            pass  # Best effort: the launch failure is the error worth reporting
        raise

    _thread_local.playwright = playwright
    _thread_local.browser = browser
    _thread_local.usage_count = 0


def _stop_browser():
    """Best-effort shutdown of the current thread's browser and Playwright driver."""
    for attr, shutdown in (("browser", "close"), ("playwright", "stop")):
        handle = getattr(_thread_local, attr, None)
        if handle is None:
            continue
        # Drop the reference first so the next call re-initializes even if the
        # shutdown call below fails.
        delattr(_thread_local, attr)
        try:
            getattr(handle, shutdown)()
        except Exception:
            pass  # Ignore errors during cleanup


@contextmanager
def browser_context():
    """
    Context manager for Playwright browser instances.

    Yields a new browser context (800x400 viewport) created from a browser that
    is cached per thread, and closes that context when the ``with`` block exits.
    The browser and Playwright driver are started lazily on first use and are
    shut down and relaunched after every ``_MAX_BROWSER_USES`` uses to prevent
    memory leaks.

    Raises:
        Whatever Playwright raises if the driver cannot be started, the browser
        cannot be launched, or the context cannot be created. A failed launch
        leaves no thread-local state behind, so a later call retries cleanly.
    """
    if not hasattr(_thread_local, "browser"):
        # First use on this thread (or a previous start-up failed part-way).
        _stop_browser()
        _start_browser()
    elif _thread_local.usage_count >= _MAX_BROWSER_USES:
        print("Cleaning up old playwright instance to prevent memory leaks...")
        _stop_browser()
        _start_browser()

    _thread_local.usage_count += 1

    # Create a new context for this operation
    context = _thread_local.browser.new_context(viewport={"width": 800, "height": 400})
    try:
        yield context
    finally:
        # Clean up the context after use
        context.close()
|
||||
|
||||
|
||||
def render_equation(
|
||||
@ -207,11 +246,10 @@ def render_equation(
|
||||
if not os.path.exists(katex_css_path) or not os.path.exists(katex_js_path):
|
||||
raise FileNotFoundError(f"KaTeX files not found. Please ensure katex.min.css and katex.min.js are in {script_dir}")
|
||||
|
||||
# Get the browser instance for the current thread.
|
||||
browser = get_browser()
|
||||
|
||||
# Use the browser context manager
|
||||
with browser_context() as context:
|
||||
# Create a new page.
|
||||
page = browser.new_page(viewport={"width": 800, "height": 400})
|
||||
page = context.new_page()
|
||||
|
||||
# Basic HTML structure for rendering.
|
||||
page_html = f"""
|
||||
|
Loading…
x
Reference in New Issue
Block a user