mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-11 08:12:22 +00:00
Ok, should have a cleaner playwright story on multithreaded environments
This commit is contained in:
parent
1a64728420
commit
4e77ecac3f
@ -19,6 +19,7 @@ import re
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
import threading
|
import threading
|
||||||
import unittest
|
import unittest
|
||||||
|
import weakref
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
@ -159,61 +160,61 @@ def get_equation_hash(equation, bg_color="white", text_color="black", font_size=
|
|||||||
_thread_local = threading.local()
|
_thread_local = threading.local()
|
||||||
|
|
||||||
|
|
||||||
|
def _cleanup_playwright(playwright, browser):
|
||||||
|
print(f"Cleaning up playwright context on {threading.get_ident()} native id {threading.get_native_id()}, on PID {os.getpid()}, {threading.active_count()} active threads")
|
||||||
|
|
||||||
|
try:
|
||||||
|
browser.close()
|
||||||
|
except Exception:
|
||||||
|
pass # Ignore errors during cleanup
|
||||||
|
|
||||||
|
try:
|
||||||
|
playwright.stop()
|
||||||
|
except Exception:
|
||||||
|
pass # Ignore errors during cleanup
|
||||||
|
|
||||||
|
|
||||||
|
class _BrowserOwner:
|
||||||
|
def __init__(self):
|
||||||
|
p = sync_playwright().start()
|
||||||
|
b = p.chromium.launch()
|
||||||
|
self.p = p
|
||||||
|
self.browser = b
|
||||||
|
self._closed = False
|
||||||
|
# Important: don't capture `self` or globals in the finalizer
|
||||||
|
self._finalizer = weakref.finalize(self, _cleanup_playwright, p, b)
|
||||||
|
|
||||||
|
def close_now(self):
|
||||||
|
if not self._closed:
|
||||||
|
self._closed = True
|
||||||
|
self._finalizer() # idempotent; runs at most once
|
||||||
|
|
||||||
|
|
||||||
|
def _get_owner():
|
||||||
|
owner = getattr(_thread_local, "owner", None)
|
||||||
|
if owner is None:
|
||||||
|
owner = _BrowserOwner()
|
||||||
|
_thread_local.owner = owner
|
||||||
|
return owner
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def browser_context():
|
def browser_context():
|
||||||
"""
|
"""
|
||||||
Context manager for Playwright browser instances.
|
Context manager for Playwright browser instances.
|
||||||
Returns a browser context that can be used for rendering equations.
|
Returns a browser context that can be used for rendering equations.
|
||||||
Automatically handles initialization and cleanup.
|
Automatically handles initialization and cleanup.
|
||||||
Cleans up and recreates the browser every 100 uses to prevent memory leaks.
|
|
||||||
"""
|
"""
|
||||||
# Initialize usage counter if not present
|
|
||||||
if not hasattr(_thread_local, "usage_count"):
|
|
||||||
_thread_local.usage_count = 0
|
|
||||||
|
|
||||||
# Check if we already have a browser for this thread
|
|
||||||
if not hasattr(_thread_local, "playwright"):
|
|
||||||
_thread_local.playwright = sync_playwright().start()
|
|
||||||
_thread_local.browser = _thread_local.playwright.chromium.launch()
|
|
||||||
_thread_local.usage_count = 0
|
|
||||||
|
|
||||||
# Increment usage counter
|
|
||||||
_thread_local.usage_count += 1
|
|
||||||
|
|
||||||
print(f"DEBUG: Initializing playwright context on {threading.get_ident()} native id {threading.get_native_id()}, on PID {os.getpid()}, {threading.active_count()} active threads, {_thread_local.usage_count}")
|
|
||||||
|
|
||||||
# Check if we need to clean up the browser (every 100 uses)
|
|
||||||
if _thread_local.usage_count > 100:
|
|
||||||
# Clean up the old browser completely
|
|
||||||
print("Cleanup up old playwright instance to prevent memory leaks...")
|
|
||||||
|
|
||||||
|
owner = _get_owner()
|
||||||
|
ctx = owner.browser.new_context(viewport={"width": 800, "height": 400})
|
||||||
try:
|
try:
|
||||||
_thread_local.browser.close()
|
yield ctx
|
||||||
except Exception:
|
|
||||||
pass # Ignore errors during cleanup
|
|
||||||
|
|
||||||
try:
|
|
||||||
_thread_local.playwright.stop()
|
|
||||||
except Exception:
|
|
||||||
pass # Ignore errors during cleanup
|
|
||||||
|
|
||||||
# Remove the attributes to force re-initialization
|
|
||||||
delattr(_thread_local, "playwright")
|
|
||||||
delattr(_thread_local, "browser")
|
|
||||||
|
|
||||||
# Re-initialize with a fresh browser
|
|
||||||
_thread_local.playwright = sync_playwright().start()
|
|
||||||
_thread_local.browser = _thread_local.playwright.chromium.launch()
|
|
||||||
_thread_local.usage_count = 1 # Reset to 1 since we just used it
|
|
||||||
|
|
||||||
# Create a new context for this operation
|
|
||||||
context = _thread_local.browser.new_context(viewport={"width": 800, "height": 400})
|
|
||||||
|
|
||||||
try:
|
|
||||||
yield context
|
|
||||||
finally:
|
finally:
|
||||||
# Clean up the context after use
|
try:
|
||||||
context.close()
|
ctx.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def render_equation(
|
def render_equation(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user