mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-10 15:52:31 +00:00
Better context manager and cleanup of old browser instances
This commit is contained in:
parent
7e786c79c5
commit
780bc7d934
@ -19,6 +19,7 @@ import re
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
import threading
|
import threading
|
||||||
import unittest
|
import unittest
|
||||||
|
from contextlib import contextmanager
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
@ -125,10 +126,6 @@ equation_cache = EquationCache()
|
|||||||
# --- End SQLite Cache Implementation ---
|
# --- End SQLite Cache Implementation ---
|
||||||
|
|
||||||
|
|
||||||
# Thread-local storage for Playwright and browser instances
|
|
||||||
_thread_local = threading.local()
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class BoundingBox:
|
class BoundingBox:
|
||||||
x: float
|
x: float
|
||||||
@ -158,21 +155,63 @@ def get_equation_hash(equation, bg_color="white", text_color="black", font_size=
|
|||||||
return hashlib.sha1(params_str.encode("utf-8")).hexdigest()
|
return hashlib.sha1(params_str.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def init_browser():
|
# Thread-local storage for browser contexts
|
||||||
|
_thread_local = threading.local()
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def browser_context():
|
||||||
"""
|
"""
|
||||||
Initialize the Playwright and browser instance for the current thread if not already done.
|
Context manager for Playwright browser instances.
|
||||||
|
Returns a browser context that can be used for rendering equations.
|
||||||
|
Automatically handles initialization and cleanup.
|
||||||
|
Cleans up and recreates the browser every 100 uses to prevent memory leaks.
|
||||||
"""
|
"""
|
||||||
|
# Initialize usage counter if not present
|
||||||
|
if not hasattr(_thread_local, "usage_count"):
|
||||||
|
_thread_local.usage_count = 0
|
||||||
|
|
||||||
|
# Check if we already have a browser for this thread
|
||||||
if not hasattr(_thread_local, "playwright"):
|
if not hasattr(_thread_local, "playwright"):
|
||||||
_thread_local.playwright = sync_playwright().start()
|
_thread_local.playwright = sync_playwright().start()
|
||||||
_thread_local.browser = _thread_local.playwright.chromium.launch()
|
_thread_local.browser = _thread_local.playwright.chromium.launch()
|
||||||
|
_thread_local.usage_count = 0
|
||||||
|
|
||||||
|
# Increment usage counter
|
||||||
|
_thread_local.usage_count += 1
|
||||||
|
|
||||||
def get_browser():
|
# Check if we need to clean up the browser (every 100 uses)
|
||||||
"""
|
if _thread_local.usage_count > 100:
|
||||||
Return the browser instance for the current thread.
|
# Clean up the old browser completely
|
||||||
"""
|
print("Cleanup up old playwright instance to prevent memory leaks...")
|
||||||
init_browser()
|
|
||||||
return _thread_local.browser
|
try:
|
||||||
|
_thread_local.browser.close()
|
||||||
|
except Exception:
|
||||||
|
pass # Ignore errors during cleanup
|
||||||
|
|
||||||
|
try:
|
||||||
|
_thread_local.playwright.stop()
|
||||||
|
except Exception:
|
||||||
|
pass # Ignore errors during cleanup
|
||||||
|
|
||||||
|
# Remove the attributes to force re-initialization
|
||||||
|
delattr(_thread_local, "playwright")
|
||||||
|
delattr(_thread_local, "browser")
|
||||||
|
|
||||||
|
# Re-initialize with a fresh browser
|
||||||
|
_thread_local.playwright = sync_playwright().start()
|
||||||
|
_thread_local.browser = _thread_local.playwright.chromium.launch()
|
||||||
|
_thread_local.usage_count = 1 # Reset to 1 since we just used it
|
||||||
|
|
||||||
|
# Create a new context for this operation
|
||||||
|
context = _thread_local.browser.new_context(viewport={"width": 800, "height": 400})
|
||||||
|
|
||||||
|
try:
|
||||||
|
yield context
|
||||||
|
finally:
|
||||||
|
# Clean up the context after use
|
||||||
|
context.close()
|
||||||
|
|
||||||
|
|
||||||
def render_equation(
|
def render_equation(
|
||||||
@ -207,155 +246,154 @@ def render_equation(
|
|||||||
if not os.path.exists(katex_css_path) or not os.path.exists(katex_js_path):
|
if not os.path.exists(katex_css_path) or not os.path.exists(katex_js_path):
|
||||||
raise FileNotFoundError(f"KaTeX files not found. Please ensure katex.min.css and katex.min.js are in {script_dir}")
|
raise FileNotFoundError(f"KaTeX files not found. Please ensure katex.min.css and katex.min.js are in {script_dir}")
|
||||||
|
|
||||||
# Get the browser instance for the current thread.
|
# Use the browser context manager
|
||||||
browser = get_browser()
|
with browser_context() as context:
|
||||||
|
# Create a new page.
|
||||||
|
page = context.new_page()
|
||||||
|
|
||||||
# Create a new page.
|
# Basic HTML structure for rendering.
|
||||||
page = browser.new_page(viewport={"width": 800, "height": 400})
|
page_html = f"""
|
||||||
|
<!DOCTYPE html>
|
||||||
# Basic HTML structure for rendering.
|
<html>
|
||||||
page_html = f"""
|
<head>
|
||||||
<!DOCTYPE html>
|
<style>
|
||||||
<html>
|
body {{
|
||||||
<head>
|
display: flex;
|
||||||
<style>
|
justify-content: center;
|
||||||
body {{
|
align-items: center;
|
||||||
display: flex;
|
height: 100vh;
|
||||||
justify-content: center;
|
margin: 0;
|
||||||
align-items: center;
|
background-color: {bg_color};
|
||||||
height: 100vh;
|
color: {text_color};
|
||||||
margin: 0;
|
}}
|
||||||
background-color: {bg_color};
|
#equation-container {{
|
||||||
color: {text_color};
|
padding: 0;
|
||||||
}}
|
font-size: {font_size}px;
|
||||||
#equation-container {{
|
}}
|
||||||
padding: 0;
|
</style>
|
||||||
font-size: {font_size}px;
|
</head>
|
||||||
}}
|
<body>
|
||||||
</style>
|
<div id="equation-container"></div>
|
||||||
</head>
|
</body>
|
||||||
<body>
|
</html>
|
||||||
<div id="equation-container"></div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
"""
|
|
||||||
page.set_content(page_html)
|
|
||||||
page.add_style_tag(path=katex_css_path)
|
|
||||||
page.add_script_tag(path=katex_js_path)
|
|
||||||
page.wait_for_load_state("networkidle")
|
|
||||||
|
|
||||||
katex_loaded = page.evaluate("typeof katex !== 'undefined'")
|
|
||||||
if not katex_loaded:
|
|
||||||
page.close()
|
|
||||||
raise RuntimeError("KaTeX library failed to load. Check your katex.min.js file.")
|
|
||||||
|
|
||||||
try:
|
|
||||||
error_message = page.evaluate(
|
|
||||||
f"""
|
|
||||||
() => {{
|
|
||||||
try {{
|
|
||||||
katex.render({escaped_equation}, document.getElementById("equation-container"), {{
|
|
||||||
displayMode: true,
|
|
||||||
throwOnError: true
|
|
||||||
}});
|
|
||||||
return null;
|
|
||||||
}} catch (error) {{
|
|
||||||
console.error("KaTeX error:", error.message);
|
|
||||||
return error.message;
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
"""
|
"""
|
||||||
)
|
page.set_content(page_html)
|
||||||
except PlaywrightError as ex:
|
page.add_style_tag(path=katex_css_path)
|
||||||
print(escaped_equation)
|
page.add_script_tag(path=katex_js_path)
|
||||||
error_message = str(ex)
|
page.wait_for_load_state("networkidle")
|
||||||
page.close()
|
|
||||||
raise
|
|
||||||
|
|
||||||
if error_message:
|
katex_loaded = page.evaluate("typeof katex !== 'undefined'")
|
||||||
print(f"Error rendering equation: '{equation}'")
|
if not katex_loaded:
|
||||||
print(error_message)
|
page.close()
|
||||||
# Cache the error result so we don't retry it next time.
|
raise RuntimeError("KaTeX library failed to load. Check your katex.min.js file.")
|
||||||
rendered_eq = RenderedEquation(mathml=error_message, spans=[], error=error_message)
|
|
||||||
if use_cache:
|
|
||||||
equation_cache.save(eq_hash, rendered_eq)
|
|
||||||
page.close()
|
|
||||||
return rendered_eq
|
|
||||||
|
|
||||||
page.wait_for_selector(".katex", state="attached")
|
try:
|
||||||
|
error_message = page.evaluate(
|
||||||
|
f"""
|
||||||
|
() => {{
|
||||||
|
try {{
|
||||||
|
katex.render({escaped_equation}, document.getElementById("equation-container"), {{
|
||||||
|
displayMode: true,
|
||||||
|
throwOnError: true
|
||||||
|
}});
|
||||||
|
return null;
|
||||||
|
}} catch (error) {{
|
||||||
|
console.error("KaTeX error:", error.message);
|
||||||
|
return error.message;
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
except PlaywrightError as ex:
|
||||||
|
print(escaped_equation)
|
||||||
|
error_message = str(ex)
|
||||||
|
page.close()
|
||||||
|
raise
|
||||||
|
|
||||||
if debug_dom:
|
if error_message:
|
||||||
katex_dom_html = page.evaluate(
|
print(f"Error rendering equation: '{equation}'")
|
||||||
|
print(error_message)
|
||||||
|
# Cache the error result so we don't retry it next time.
|
||||||
|
rendered_eq = RenderedEquation(mathml=error_message, spans=[], error=error_message)
|
||||||
|
if use_cache:
|
||||||
|
equation_cache.save(eq_hash, rendered_eq)
|
||||||
|
page.close()
|
||||||
|
return rendered_eq
|
||||||
|
|
||||||
|
page.wait_for_selector(".katex", state="attached")
|
||||||
|
|
||||||
|
if debug_dom:
|
||||||
|
katex_dom_html = page.evaluate(
|
||||||
|
"""
|
||||||
|
() => {
|
||||||
|
return document.getElementById("equation-container").innerHTML;
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
print("\n===== KaTeX DOM HTML =====")
|
||||||
|
print(katex_dom_html)
|
||||||
|
|
||||||
|
# Extract inner-most spans with non-whitespace text.
|
||||||
|
spans_info = page.evaluate(
|
||||||
"""
|
"""
|
||||||
() => {
|
() => {
|
||||||
return document.getElementById("equation-container").innerHTML;
|
const spans = Array.from(document.querySelectorAll('span'));
|
||||||
|
const list = [];
|
||||||
|
spans.forEach(span => {
|
||||||
|
if (span.children.length === 0 && /\\S/.test(span.textContent)) {
|
||||||
|
const rect = span.getBoundingClientRect();
|
||||||
|
list.push({
|
||||||
|
text: span.textContent.trim(),
|
||||||
|
boundingBox: {
|
||||||
|
x: rect.x,
|
||||||
|
y: rect.y,
|
||||||
|
width: rect.width,
|
||||||
|
height: rect.height
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return list;
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
print("\n===== KaTeX DOM HTML =====")
|
|
||||||
print(katex_dom_html)
|
|
||||||
|
|
||||||
# Extract inner-most spans with non-whitespace text.
|
if debug_dom:
|
||||||
spans_info = page.evaluate(
|
print("\n===== Extracted Span Information =====")
|
||||||
|
print(spans_info)
|
||||||
|
|
||||||
|
# Extract MathML output (if available) from the KaTeX output.
|
||||||
|
mathml = page.evaluate(
|
||||||
|
"""
|
||||||
|
() => {
|
||||||
|
const mathElem = document.querySelector('.katex-mathml math');
|
||||||
|
return mathElem ? mathElem.outerHTML : "";
|
||||||
|
}
|
||||||
"""
|
"""
|
||||||
() => {
|
)
|
||||||
const spans = Array.from(document.querySelectorAll('span'));
|
|
||||||
const list = [];
|
|
||||||
spans.forEach(span => {
|
|
||||||
if (span.children.length === 0 && /\\S/.test(span.textContent)) {
|
|
||||||
const rect = span.getBoundingClientRect();
|
|
||||||
list.push({
|
|
||||||
text: span.textContent.trim(),
|
|
||||||
boundingBox: {
|
|
||||||
x: rect.x,
|
|
||||||
y: rect.y,
|
|
||||||
width: rect.width,
|
|
||||||
height: rect.height
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
});
|
|
||||||
return list;
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
|
|
||||||
if debug_dom:
|
page.close()
|
||||||
print("\n===== Extracted Span Information =====")
|
|
||||||
print(spans_info)
|
|
||||||
|
|
||||||
# Extract MathML output (if available) from the KaTeX output.
|
rendered_eq = RenderedEquation(
|
||||||
mathml = page.evaluate(
|
mathml=mathml,
|
||||||
"""
|
spans=[
|
||||||
() => {
|
SpanInfo(
|
||||||
const mathElem = document.querySelector('.katex-mathml math');
|
text=s["text"],
|
||||||
return mathElem ? mathElem.outerHTML : "";
|
bounding_box=BoundingBox(
|
||||||
}
|
x=s["boundingBox"]["x"],
|
||||||
"""
|
y=s["boundingBox"]["y"],
|
||||||
)
|
width=s["boundingBox"]["width"],
|
||||||
|
height=s["boundingBox"]["height"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
for s in spans_info
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
page.close()
|
# Save the successfully rendered equation to the SQLite cache.
|
||||||
|
if use_cache:
|
||||||
rendered_eq = RenderedEquation(
|
equation_cache.save(eq_hash, rendered_eq)
|
||||||
mathml=mathml,
|
return rendered_eq
|
||||||
spans=[
|
|
||||||
SpanInfo(
|
|
||||||
text=s["text"],
|
|
||||||
bounding_box=BoundingBox(
|
|
||||||
x=s["boundingBox"]["x"],
|
|
||||||
y=s["boundingBox"]["y"],
|
|
||||||
width=s["boundingBox"]["width"],
|
|
||||||
height=s["boundingBox"]["height"],
|
|
||||||
),
|
|
||||||
)
|
|
||||||
for s in spans_info
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
# Save the successfully rendered equation to the SQLite cache.
|
|
||||||
if use_cache:
|
|
||||||
equation_cache.save(eq_hash, rendered_eq)
|
|
||||||
return rendered_eq
|
|
||||||
|
|
||||||
|
|
||||||
def compare_rendered_equations(reference: RenderedEquation, hypothesis: RenderedEquation) -> bool:
|
def compare_rendered_equations(reference: RenderedEquation, hypothesis: RenderedEquation) -> bool:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user