new version of sglang, server restarts, semaphore timeouts

2025-11-05 20:38:40 +00:00 · 2024-11-12 10:49:13 -08:00 · 2024-11-12 10:49:13 -08:00 · 102c0e4cfc
commit 102c0e4cfc
parent 918e2f3542
2 changed files with 40 additions and 8 deletions
--- a/pdelfin/beakerpipeline.py
+++ b/pdelfin/beakerpipeline.py
@ -237,7 +237,9 @@ async def process_page(args, session: aiohttp.ClientSession, pdf_s3_path: str, p
    COMPLETION_URL = "http://localhost:30000/v1/chat/completions"
    MAX_RETRIES = 3
    
-    for attempt in range(1, MAX_RETRIES + 1):
+    attempt = 0
+
+    while attempt < MAX_RETRIES:
        query = await build_page_query(
            pdf_local_path,
            page_num,
@ -267,11 +269,20 @@ async def process_page(args, session: aiohttp.ClientSession, pdf_s3_path: str, p
                    output_tokens=base_response_data["usage"].get("completion_tokens", 0)
                )
        except aiohttp.ClientError as e:
-            logger.warning(f"Client error on attempt {attempt} for {pdf_s3_path}-{page_num}:: {e}")
+            logger.warning(f"Client error on attempt {attempt} for {pdf_s3_path}-{page_num}: {e}")
+            
+            # Now we want to do exponential backoff, and not count this as an actual page retry
+            # Page retrys are supposed to be for fixing bad results from the model, but actual requests to sglang 
+            # are supposed to work. Probably this means that the server is just restarting
+            logger.info(f"Sleeping for 5 seconds on {pdf_s3_path}-{page_num} to allow server restart")
+            await asyncio.sleep(5)
+
        except json.JSONDecodeError as e:
            logger.warning(f"JSON decode error on attempt {attempt} for {pdf_s3_path}-{page_num}: {e}")
+            attempt += 1
        except Exception as e:
-            logger.warning(f"Unexpected error on attempt {attempt} for {pdf_s3_path}-{page_num}:: {e}")
+            logger.warning(f"Unexpected error on attempt {attempt} for {pdf_s3_path}-{page_num}: {e}")
+            attempt += 1

        if attempt >= MAX_RETRIES:
            logger.error(f"Failed to process {pdf_s3_path}-{page_num} after {MAX_RETRIES} attempts.")
@ -429,25 +440,41 @@ async def sglang_server_task(args, semaphore):

    atexit.register(_kill_proc)

-    last_queue_req = None  # To track transitions
+    last_running_req, last_queue_req = 0, 0 # To track transitions
+    can_release_automatically = False
+    last_semaphore_release = time.time()
    async def process_line(line):
        sglang_logger.info(line)

+        match = re.search(r'#running-req: (\d+)', line)
+        if match:
+            last_running_req = int(match.group(1))
+
+            if last_running_req > 0:
+                can_release_automatically = True
+        
        # Parse the line and update semaphore if necessary
        match = re.search(r'#queue-req: (\d+)', line)
        if match:
            queue_req = int(match.group(1))
-            logger.info(f"sglang queue req: {queue_req}")
+            logger.info(f"sglang running req: {last_running_req} queue req: {queue_req}")

            nonlocal last_queue_req
-            if last_queue_req is not None and last_queue_req != 0 and queue_req == 0:
+            if last_queue_req != 0 and queue_req == 0:
                # Release the semaphore when queue_req transitions from non-zero to zero
                if semaphore.locked():
                    semaphore.release()
+                    last_semaphore_release = time.time()
                    logger.info("Semaphore released, allowing a worker to proceed.")

            last_queue_req = queue_req

+        # And have a semaphore release automatically if there are no running requests for > 30 seconds
+        if last_running_req == 0 and time.time() - last_semaphore_release > 30 and semaphore.locked():
+            semaphore.release()
+            last_semaphore_release = time.time()
+            logger.info("Semaphore released due to timeout, allowing a worker to proceed.")
+
    async def read_stream(stream):
        while True:
            line = await stream.readline()
@ -465,6 +492,11 @@ async def sglang_server_task(args, semaphore):
    await stderr_task


+async def sglang_server_host(args, semaphore):
+    while True:
+        await sglang_server_task(args, semaphore)
+
+
 async def sglang_server_ready():
    max_attempts = 300
    delay_sec = 1
@ -528,7 +560,7 @@ async def main():
    # As soon as one worker is no longer saturating the gpu, the next one can start sending requests
    semaphore = asyncio.Semaphore(1)

-    sglang_server = asyncio.create_task(sglang_server_task(args, semaphore))
+    sglang_server = asyncio.create_task(sglang_server_host(args, semaphore))

    work_queue = await load_pdf_work_queue(args)
    logger.info(f"Work queue prepared with {work_queue.qsize()} items")
--- a/pyproject.toml
+++ b/pyproject.toml
@ -68,7 +68,7 @@ dev = [
 ]

 inference = [
-    "sglang[all]"
+    { git = "https://github.com/sgl-project/sglang.git", rev = "eff468dd5a3d24646560eb044276585f7a11ac3c", subdirectory = "python", extras = ["all"] }
 ]

 train = [