From 66d293c1780f6535409bbd27ece9b6f2673159e7 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 28 Apr 2025 15:57:20 -0700 Subject: [PATCH] Decent resume/cv tagging --- scripts/tagging_pipeline.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/scripts/tagging_pipeline.py b/scripts/tagging_pipeline.py index 60c07b7..d9f8023 100644 --- a/scripts/tagging_pipeline.py +++ b/scripts/tagging_pipeline.py @@ -71,7 +71,10 @@ metrics = MetricsKeeper(window=60 * 5) class PIIClassification(BaseModel): - is_resume_or_cv: Optional[bool] = Field(..., description="True if the document is a page from a resume or cv.") + primary_language: str = Field(..., description="Primary language as a two-letter code") + document_type: str = Field(..., description="Basic summary of document type classification") + is_resume_cv: Optional[bool] = Field(..., description="True if the document is a page from a resume or cv") + contains_pii: Optional[bool] = Field(..., description="True if document contains PII") async def _process_single_page(page_text: str) -> PIIClassification: @@ -90,7 +93,7 @@ async def _process_single_page(page_text: str) -> PIIClassification: "type": "text", "text": ( f"{text}\n\n-----------\n" - "Given the text above, determine if the text above is a resume (résumé) or CV. Answer in a simple JSON block." + "Given the text above, determine what type of document it is, and if it's a resume/CV. answer in JSON. The format of your json object should be {'primary_language': str, 'document_type': str, 'is_resume_cv': bool, 'contains_pii': bool}" ), } ], @@ -109,12 +112,12 @@ async def _process_single_page(page_text: str) -> PIIClassification: except Exception as e: logger.warning(f"SGLang network error: {e!s}") metrics.add_metrics(sglang_errors=1) - return PIIClassification(is_resume_or_cv=None) + return PIIClassification(primary_language="en", document_type="unknown", is_resume_cv=None, contains_pii=None) if status != 200: logger.warning(f"SGLang HTTP {status}: {body[:250]!r}") metrics.add_metrics(sglang_errors=1) - return PIIClassification(is_resume_or_cv=None) + return PIIClassification(primary_language="en", document_type="unknown", is_resume_cv=None, contains_pii=None) # ---------- Parse base JSON -------------------------------------------- try: @@ -122,7 +125,7 @@ async def _process_single_page(page_text: str) -> PIIClassification: except json.JSONDecodeError: logger.warning(f"SGLang response is not valid JSON: {body[:250]!r}") metrics.add_metrics(sglang_errors=1) - return PIIClassification(is_resume_or_cv=None) + return PIIClassification(primary_language="en", document_type="unknown", is_resume_cv=None, contains_pii=None) # Token accounting if available if "usage" in base: @@ -137,12 +140,12 @@ async def _process_single_page(page_text: str) -> PIIClassification: except (KeyError, IndexError, AttributeError) as e: logger.warning(f"Missing fields in SGLang response: {e!s}") metrics.add_metrics(sglang_errors=1) - return PIIClassification(is_resume_or_cv=None) + return PIIClassification(primary_language="en", document_type="unknown", is_resume_cv=None, contains_pii=None) if not isinstance(content, str): logger.warning("SGLang `content` is not a string; treating as error.") metrics.add_metrics(sglang_errors=1) - return PIIClassification(is_resume_or_cv=None) + return PIIClassification(primary_language="en", document_type="unknown", is_resume_cv=None, contains_pii=None) try: pii_classification: PIIClassification = PIIClassification.model_validate_json(content) @@ -150,7 +153,7 @@ async def _process_single_page(page_text: str) -> PIIClassification: except ValidationError as e: logger.warning(f"Unable to parse pii classification object: {e!s}") metrics.add_metrics(sglang_errors=1) - return PIIClassification(is_resume_or_cv=None) + return PIIClassification(primary_language="en", document_type="unknown", is_resume_cv=None, contains_pii=None) # Manual simple implementation of HTTP Post @@ -258,7 +261,7 @@ async def process_dolma_document(args, dolma_doc, sem): async with sem: pii_class = await _process_single_page(page_text) - result_attributes[key_name].append([start_pos, end_pos, pii_class.is_resume_or_cv]) + result_attributes[key_name].append([start_pos, end_pos, pii_class.is_resume_cv]) else: result_attributes[key_name].append([start_pos, end_pos, None]) @@ -400,6 +403,7 @@ async def sglang_server_task(model_name_or_path, args, semaphore): str(SGLANG_SERVER_PORT), "--log-level-http", "warning", + "--mem-fraction-static", "0.40" ] proc = await asyncio.create_subprocess_exec( @@ -493,6 +497,8 @@ async def sglang_server_host(model_name_or_path, args, semaphore): MAX_RETRIES = 5 retry = 0 + await asyncio.sleep(1000000) + while retry < MAX_RETRIES: await sglang_server_task(model_name_or_path, args, semaphore) logger.warning("SGLang server task ended")