Some qwen25 work

Jake Poznanski 2025-05-14 20:59:22 +00:00
parent a85571c047
commit 725aa834fb
3 changed files with 39 additions and 22 deletions

@@ -22,6 +22,7 @@ async def run_server(
     target_longest_image_dim: int = 1024,
     prompt_template: Literal["full", "basic", "finetune"] = "finetune",
     response_template: Literal["plain", "json"] = "json",
+    prompt_image_first: bool = False,
 ) -> str:
     """
     Convert page of a PDF file to markdown by calling a request
@@ -48,20 +48,36 @@ async def run_server(
     else:
         raise ValueError("Unknown prompt template")

-    request = {
-        "model": model,
-        "messages": [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": prompt},
-                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
-                ],
-            }
-        ],
-        "temperature": temperature,
-        "max_tokens": 3000,
-    }
+    if prompt_image_first:
+        request = {
+            "model": model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                        {"type": "text", "text": prompt},
+                    ],
+                }
+            ],
+            "temperature": temperature,
+            "max_tokens": 3000,
+        }
+    else:
+        request = {
+            "model": model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                    ],
+                }
+            ],
+            "temperature": temperature,
+            "max_tokens": 3000,
+        }

     # Make request and get response using httpx
     url = f"http://{server}/v1/chat/completions"

@@ -138,8 +138,8 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_
             {
                 "role": "user",
                 "content": [
-                    {"type": "text", "text": build_finetuning_prompt(anchor_text)},
                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                    {"type": "text", "text": build_finetuning_prompt(anchor_text)},
                 ],
             }
         ],
@@ -500,7 +500,7 @@ async def worker(args, work_queue: WorkQueue, semaphore, worker_id):
 async def sglang_server_task(model_name_or_path, args, semaphore):
     # Check GPU memory, lower mem devices need a bit less KV cache space because the VLM takes additional memory
     gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert to GB
-    mem_fraction_arg = ["--mem-fraction-static", "0.80"] if gpu_memory < 60 else []
+    mem_fraction_arg = ["--mem-fraction-static", "0.70"] if gpu_memory < 60 else []

     cmd = [
         "python3",

@@ -98,7 +98,7 @@ def openai_response_format_schema() -> dict:

 # This is a base prompt that will be used for training and running the fine tuned model
 # It's simplified from the prompt which was used to generate the silver data, and can change from dataset to dataset
-def build_finetuning_prompt_qwen2(base_text: str) -> str:
+def build_finetuning_prompt(base_text: str) -> str:
     return (
         f"Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. "
         f"Just return the plain text representation of this document as if you were reading it naturally.\n"
@@ -107,11 +107,11 @@ def build_finetuning_prompt_qwen2(base_text: str) -> str:
     )

 # This is the new fine tuning prompt we are trying for qwen2.5 vl
-def build_finetuning_prompt(base_text: str) -> str:
-    return (
-        f"Below is the image of one page of a document. "
-        f"Just return the plain text representation of this document as if you were reading it naturally.\n"
-    )
+# def build_finetuning_prompt(base_text: str) -> str:
+#     return (
+#         f"Below is the image of one page of a document. "
+#         f"Just return the plain text representation of this document as if you were reading it naturally.\n"
+#     )

 # Extracts the anchor text component from an existing prompt string
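A small usage sketch tying the renamed prompt builder back to the message layout from the build_page_query hunk above; it assumes it runs alongside the definitions in this file, and the anchor text and base64 image are placeholder values:

# Placeholders standing in for the PDF anchor text and the rendered page image.
anchor_text = "EXAMPLE RAW TEXT PREVIOUSLY EXTRACTED FOR THIS PAGE"
image_base64 = "<base64-encoded PNG of the rendered page>"

# After this commit, build_finetuning_prompt refers to the prompt that mentions the
# previously extracted raw text, and the page image is placed before the text part.
message = {
    "role": "user",
    "content": [
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
        {"type": "text", "text": build_finetuning_prompt(anchor_text)},
    ],
}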