diff --git a/olmocr/bench/runners/run_server.py b/olmocr/bench/runners/run_server.py
index 4425368..d2646b5 100644
--- a/olmocr/bench/runners/run_server.py
+++ b/olmocr/bench/runners/run_server.py
@@ -22,6 +22,7 @@ async def run_server(
     target_longest_image_dim: int = 1024,
     prompt_template: Literal["full", "basic", "finetune"] = "finetune",
     response_template: Literal["plain", "json"] = "json",
+    prompt_image_first: bool = False,
 ) -> str:
     """
     Convert page of a PDF file to markdown by calling a request
@@ -48,20 +49,36 @@ async def run_server(
     else:
         raise ValueError("Unknown prompt template")
 
-    request = {
-        "model": model,
-        "messages": [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": prompt},
-                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
-                ],
-            }
-        ],
-        "temperature": temperature,
-        "max_tokens": 3000,
-    }
+    if prompt_image_first:
+        request = {
+            "model": model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                        {"type": "text", "text": prompt},
+                    ],
+                }
+            ],
+            "temperature": temperature,
+            "max_tokens": 3000,
+        }
+    else:
+        request = {
+            "model": model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                    ],
+                }
+            ],
+            "temperature": temperature,
+            "max_tokens": 3000,
+        }
 
     # Make request and get response using httpx
     url = f"http://{server}/v1/chat/completions"
diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py
index e14e6bb..a884458 100644
--- a/olmocr/pipeline.py
+++ b/olmocr/pipeline.py
@@ -138,8 +138,8 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_
             {
                 "role": "user",
                 "content": [
-                    {"type": "text", "text": build_finetuning_prompt(anchor_text)},
                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                    {"type": "text", "text": build_finetuning_prompt(anchor_text)},
                 ],
             }
         ],
@@ -500,7 +500,7 @@ async def worker(args, work_queue: WorkQueue, semaphore, worker_id):
 async def sglang_server_task(model_name_or_path, args, semaphore):
     # Check GPU memory, lower mem devices need a bit less KV cache space because the VLM takes additional memory
     gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert to GB
-    mem_fraction_arg = ["--mem-fraction-static", "0.80"] if gpu_memory < 60 else []
+    mem_fraction_arg = ["--mem-fraction-static", "0.70"] if gpu_memory < 60 else []
 
     cmd = [
         "python3",
diff --git a/olmocr/prompts/prompts.py b/olmocr/prompts/prompts.py
index d5dff9b..84899a9 100644
--- a/olmocr/prompts/prompts.py
+++ b/olmocr/prompts/prompts.py
@@ -98,7 +98,7 @@ def openai_response_format_schema() -> dict:
 
 # This is a base prompt that will be used for training and running the fine tuned model
 # It's simplified from the prompt which was used to generate the silver data, and can change from dataset to dataset
-def build_finetuning_prompt_qwen2(base_text: str) -> str:
+def build_finetuning_prompt(base_text: str) -> str:
     return (
         f"Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. "
         f"Just return the plain text representation of this document as if you were reading it naturally.\n"
@@ -107,11 +107,11 @@ def build_finetuning_prompt_qwen2(base_text: str) -> str:
     )
 
 # This is the new fine tuning prompt we are trying for qwen2.5 vl
-def build_finetuning_prompt(base_text: str) -> str:
-    return (
-        f"Below is the image of one page of a document. "
-        f"Just return the plain text representation of this document as if you were reading it naturally.\n"
-    )
+# def build_finetuning_prompt(base_text: str) -> str:
+#     return (
+#         f"Below is the image of one page of a document. "
+#         f"Just return the plain text representation of this document as if you were reading it naturally.\n"
+#     )
 
 
 # Extracts the anchor text component from an existing prompt string