mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-01 10:33:57 +00:00
Building openai prompt with structured output
This commit is contained in:
parent
be00ccf321
commit
802632c49f
@ -15,6 +15,56 @@ def build_openai_silver_data_prompt(base_text: str) -> str:
|
||||
)
|
||||
|
||||
|
||||
def openai_response_format_schema() -> dict:
    """Build the OpenAI ``response_format`` payload for structured page output.

    Returns a ``json_schema`` response-format dict (for the Chat Completions
    API) that constrains the model to emit a JSON object named
    ``page_response`` with exactly six required fields describing a PDF page:
    its primary language, whether its rotation is valid (and the clockwise
    correction needed if not), whether it is mostly a table or a diagram,
    and the extracted natural text.

    Returns:
        dict: A response-format mapping with ``strict`` validation enabled
        and ``additionalProperties`` disallowed, so the API rejects any
        output that does not match the schema exactly.
    """
    return {
        "type": "json_schema",
        "json_schema": {
            "name": "page_response",
            "schema": {
                "type": "object",
                "properties": {
                    # Nullable so the model can signal "no readable text".
                    "primary_language": {
                        "type": ["string", "null"],
                        "description": "The primary language of the text using two-letter codes or null if there is no text at all that you think you should read.",
                    },
                    "is_rotation_valid": {
                        "type": "boolean",
                        "description": "Is this page oriented correctly for reading? Answer only considering the textual content, do not factor in the rotation of any charts, tables, drawings, or figures.",
                    },
                    # Only the four cardinal clockwise rotations are allowed.
                    "rotation_correct": {
                        "type": "integer",
                        "description": "Indicates the degree of clockwise rotation needed if the page is not oriented correctly.",
                        "enum": [0, 90, 180, 270],
                        "default": 0,
                    },
                    "is_table": {
                        "type": "boolean",
                        "description": "Indicates if the majority of the page content is in tabular format.",
                    },
                    "is_diagram": {
                        "type": "boolean",
                        "description": "Indicates if the majority of the page content is a visual diagram.",
                    },
                    "natural_text": {
                        "type": "string",
                        "description": "The natural text content extracted from the page.",
                    },
                },
                "additionalProperties": False,
                # Every field must be present in the model's output.
                "required": [
                    "primary_language",
                    "is_rotation_valid",
                    "rotation_correct",
                    "is_table",
                    "is_diagram",
                    "natural_text",
                ],
            },
            "strict": True,
        },
    }
|
||||
|
||||
|
||||
# This is a base prompt that will be used for training and running the fine tuned model
|
||||
# It's simplified from the prompt which was used to generate the silver data, and can change from dataset to dataset
|
||||
def build_finetuning_prompt(base_text: str) -> str:
|
||||
@ -26,6 +76,7 @@ def build_finetuning_prompt(base_text: str) -> str:
|
||||
)
|
||||
|
||||
|
||||
# Extracts the anchor text component from an existing prompt string
|
||||
def extract_raw_text(prompt: str) -> str:
|
||||
pattern = r"RAW_TEXT_START\s*\n(.*?)\nRAW_TEXT_END"
|
||||
|
||||
|
||||
@ -12,7 +12,7 @@ from typing import Generator
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from pdelfin.prompts import build_openai_silver_data_prompt
|
||||
from pdelfin.prompts import build_openai_silver_data_prompt, openai_response_format_schema
|
||||
from pdelfin.prompts.anchor import get_anchor_text
|
||||
from pdelfin.filter import PdfFilter
|
||||
|
||||
@ -48,6 +48,29 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
|
||||
|
||||
anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
|
||||
|
||||
# DEBUG crappy temporary code here that does the actual api call live so I can debug it a bit
|
||||
# from openai import OpenAI
|
||||
# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
# response = client.chat.completions.create(
|
||||
# model="gpt-4o-2024-08-06",
|
||||
# messages= [
|
||||
# {
|
||||
# "role": "user",
|
||||
# "content": [
|
||||
# {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
|
||||
# {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
|
||||
# ],
|
||||
# }
|
||||
# ],
|
||||
# temperature=0.1,
|
||||
# max_tokens=3000,
|
||||
# logprobs=True,
|
||||
# top_logprobs=5,
|
||||
# response_format=openai_response_format_schema()
|
||||
# )
|
||||
# print(response)
|
||||
|
||||
# Construct OpenAI Batch API request format
|
||||
return {
|
||||
"custom_id": f"{pretty_pdf_path}-{page}",
|
||||
@ -65,7 +88,10 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
|
||||
}
|
||||
],
|
||||
"temperature": 0.1,
|
||||
"max_tokens": 3000
|
||||
"max_tokens": 3000,
|
||||
"logprobs": True,
|
||||
"top_logprobs": 5,
|
||||
"response_format": openai_response_format_schema(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user