diff --git a/pdelfin/prompts/prompts.py b/pdelfin/prompts/prompts.py
index 1c91858..3cd8263 100644
--- a/pdelfin/prompts/prompts.py
+++ b/pdelfin/prompts/prompts.py
@@ -15,6 +15,56 @@ def build_openai_silver_data_prompt(base_text: str) -> str:
     )
 
 
+def openai_response_format_schema() -> dict:
+    return {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "page_response",
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "primary_language": {
+                        "type": ["string", "null"],
+                        "description": "The primary language of the text using two-letter codes or null if there is no text at all that you think you should read.",
+                    },
+                    "is_rotation_valid": {
+                        "type": "boolean",
+                        "description": "Is this page oriented correctly for reading? Answer only considering the textual content, do not factor in the rotation of any charts, tables, drawings, or figures.",
+                    },
+                    "rotation_correct": {
+                        "type": "integer",
+                        "description": "Indicates the degree of clockwise rotation needed if the page is not oriented correctly.",
+                        "enum": [0, 90, 180, 270],
+                        "default": 0,
+                    },
+                    "is_table": {
+                        "type": "boolean",
+                        "description": "Indicates if the majority of the page content is in tabular format.",
+                    },
+                    "is_diagram": {
+                        "type": "boolean",
+                        "description": "Indicates if the majority of the page content is a visual diagram.",
+                    },
+                    "natural_text": {
+                        "type": ["string", "null"],
+                        "description": "The natural text content extracted from the page.",
+                    },
+                },
+                "additionalProperties": False,
+                "required": [
+                    "primary_language",
+                    "is_rotation_valid",
+                    "rotation_correct",
+                    "is_table",
+                    "is_diagram",
+                    "natural_text",
+                ],
+            },
+            "strict": True
+        },
+    }
+
+
 # This is a base prompt that will be used for training and running the fine tuned model
 # It's simplified from the prompt which was used to generate the silver data, and can change from dataset to dataset
 def build_finetuning_prompt(base_text: str) -> str:
@@ -26,6 +76,7 @@ def build_finetuning_prompt(base_text: str) -> str:
     )
 
 
+# Extracts the anchor text component from an existing prompt string
 def extract_raw_text(prompt: str) -> str:
     pattern = r"RAW_TEXT_START\s*\n(.*?)\nRAW_TEXT_END"
 
diff --git a/pdelfin/silver_data/buildsilver.py b/pdelfin/silver_data/buildsilver.py
index d3eeab4..fb00f63 100644
--- a/pdelfin/silver_data/buildsilver.py
+++ b/pdelfin/silver_data/buildsilver.py
@@ -12,7 +12,7 @@ from typing import Generator
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from urllib.parse import urlparse
 
-from pdelfin.prompts import build_openai_silver_data_prompt
+from pdelfin.prompts import build_openai_silver_data_prompt, openai_response_format_schema
 from pdelfin.prompts.anchor import get_anchor_text
 from pdelfin.filter import PdfFilter
 
@@ -48,6 +48,29 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
 
     anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
 
+    # DEBUG crappy temporary code here that does the actual api call live so I can debug it a bit
+    # from openai import OpenAI
+    # client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+    # response = client.chat.completions.create(
+    #     model="gpt-4o-2024-08-06",
+    #     messages= [
+    #         {
+    #             "role": "user",
+    #             "content": [
+    #                 {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
+    #                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
+    #             ],
+    #         }
+    #     ],
+    #     temperature=0.1,
+    #     max_tokens=3000,
+    #     logprobs=True,
+    #     top_logprobs=5,
+    #     response_format=openai_response_format_schema()
+    # )
+    # print(response)
+
     # Construct OpenAI Batch API request format
     return {
         "custom_id": f"{pretty_pdf_path}-{page}",
@@ -65,7 +88,10 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
                 }
             ],
             "temperature": 0.1,
-            "max_tokens": 3000
+            "max_tokens": 3000,
+            "logprobs": True,
+            "top_logprobs": 5,
+            "response_format": openai_response_format_schema(),
         }
     }
 