Mirror of https://github.com/allenai/olmocr.git, synced 2025-11-09 15:09:40 +00:00
Building OpenAI prompt with structured output
parent be00ccf321
commit 802632c49f
@@ -15,6 +15,56 @@ def build_openai_silver_data_prompt(base_text: str) -> str:
     )


+def openai_response_format_schema() -> dict:
+    return {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "page_response",
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "primary_language": {
+                        "type": ["string", "null"],
+                        "description": "The primary language of the text using two-letter codes or null if there is no text at all that you think you should read.",
+                    },
+                    "is_rotation_valid": {
+                        "type": "boolean",
+                        "description": "Is this page oriented correctly for reading? Answer only considering the textual content, do not factor in the rotation of any charts, tables, drawings, or figures.",
+                    },
+                    "rotation_correct": {
+                        "type": "integer",
+                        "description": "Indicates the degree of clockwise rotation needed if the page is not oriented correctly.",
+                        "enum": [0, 90, 180, 270],
+                        "default": 0,
+                    },
+                    "is_table": {
+                        "type": "boolean",
+                        "description": "Indicates if the majority of the page content is in tabular format.",
+                    },
+                    "is_diagram": {
+                        "type": "boolean",
+                        "description": "Indicates if the majority of the page content is a visual diagram.",
+                    },
+                    "natural_text": {
+                        "type": "string",
+                        "description": "The natural text content extracted from the page.",
+                    },
+                },
+                "additionalProperties": False,
+                "required": [
+                    "primary_language",
+                    "is_rotation_valid",
+                    "rotation_correct",
+                    "is_table",
+                    "is_diagram",
+                    "natural_text",
+                ],
+            },
+            "strict": True
+        },
+    }
+
+
 # This is a base prompt that will be used for training and running the fine tuned model
 # It's simplified from the prompt which was used to generate the silver data, and can change from dataset to dataset
 def build_finetuning_prompt(base_text: str) -> str:
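Because the schema sets "strict": True and "additionalProperties": False, a reply that satisfies it is a JSON object with exactly the six required keys. Below is a minimal sketch of loading such a reply; the sample payload and the PageResponse container are illustrative and not part of this commit.

import json
from dataclasses import dataclass
from typing import Optional

@dataclass
class PageResponse:
    # Illustrative container mirroring the "page_response" schema above; not defined in this commit.
    primary_language: Optional[str]
    is_rotation_valid: bool
    rotation_correct: int
    is_table: bool
    is_diagram: bool
    natural_text: str

# Hypothetical model reply that conforms to the schema.
sample_reply = '{"primary_language": "en", "is_rotation_valid": true, "rotation_correct": 0, "is_table": false, "is_diagram": false, "natural_text": "Example page text."}'

page = PageResponse(**json.loads(sample_reply))
assert page.rotation_correct in (0, 90, 180, 270)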
@@ -26,6 +76,7 @@ def build_finetuning_prompt(base_text: str) -> str:
     )


+# Extracts the anchor text component from an existing prompt string
 def extract_raw_text(prompt: str) -> str:
     pattern = r"RAW_TEXT_START\s*\n(.*?)\nRAW_TEXT_END"

@@ -12,7 +12,7 @@ from typing import Generator
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from urllib.parse import urlparse

-from pdelfin.prompts import build_openai_silver_data_prompt
+from pdelfin.prompts import build_openai_silver_data_prompt, openai_response_format_schema
 from pdelfin.prompts.anchor import get_anchor_text
 from pdelfin.filter import PdfFilter

@@ -48,6 +48,29 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> dict:
     anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")

+    # DEBUG crappy temporary code here that does the actual api call live so I can debug it a bit
+    # from openai import OpenAI
+    # client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+    # response = client.chat.completions.create(
+    #     model="gpt-4o-2024-08-06",
+    #     messages= [
+    #         {
+    #             "role": "user",
+    #             "content": [
+    #                 {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
+    #                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
+    #             ],
+    #         }
+    #     ],
+    #     temperature=0.1,
+    #     max_tokens=3000,
+    #     logprobs=True,
+    #     top_logprobs=5,
+    #     response_format=openai_response_format_schema()
+    # )
+    # print(response)

     # Construct OpenAI Batch API request format
     return {
         "custom_id": f"{pretty_pdf_path}-{page}",
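If the commented-out live call above were enabled, the schema-constrained reply would come back as a JSON string in the first choice's message content. A sketch of decoding it, assuming the openai v1 Python client's response shape:

import json

def decode_page_response(response) -> dict:
    # Sketch only: the structured output arrives as a JSON string in
    # response.choices[0].message.content (openai v1 client shape, assumed here).
    return json.loads(response.choices[0].message.content)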
@@ -65,7 +88,10 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> dict:
                 }
             ],
             "temperature": 0.1,
-            "max_tokens": 3000
+            "max_tokens": 3000,
+            "logprobs": True,
+            "top_logprobs": 5,
+            "response_format": openai_response_format_schema(),
         }
     }

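With these changes, build_page_query emits one request object per page in the OpenAI Batch API input format, now including logprobs, top_logprobs, and the structured-output response_format. A sketch of how such requests would typically be collected into a batch input file; the helper name and file path are illustrative, not part of this commit:

import json

def write_batch_file(requests, path="silver_batch_requests.jsonl"):
    # Illustrative helper: the Batch API consumes newline-delimited JSON, one request per line.
    with open(path, "w") as f:
        for req in requests:
            f.write(json.dumps(req) + "\n")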