mirror of
https://github.com/allenai/olmocr.git
synced 2025-09-25 16:30:28 +00:00
Prompt utils
This commit is contained in:
parent
b6543a4f65
commit
f99f6a6729
@ -1,3 +1,5 @@
|
||||
import re
|
||||
|
||||
# This is the prompt we use to get GPT-4o to convert documents into our silver training data
|
||||
def build_openai_silver_data_prompt(base_text: str) -> str:
|
||||
return (
|
||||
@ -21,4 +23,15 @@ def build_finetuning_prompt(base_text: str) -> str:
|
||||
f"Just return the plain text representation of this document as if you were reading it naturally.\n"
|
||||
f"Do not hallucinate.\n"
|
||||
f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
|
||||
)
|
||||
)
|
||||
|
||||
def extract_raw_text(prompt: str) -> str:
    """Return the raw document text embedded in a prompt.

    The text is expected to sit between a ``RAW_TEXT_START`` line and a
    ``RAW_TEXT_END`` line inside ``prompt``.

    Args:
        prompt: A prompt string containing the two marker lines.

    Returns:
        The text found between the markers, with leading and trailing
        whitespace stripped.

    Raises:
        ValueError: If ``prompt`` does not contain both markers in the
            expected layout.
    """
    pattern = r"RAW_TEXT_START\s*\n(.*?)\nRAW_TEXT_END"

    # re.DOTALL lets "." match newlines so multi-line documents are captured.
    # The group is non-greedy, so matching stops at the first
    # "\nRAW_TEXT_END" that follows the start marker.
    match = re.search(pattern, prompt, re.DOTALL)

    if match is None:
        raise ValueError("Prompt does not contain raw text")

    return match.group(1).strip()
|
||||
|
Loading…
x
Reference in New Issue
Block a user