mirror of
https://github.com/allenai/olmocr.git
synced 2025-09-27 09:27:55 +00:00
Prompt utils
This commit is contained in:
parent
b6543a4f65
commit
f99f6a6729
@ -1,3 +1,5 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
# This is the prompt we use for getting chat gpt 4o to convert documents into our silver training data
|
# This is the prompt we use for getting chat gpt 4o to convert documents into our silver training data
|
||||||
def build_openai_silver_data_prompt(base_text: str) -> str:
|
def build_openai_silver_data_prompt(base_text: str) -> str:
|
||||||
return (
|
return (
|
||||||
@ -22,3 +24,14 @@ def build_finetuning_prompt(base_text: str) -> str:
|
|||||||
f"Do not hallucinate.\n"
|
f"Do not hallucinate.\n"
|
||||||
f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
|
f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def extract_raw_text(prompt: str) -> str:
    """Return the raw document text embedded in a prompt.

    The text is expected to sit between a ``RAW_TEXT_START`` line and a
    ``RAW_TEXT_END`` line, which is the layout produced by the
    ``RAW_TEXT_START\\n{base_text}\\nRAW_TEXT_END`` template used by the
    prompt builders in this module.

    Args:
        prompt: Full prompt string to search.

    Returns:
        The text between the two markers, with leading and trailing
        whitespace stripped.

    Raises:
        ValueError: If the prompt does not contain a
            ``RAW_TEXT_START`` / ``RAW_TEXT_END`` pair.
    """
    pattern = r"RAW_TEXT_START\s*\n(.*?)\nRAW_TEXT_END"

    # re.DOTALL makes "." match newlines, so multi-line raw text is captured.
    # The non-greedy group stops at the first RAW_TEXT_END marker.
    match = re.search(pattern, prompt, re.DOTALL)

    if match is None:
        raise ValueError("Prompt does not contain raw text")

    return match.group(1).strip()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user