diff --git a/olmocr/prompts/__init__.py b/olmocr/prompts/__init__.py index f01c5b4..6eb5060 100644 --- a/olmocr/prompts/__init__.py +++ b/olmocr/prompts/__init__.py @@ -2,6 +2,7 @@ from .prompts import ( PageResponse, build_finetuning_prompt, build_no_anchoring_yaml_prompt, + build_no_anchoring_v4_yaml_prompt, build_openai_silver_data_prompt, extract_raw_text, openai_response_format_schema, diff --git a/olmocr/prompts/prompts.py b/olmocr/prompts/prompts.py index bc3703e..da37b39 100644 --- a/olmocr/prompts/prompts.py +++ b/olmocr/prompts/prompts.py @@ -48,7 +48,7 @@ def build_openai_silver_data_prompt_v3_simple(page_width: int, page_height: int) return ( f"Attached is the image of one page of a PDF document." f"Just return the plain text representation of this document as if you were reading it naturally.\n" - f"Turn equations and math symbols into a LaTeX representation, make sure to use \\( and \\) as a delimiter for inline math, and \\[ and \\] for block math. Do NOT use ascii or unicode math symbols such as ∈ ∉ ⊂ ⊃ ⊆ ⊇ ∅ ∪ ∩ ∀ ∃ ¬, just use LaTeX syntax, ex \\( \\in \\) \\( \\notin \\) etc. If you were going to surround a math expression in $ symbols, surround it with \( \) instead.\n" + f"Turn equations and math symbols into a LaTeX representation, make sure to use \\( and \\) as a delimiter for inline math, and \\[ and \\] for block math. Do NOT use ascii or unicode math symbols such as ∈ ∉ ⊂ ⊃ ⊆ ⊇ ∅ ∪ ∩ ∀ ∃ ¬, just use LaTeX syntax, ex \\( \\in \\) \\( \\notin \\) etc. If you were going to surround a math expression in $ symbols, surround it with \\( \\) instead.\n" f"Convert tables into HTML format. Keep the syntax simple, but use