This commit is contained in:
Jake Poznanski 2025-08-19 17:50:23 +00:00
parent 798335c88e
commit cd09e190b5
2 changed files with 2 additions and 1 deletions

View File

@@ -2,6 +2,7 @@ from .prompts import (
PageResponse,
build_finetuning_prompt,
build_no_anchoring_yaml_prompt,
build_no_anchoring_v4_yaml_prompt,
build_openai_silver_data_prompt,
extract_raw_text,
openai_response_format_schema,

View File

@@ -48,7 +48,7 @@ def build_openai_silver_data_prompt_v3_simple(page_width: int, page_height: int)
return (
f"Attached is the image of one page of a PDF document."
f"Just return the plain text representation of this document as if you were reading it naturally.\n"
f"Turn equations and math symbols into a LaTeX representation, make sure to use \\( and \\) as a delimiter for inline math, and \\[ and \\] for block math. Do NOT use ascii or unicode math symbols such as ∈ ∉ ⊂ ⊃ ⊆ ⊇ ∅ ∩ ∀ ∃ ¬, just use LaTeX syntax, ex \\( \\in \\) \\( \\notin \\) etc. If you were going to surround a math expression in $ symbols, surround it with \( \) instead.\n"
f"Turn equations and math symbols into a LaTeX representation, make sure to use \\( and \\) as a delimiter for inline math, and \\[ and \\] for block math. Do NOT use ascii or unicode math symbols such as ∈ ∉ ⊂ ⊃ ⊆ ⊇ ∅ ∩ ∀ ∃ ¬, just use LaTeX syntax, ex \\( \\in \\) \\( \\notin \\) etc. If you were going to surround a math expression in $ symbols, surround it with \\( \\) instead.\n"
f"Convert tables into HTML format. Keep the syntax simple, but use <th> for header rows, and use rowspan and colspans appropriately. Don't use <br> inside of table cells, just split that into new rows as needed. Do NOT use LaTeX or Markdown table syntax.\n"
f"Remove the headers and footers, but keep references and footnotes.\n"
f"Read any natural handwriting.\n"