mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-14 09:42:47 +00:00
Fixes
This commit is contained in:
parent
798335c88e
commit
cd09e190b5
@@ -2,6 +2,7 @@ from .prompts import (
|
||||
PageResponse,
|
||||
build_finetuning_prompt,
|
||||
build_no_anchoring_yaml_prompt,
|
||||
build_no_anchoring_v4_yaml_prompt,
|
||||
build_openai_silver_data_prompt,
|
||||
extract_raw_text,
|
||||
openai_response_format_schema,
|
||||
|
@@ -48,7 +48,7 @@ def build_openai_silver_data_prompt_v3_simple(page_width: int, page_height: int)
|
||||
return (
|
||||
f"Attached is the image of one page of a PDF document."
|
||||
f"Just return the plain text representation of this document as if you were reading it naturally.\n"
|
||||
f"Turn equations and math symbols into a LaTeX representation, make sure to use \\( and \\) as a delimiter for inline math, and \\[ and \\] for block math. Do NOT use ascii or unicode math symbols such as ∈ ∉ ⊂ ⊃ ⊆ ⊇ ∅ ∪ ∩ ∀ ∃ ¬, just use LaTeX syntax, ex \\( \\in \\) \\( \\notin \\) etc. If you were going to surround a math expression in $ symbols, surround it with \( \) instead.\n"
|
||||
f"Turn equations and math symbols into a LaTeX representation, make sure to use \\( and \\) as a delimiter for inline math, and \\[ and \\] for block math. Do NOT use ascii or unicode math symbols such as ∈ ∉ ⊂ ⊃ ⊆ ⊇ ∅ ∪ ∩ ∀ ∃ ¬, just use LaTeX syntax, ex \\( \\in \\) \\( \\notin \\) etc. If you were going to surround a math expression in $ symbols, surround it with \\( \\) instead.\n"
|
||||
f"Convert tables into HTML format. Keep the syntax simple, but use <th> for header rows, and use rowspan and colspans appropriately. Don't use <br> inside of table cells, just split that into new rows as needed. Do NOT use LaTeX or Markdown table syntax.\n"
|
||||
f"Remove the headers and footers, but keep references and footnotes.\n"
|
||||
f"Read any natural handwriting.\n"
|
||||
|
Loading…
x
Reference in New Issue
Block a user