mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-26 23:53:31 +00:00
Better regex for structured decoding, adding some new prompts to train with
This commit is contained in:
parent
a651cf0ca6
commit
e691ea176c
@ -238,7 +238,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
|
|||||||
|
|
||||||
# Enable guided decoding regex if needed
|
# Enable guided decoding regex if needed
|
||||||
if args.guided_decoding:
|
if args.guided_decoding:
|
||||||
query["guided_regex"] = r"---\nprimary_language: .*\nis_rotation_valid: .*\nrotation_correction: .*\nis_table: .*\nis_diagram: .*\n---\n[\s\S]*"
|
# Each front-matter field must be written as "key: value" (colon followed by a
# space) so the generated header is parseable as YAML. rotation_correction
# previously lacked the space, which forced output like "rotation_correction:0".
query["guided_regex"] = r"---\nprimary_language: .{1,20}\nis_rotation_valid: (?:True|False|true|false)\nrotation_correction: (?:0|90|180|270)\nis_table: (?:True|False|true|false)\nis_diagram: (?:True|False|true|false)\n---\n[\s\S]*"
|
||||||
|
|
||||||
logger.info(f"Built page query for {pdf_orig_path}-{page_num}")
|
logger.info(f"Built page query for {pdf_orig_path}-{page_num}")
|
||||||
|
|
||||||
|
|||||||
@ -48,6 +48,20 @@ class FinetuningPromptConfig(PipelineStepConfig):
|
|||||||
name: str = "FinetuningPrompt"
|
name: str = "FinetuningPrompt"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class NewYamlFinetuningPromptWithAnchoringConfig(PipelineStepConfig):
    """Config entry for the ``NewYamlFinetuningPromptWithAnchoring`` pipeline step.

    The step takes no options of its own; the only field is the step
    identifier string.
    """

    # Step identifier; presumably matched against the step names handled when
    # the pipeline is built -- verify against the Config step factory.
    name: str = "NewYamlFinetuningPromptWithAnchoring"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class NewYamlFinetuningPromptWithNoAnchoringConfig(PipelineStepConfig):
    """Config entry for the ``NewYamlFinetuningPromptWithNoAnchoring`` pipeline step.

    The step takes no options of its own; the only field is the step
    identifier string.
    """

    # Step identifier; presumably matched against the step names handled when
    # the pipeline is built -- verify against the Config step factory.
    name: str = "NewYamlFinetuningPromptWithNoAnchoring"
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class FrontMatterOutputFormatConfig(PipelineStepConfig):
|
class FrontMatterOutputFormatConfig(PipelineStepConfig):
|
||||||
"""Configuration for FrontMatterOutputFormat step."""
|
"""Configuration for FrontMatterOutputFormat step."""
|
||||||
@ -282,6 +296,8 @@ class Config:
|
|||||||
FrontMatterOutputFormat,
|
FrontMatterOutputFormat,
|
||||||
FrontMatterParser,
|
FrontMatterParser,
|
||||||
InstructUserMessages,
|
InstructUserMessages,
|
||||||
|
NewYamlFinetuningPromptWithAnchoring,
|
||||||
|
NewYamlFinetuningPromptWithNoAnchoring,
|
||||||
PDFRenderer,
|
PDFRenderer,
|
||||||
StaticLengthDocumentAnchoring,
|
StaticLengthDocumentAnchoring,
|
||||||
Tokenizer,
|
Tokenizer,
|
||||||
@ -313,6 +329,12 @@ class Config:
|
|||||||
elif step_name == "FinetuningPrompt":
|
elif step_name == "FinetuningPrompt":
|
||||||
steps.append(FinetuningPrompt())
|
steps.append(FinetuningPrompt())
|
||||||
|
|
||||||
|
elif step_name == "NewYamlFinetuningPromptWithAnchoring":
|
||||||
|
steps.append(NewYamlFinetuningPromptWithAnchoring())
|
||||||
|
|
||||||
|
elif step_name == "NewYamlFinetuningPromptWithNoAnchoring":
|
||||||
|
steps.append(NewYamlFinetuningPromptWithNoAnchoring())
|
||||||
|
|
||||||
elif step_name == "FrontMatterOutputFormat":
|
elif step_name == "FrontMatterOutputFormat":
|
||||||
steps.append(FrontMatterOutputFormat())
|
steps.append(FrontMatterOutputFormat())
|
||||||
|
|
||||||
|
|||||||
@ -288,6 +288,33 @@ class FinetuningPrompt(PipelineStep):
|
|||||||
return sample
|
return sample
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True, slots=True)
class NewYamlFinetuningPromptWithAnchoring(PipelineStep):
    """Attach the YAML front-matter instruction prompt that embeds the anchor text.

    Calling the step reads ``sample["anchor_text"]``, wraps it between
    ``RAW_TEXT_START`` / ``RAW_TEXT_END`` markers inside the instruction, and
    stores the finished prompt under ``sample["instruction_prompt"]``.
    """

    def __call__(self, sample: Sample) -> Sample:
        """Write ``instruction_prompt`` into ``sample`` and return the same object.

        Raises:
            KeyError: if ``sample`` has no ``"anchor_text"`` entry.
        """
        anchor_text = sample["anchor_text"]

        # NOTE(review): the prompt spells "LateX" (sic). It is runtime text, so
        # it is reproduced verbatim here.
        prompt_pieces = [
            "Attached is one page of a document, as well as some raw textual content that was previously extracted for it. ",
            "Just return the plain text representation of this document as if you were reading it naturally. Convert equations to LateX and tables to markdown.\n",
            f"RAW_TEXT_START\n{anchor_text}\nRAW_TEXT_END\n",
            "Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters.",
        ]
        sample["instruction_prompt"] = "".join(prompt_pieces)
        return sample
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True, slots=True)
class NewYamlFinetuningPromptWithNoAnchoring(PipelineStep):
    """Attach the YAML front-matter instruction prompt without any anchor text.

    Calling the step stores a fixed instruction string under
    ``sample["instruction_prompt"]``; no other key of the sample is read.
    """

    def __call__(self, sample: Sample) -> Sample:
        """Write ``instruction_prompt`` into ``sample`` and return the same object."""
        # NOTE(review): the prompt spells "LateX" (sic). It is runtime text, so
        # it is reproduced verbatim here.
        prompt_pieces = [
            "Attached is one page of a document that you must process. ",
            "Just return the plain text representation of this document as if you were reading it naturally. Convert equations to LateX and tables to markdown.\n",
            "Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters.",
        ]
        sample["instruction_prompt"] = "".join(prompt_pieces)
        return sample
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True, slots=True)
|
@dataclass(frozen=True, slots=True)
|
||||||
class FrontMatterOutputFormat(PipelineStep):
|
class FrontMatterOutputFormat(PipelineStep):
|
||||||
"""Takes the output and applies the standard yaml formatting to it"""
|
"""Takes the output and applies the standard yaml formatting to it"""
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user