From e691ea176cb51167defb5a076fcecef20c2b1b4d Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Tue, 1 Jul 2025 18:12:32 +0000 Subject: [PATCH] Better regex for structured decoding, adding some new prompts to train with --- olmocr/pipeline.py | 2 +- olmocr/train/config.py | 22 ++++++++++++++++++++++ olmocr/train/dataloader.py | 27 +++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index d936216..2b9f3b5 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -238,7 +238,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: # Enable guided decoding regex if needed if args.guided_decoding: - query["guided_regex"] = r"---\nprimary_language: .*\nis_rotation_valid: .*\nrotation_correction: .*\nis_table: .*\nis_diagram: .*\n---\n[\s\S]*" + query["guided_regex"] = r"---\nprimary_language: .{1,20}\nis_rotation_valid: (?:True|False|true|false)\nrotation_correction: (?:0|90|180|270)\nis_table: (?:True|False|true|false)\nis_diagram: (?:True|False|true|false)\n---\n[\s\S]*" logger.info(f"Built page query for {pdf_orig_path}-{page_num}") diff --git a/olmocr/train/config.py b/olmocr/train/config.py index 9a5a723..d750dcf 100644 --- a/olmocr/train/config.py +++ b/olmocr/train/config.py @@ -48,6 +48,20 @@ class FinetuningPromptConfig(PipelineStepConfig): name: str = "FinetuningPrompt" +@dataclass +class NewYamlFinetuningPromptWithAnchoringConfig(PipelineStepConfig): + """Configuration for NewYamlFinetuningPromptWithAnchoring step.""" + + name: str = "NewYamlFinetuningPromptWithAnchoring" + + +@dataclass +class NewYamlFinetuningPromptWithNoAnchoringConfig(PipelineStepConfig): + """Configuration for NewYamlFinetuningPromptWithNoAnchoring step.""" + + name: str = "NewYamlFinetuningPromptWithNoAnchoring" + + @dataclass class FrontMatterOutputFormatConfig(PipelineStepConfig): """Configuration for FrontMatterOutputFormat step.""" @@ -282,6 +296,8 @@ class Config: 
FrontMatterOutputFormat, FrontMatterParser, InstructUserMessages, + NewYamlFinetuningPromptWithAnchoring, + NewYamlFinetuningPromptWithNoAnchoring, PDFRenderer, StaticLengthDocumentAnchoring, Tokenizer, @@ -313,6 +329,12 @@ class Config: elif step_name == "FinetuningPrompt": steps.append(FinetuningPrompt()) + elif step_name == "NewYamlFinetuningPromptWithAnchoring": + steps.append(NewYamlFinetuningPromptWithAnchoring()) + + elif step_name == "NewYamlFinetuningPromptWithNoAnchoring": + steps.append(NewYamlFinetuningPromptWithNoAnchoring()) + elif step_name == "FrontMatterOutputFormat": steps.append(FrontMatterOutputFormat()) diff --git a/olmocr/train/dataloader.py b/olmocr/train/dataloader.py index 401b7a7..61ce6b0 100644 --- a/olmocr/train/dataloader.py +++ b/olmocr/train/dataloader.py @@ -286,6 +286,33 @@ class FinetuningPrompt(PipelineStep): def __call__(self, sample: Sample) -> Sample: sample["instruction_prompt"] = build_finetuning_prompt(sample["anchor_text"]) return sample + + +@dataclass(frozen=True, slots=True) +class NewYamlFinetuningPromptWithAnchoring(PipelineStep): + """Applies the new front-matter fine tuning prompt, embedding the sample's anchor text""" + + def __call__(self, sample: Sample) -> Sample: + sample["instruction_prompt"] = ( + f"Attached is one page of a document, as well as some raw textual content that was previously extracted for it. " + f"Just return the plain text representation of this document as if you were reading it naturally. Convert equations to LateX and tables to markdown.\n" + f"RAW_TEXT_START\n{sample['anchor_text']}\nRAW_TEXT_END\n" + f"Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters." 
+ ) + return sample + + +@dataclass(frozen=True, slots=True) +class NewYamlFinetuningPromptWithNoAnchoring(PipelineStep): + """Applies the new front-matter fine tuning prompt, without any anchor text""" + + def __call__(self, sample: Sample) -> Sample: + sample["instruction_prompt"] = ( + f"Attached is one page of a document that you must process. " + f"Just return the plain text representation of this document as if you were reading it naturally. Convert equations to LateX and tables to markdown.\n" + f"Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters." + ) + return sample @dataclass(frozen=True, slots=True)