mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-10-26 15:44:17 +00:00 
			
		
		
		
	Better regex for structured decoding, adding some new prompts to train with
This commit is contained in:
		
							parent
							
								
									a651cf0ca6
								
							
						
					
					
						commit
						e691ea176c
					
				| @ -238,7 +238,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: | ||||
| 
 | ||||
|         # Enable guided decoding regex if needed | ||||
|         if args.guided_decoding: | ||||
|             query["guided_regex"] = r"---\nprimary_language: .*\nis_rotation_valid: .*\nrotation_correction: .*\nis_table: .*\nis_diagram: .*\n---\n[\s\S]*" | ||||
|             query["guided_regex"] = r"---\nprimary_language: .{1,20}\nis_rotation_valid: (?:True|False|true|false)\nrotation_correction:(?:0|90|180|270)\nis_table: (?:True|False|true|false)\nis_diagram: (?:True|False|true|false)\n---\n[\s\S]*" | ||||
| 
 | ||||
|         logger.info(f"Built page query for {pdf_orig_path}-{page_num}") | ||||
| 
 | ||||
|  | ||||
| @ -48,6 +48,20 @@ class FinetuningPromptConfig(PipelineStepConfig): | ||||
|     name: str = "FinetuningPrompt" | ||||
| 
 | ||||
| 
 | ||||
| @dataclass | ||||
| class NewYamlFinetuningPromptWithAnchoringConfig(PipelineStepConfig): | ||||
|     """Configuration for NewYamlFinetuningPromptWithAnchoring step.""" | ||||
| 
 | ||||
|     name: str = "NewYamlFinetuningPromptWithAnchoring" | ||||
| 
 | ||||
| 
 | ||||
| @dataclass | ||||
| class NewYamlFinetuningPromptWithNoAnchoringConfig(PipelineStepConfig): | ||||
|     """Configuration for NewYamlFinetuningPromptWithNoAnchoring step.""" | ||||
| 
 | ||||
|     name: str = "NewYamlFinetuningPromptWithNoAnchoring" | ||||
| 
 | ||||
| 
 | ||||
| @dataclass | ||||
| class FrontMatterOutputFormatConfig(PipelineStepConfig): | ||||
|     """Configuration for FrontMatterOutputFormat step.""" | ||||
| @ -282,6 +296,8 @@ class Config: | ||||
|             FrontMatterOutputFormat, | ||||
|             FrontMatterParser, | ||||
|             InstructUserMessages, | ||||
|             NewYamlFinetuningPromptWithAnchoring, | ||||
|             NewYamlFinetuningPromptWithNoAnchoring, | ||||
|             PDFRenderer, | ||||
|             StaticLengthDocumentAnchoring, | ||||
|             Tokenizer, | ||||
| @ -313,6 +329,12 @@ class Config: | ||||
|             elif step_name == "FinetuningPrompt": | ||||
|                 steps.append(FinetuningPrompt()) | ||||
| 
 | ||||
|             elif step_name == "NewYamlFinetuningPromptWithAnchoring": | ||||
|                 steps.append(NewYamlFinetuningPromptWithAnchoring()) | ||||
| 
 | ||||
|             elif step_name == "NewYamlFinetuningPromptWithNoAnchoring": | ||||
|                 steps.append(NewYamlFinetuningPromptWithNoAnchoring()) | ||||
| 
 | ||||
|             elif step_name == "FrontMatterOutputFormat": | ||||
|                 steps.append(FrontMatterOutputFormat()) | ||||
| 
 | ||||
|  | ||||
| @ -286,6 +286,33 @@ class FinetuningPrompt(PipelineStep): | ||||
|     def __call__(self, sample: Sample) -> Sample: | ||||
|         sample["instruction_prompt"] = build_finetuning_prompt(sample["anchor_text"]) | ||||
|         return sample | ||||
|      | ||||
| 
 | ||||
| @dataclass(frozen=True, slots=True) | ||||
| class NewYamlFinetuningPromptWithAnchoring(PipelineStep): | ||||
|     """Sets the front-matter fine-tuning prompt, embedding the sample's anchor text between RAW_TEXT_START/RAW_TEXT_END markers.""" | ||||
| 
 | ||||
|     def __call__(self, sample: Sample) -> Sample: | ||||
|         sample["instruction_prompt"] = ( | ||||
|             f"Attached is one page of a document, as well as some raw textual content that was previously extracted for it. " | ||||
|             f"Just return the plain text representation of this document as if you were reading it naturally. Convert equations to LateX and tables to markdown.\n" | ||||
|             f"RAW_TEXT_START\n{sample['anchor_text']}\nRAW_TEXT_END\n" | ||||
|             f"Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters." | ||||
|         ) | ||||
|         return sample | ||||
| 
 | ||||
| 
 | ||||
| @dataclass(frozen=True, slots=True) | ||||
| class NewYamlFinetuningPromptWithNoAnchoring(PipelineStep): | ||||
|     """Sets the front-matter fine-tuning prompt without including any previously extracted anchor text.""" | ||||
| 
 | ||||
|     def __call__(self, sample: Sample) -> Sample: | ||||
|         sample["instruction_prompt"] = ( | ||||
|             f"Attached is one page of a document that you must process. " | ||||
|             f"Just return the plain text representation of this document as if you were reading it naturally. Convert equations to LateX and tables to markdown.\n" | ||||
|             f"Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters." | ||||
|         ) | ||||
|         return sample | ||||
| 
 | ||||
| 
 | ||||
| @dataclass(frozen=True, slots=True) | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Jake Poznanski
						Jake Poznanski