From 56e51ea23a54019b7df730b7648d00787807bf16 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Tue, 1 Jul 2025 20:35:57 +0000 Subject: [PATCH] Improving regex even more --- olmocr/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 95057dc..c6641da 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -238,7 +238,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: # Enable guided decoding regex if needed if args.guided_decoding: - query["guided_regex"] = r"---\nprimary_language: .{1,20}\nis_rotation_valid: (?:True|False|true|false)\nrotation_correction: (?:0|90|180|270)\nis_table: (?:True|False|true|false)\nis_diagram: (?:True|False|true|false)\n---\n[\s\S]*" + query["guided_regex"] = r"---\nprimary_language: (?:[a-z]{2}|null)\nis_rotation_valid: (?:True|False|true|false)\nrotation_correction: (?:0|90|180|270)\nis_table: (?:True|False|true|false)\nis_diagram: (?:True|False|true|false)\n---\n[\s\S]*" logger.info(f"Built page query for {pdf_orig_path}-{page_num}")