From 8f5d5bdf28c1f7375e5f5b5b5ecab6343c2f97e1 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 29 May 2025 21:59:23 +0000 Subject: [PATCH] Revert "Trying to add repetition penalty" This reverts commit 90f754e7b182f5978f60f5e4734f6ebb0aa3e735. --- olmocr/pipeline.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 3f33a63..a96db8e 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -106,7 +106,7 @@ class PageResult: async def build_page_query(local_pdf_path: str, page: int, target_longest_image_dim: int, target_anchor_text_len: int, image_rotation: int = 0) -> dict: - MAX_TOKENS = 4000 + MAX_TOKENS = 5000 assert image_rotation in [0, 90, 180, 270], "Invalid image rotation provided in build_page_query" # Allow the page rendering to process in the background while we get the anchor text (which blocks the main thread) @@ -218,9 +218,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: MAX_RETRIES = args.max_page_retries TEMPERATURE_BY_ATTEMPT = [0.1, 0.1, 0.2, 0.3, 0.5, 0.8, 0.1, 0.8] FORCE_NO_DOCUMENT_ANCHORING_BY_ATTEMPT = [False, False, False, False, False, False, True, True] - REPETITION_PENALTY_BY_ATTEMPT = [None, None, None, None, None, 1.05, 1.05, 1.05] assert len(TEMPERATURE_BY_ATTEMPT) == len(FORCE_NO_DOCUMENT_ANCHORING_BY_ATTEMPT) - assert len(TEMPERATURE_BY_ATTEMPT) == len(REPETITION_PENALTY_BY_ATTEMPT) exponential_backoffs = 0 local_anchor_text_len = args.target_anchor_text_len local_image_rotation = 0 @@ -237,10 +235,6 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: # Change temperature as number of attempts increases to overcome repetition issues at expense of quality query["temperature"] = TEMPERATURE_BY_ATTEMPT[lookup_attempt] - # Add a small repetition penalty similar to other qwen models on later retries - if REPETITION_PENALTY_BY_ATTEMPT[lookup_attempt] is not None: - query["repetition_penalty"] = REPETITION_PENALTY_BY_ATTEMPT[lookup_attempt] - logger.info(f"Built page query for {pdf_orig_path}-{page_num}") try: