diff --git a/olmocr/bench/miners/processing_old_scans.py b/olmocr/bench/miners/processing_old_scans.py
index 3ced733..b0d99fa 100644
--- a/olmocr/bench/miners/processing_old_scans.py
+++ b/olmocr/bench/miners/processing_old_scans.py
@@ -2,35 +2,37 @@
 import json
 import random
 import re
 
+
 def extract_random_segment(text, min_words=7, max_words=15):
     """Extract a random segment of 7-15 words from the text."""
     words = text.split()
     if len(words) <= max_words:
         return text  # Return full text if it's shorter than max_words
-
+
     max_start = len(words) - min_words
     start = random.randint(0, max_start)
     remaining_words = len(words) - start
     segment_length = random.randint(min_words, min(max_words, remaining_words))
-    segment = words[start:start + segment_length]
-    return ' '.join(segment)
+    segment = words[start : start + segment_length]
+    return " ".join(segment)
+
 def process_jsonl_file_present(input_file, output_file):
     """Process a JSONL file and create multiple random cases for each PDF."""
-    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
+    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
         for line in infile:
             if line.strip():  # Skip empty lines
                 data = json.loads(line)
                 image = data["image"]
                 original_text = data["text"]
                 num_cases = random.randint(1, 3)
-
+
                 for _ in range(num_cases):
                     processed_num = random.randint(5, 10)
                     processed_id = f"{image}_processed{processed_num:02d}"
                     max_diffs = random.randint(1, 2)
                     text_segment = extract_random_segment(original_text)
-
+
                     new_case = {
                         "pdf": f"{image}.pdf",
                         "page": 1,
@@ -40,15 +42,15 @@ def process_jsonl_file_present(input_file, output_file):
                         "text": text_segment,
                         "case_sensitive": True,
                         "first_n": None,
-                        "last_n": None
+                        "last_n": None,
                     }
-                    outfile.write(json.dumps(new_case) + '\n')
+                    outfile.write(json.dumps(new_case) + "\n")
 
 
 def extract_ordered_segments(text, min_words=7, max_words=15):
     """Extract two ordered segments from the text."""
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-
+    sentences = re.split(r"(?<=[.!?])\s+", text)
+
     if len(sentences) < 2:
         return None, None
     valid_indices = list(range(len(sentences)))
@@ -62,33 +64,34 @@ def process_jsonl_file_present(input_file, output_file):
 
     before_words = before_sentence.split()
     after_words = after_sentence.split()
-
+
     if len(before_words) > max_words:
         start = random.randint(0, len(before_words) - min_words)
         length = random.randint(min_words, min(max_words, len(before_words) - start))
-        before_segment = ' '.join(before_words[start:start + length])
+        before_segment = " ".join(before_words[start : start + length])
     else:
         before_segment = before_sentence
-
+
     if len(after_words) > max_words:
         start = random.randint(0, len(after_words) - min_words)
         length = random.randint(min_words, min(max_words, len(after_words) - start))
-        after_segment = ' '.join(after_words[start:start + length])
+        after_segment = " ".join(after_words[start : start + length])
     else:
         after_segment = after_sentence
-
+
     return before_segment, after_segment
 
+
 def process_jsonl_file_order(input_file, output_file):
     """Process a JSONL file and create order-type cases."""
-    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
+    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
         for line in infile:
             if line.strip():  # Skip empty lines
                 data = json.loads(line)
                 image = data["image"]
                 original_text = data["text"]
                 num_cases = random.randint(1, 3)
-
+
                 for _ in range(num_cases):
                     before_text, after_text = extract_ordered_segments(original_text)
                     if not before_text or not after_text:
@@ -96,7 +99,7 @@ def process_jsonl_file_order(input_file, output_file):
                     processed_num = random.randint(11, 16)
                     processed_id = f"{image}_processed{processed_num:02d}"
                     max_diffs = random.randint(1, 3)
-
+
                     new_case = {
                         "pdf": f"{image}.pdf",
                         "page": 1,
@@ -106,13 +109,14 @@
                         "after": after_text,
                         "max_diffs": max_diffs,
                         "checked": "verified",
-                        "url": f"https://example.com/document/{image}"
+                        "url": f"https://example.com/document/{image}",
                     }
-
-                    outfile.write(json.dumps(new_case) + '\n')
+
+                    outfile.write(json.dumps(new_case) + "\n")
+
 
 if __name__ == "__main__":
     input_file = "olmoce/bench/sample_data/old_scans.jsonl"
     output_file = "order_cases.jsonl"
     process_jsonl_file_present(input_file, output_file)
-    process_jsonl_file_order(input_file, output_file)
\ No newline at end of file
+    process_jsonl_file_order(input_file, output_file)
diff --git a/olmocr/bench/review_app_latex.py b/olmocr/bench/review_app_latex.py
index 732ab5d..04fd57e 100644
--- a/olmocr/bench/review_app_latex.py
+++ b/olmocr/bench/review_app_latex.py
@@ -244,7 +244,7 @@ def create_templates_directory():
     """Create templates directory for Flask if it doesn't exist."""
     templates_dir = os.path.join(os.path.dirname(__file__), "templates")
     os.makedirs(templates_dir, exist_ok=True)
-
+
     # Create the review_latex.html template with MathJax support
     review_html = """
@@ -607,7 +607,7 @@ def create_templates_directory():
     """
-
+
     # Create the all_done_latex.html template
     all_done_html = """
@@ -663,11 +663,10 @@ def create_templates_directory():
     """
 
-
     with open(os.path.join(templates_dir, "review_latex.html"), "w") as f:
         f.write(review_html)
-
+
     with open(os.path.join(templates_dir, "all_done_latex.html"), "w") as f:
         f.write(all_done_html)
@@ -690,7 +689,6 @@ def main():
         print(f"Error: Dataset not found: {args.dataset_file}")
         return 1
 
-
     DATASET_DIR = os.path.dirname(os.path.abspath(args.dataset_file))
     DATASET_FILE = args.dataset_file
@@ -715,4 +713,4 @@
 
 
 if __name__ == "__main__":
-    sys.exit(main())
\ No newline at end of file
+    sys.exit(main())
diff --git a/olmocr/bench/runners/run_rolmocr.py b/olmocr/bench/runners/run_rolmocr.py
index dd77563..71b7352 100644
--- a/olmocr/bench/runners/run_rolmocr.py
+++ b/olmocr/bench/runners/run_rolmocr.py
@@ -1,4 +1,3 @@
-
 import httpx
 
 from olmocr.data.renderpdf import render_pdf_to_base64png